Implementation on new data with Bubble

In [262]:
import re
import sys
import csv
import time
import json

import numpy as np
import pandas as pd

from tqdm import tqdm_notebook
from unidecode import unidecode

from collections import defaultdict
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import LeaveOneLabelOut
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
STOPWORDS = '''
    het de deze
    en of om te hier nog ook al
    in van voor mee per als tot uit bij
    waar waardoor waarvan wanneer
    je uw ze zelf jezelf
    ca bijv bijvoorbeeld
    is bevat hebben kunnen mogen zullen willen
    gemaakt aanbevolen
    belangrijk belangrijke heerlijk heerlijke handig handige dagelijks dagelijkse
    gebruik allergieinformatie bijdrage smaak hoeveelheid
'''.split()

In [3]:
selected_product_nuts_with_usage = '/Users/elise/Documents/?/? data/new/product_nuts_with_usage_and_product_id.csv'

In [4]:
data_pn_usage = []

with open(selected_product_nuts_with_usage) as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in tqdm_notebook(readCSV):
        data_pn_usage.append(eval(row[0]))




In [5]:
def clean(s):
    if s is None: return None
    # @todo keep '0.50%' and the like (or extract separately) - relevant for alcohol-free
    s = unidecode(s).strip()
    s = re.sub(r'[^A-Za-z0-9\'\s]', '', s, flags=re.MULTILINE)
    s = re.sub(r'\s+', ' ', s, flags=re.MULTILINE)
    return s

In [6]:
def get_brand_name(j):
    '''Return brand name from brand_name or brand_url'''
    s = j.get('brand_name', '').strip()
    if s == '':
        s = j.get('brand_url', '').strip()
        s = re.sub(r'(\Ahttps?://(www\.)?|\Awww\.|\.\w{2,3}\/?\Z)', '', s, flags=re.MULTILINE|re.IGNORECASE)
    return s

In [7]:
def f_name(j):
    f = clean(j.get('name', '').lower())
    # strip brand from front of name, would be twice featurized
    brand_name_clean = clean(get_brand_name(j).lower())
    if brand_name_clean != '' and f.startswith(brand_name_clean):
        f = f[len(brand_name_clean):].strip()

    if f == '': return []
    return f.split()

In [8]:
def f_brand(j):
    f = clean(get_brand_name(j))

    if f == '': return []
    return ['BRN:' + f]

In [9]:
def f_first_ingredient(j):
    if 'ingredients' not in j or len(j['ingredients']) == 0: return []

    f = j['ingredients'][0].strip().lower()

    # we're more interested in whether the ingredient is composed, than its exact content
    if re.search(r'[({:;,\n]', f, flags=re.MULTILINE):
        f = '(COMPOSED)'

    f = clean(f)

    if f == '': return []
    return ['ING:' + f]

In [10]:
def tokenize(j):
    '''Returns array of tokens for product nut dict'''
    return f_name(j) + f_brand(j) + f_first_ingredient(j)

In [327]:
id_tokens = []

for j in tqdm_notebook(data_pn_usage):
    
    tokens = tokenize(j)
    tokens = [s for s in tokens if s not in set(STOPWORDS) and len(s) > 1]

    id_tokens.append({'id': j['id'], 'tokens': tokens, 'usage':j['usage'], 'product_id':j['product_id']})




- Remove doubles
- Include only those nut products if the usage appears in at least 3 products

In [12]:
# removes doubles
tokens = [' '.join(x['tokens']) for x in id_tokens]
ct = Counter(tokens)

no_doubles = []
for x in tqdm_notebook(id_tokens):
    if ct[' '.join(x['tokens'])] == 1:
        no_doubles.append(x)




In [13]:
# makes a dict where each tuple has one usage and all the product_ids linked to it
k = [{x['usage']: x['product_id']} for x in no_doubles]
dd = defaultdict(list)

for d in tqdm_notebook(k): 
    for key, value in d.iteritems():
        dd[key].append(value)




In [14]:
# checks if a usage is linked to at least 3 products
allowed_usage = []
for x in tqdm_notebook(dd.items()):
    if len(set(x[1])) > 2:
        allowed_usage.append(x[0])
        
set_allowed_usage = set(allowed_usage)

complete = []
for x in tqdm_notebook(no_doubles):
    if x['usage'] in set_allowed_usage:
        complete.append(x)





LABELS

In [230]:
labels = [x['product_id'] for x in complete]

In [297]:
len(set(labels))

51416

In [242]:
K_fold = int(len(set(labels)) / 6)

one = set(list(set(labels))[:K_fold])
two = set(list(set(labels))[K_fold:(K_fold * 2)])
three = set(list(set(labels))[(2 * K_fold):(K_fold * 3)])
four = set(list(set(labels))[(3 * K_fold):(K_fold * 4)])
five = set(list(set(labels))[(4 * K_fold):(K_fold * 5)])
six = set(list(set(labels))[(5 * K_fold):])

labels_6 = []
for label in tqdm_notebook(labels):
    if label in one:
        labels_6.append(1)
    elif label in two:
        labels_6.append(2)
    elif label in three:
        labels_6.append(3)
    elif label in four:
        labels_6.append(4)
    elif label in five:
        labels_6.append(5)
    elif label in six:
        labels_6.append(6)




X

In [232]:
text_per_item_pn_TRAIN = [' '.join(tokens['tokens']) for tokens in complete if tokens['product_id'] not in six]
text_per_item_pn_VALIDATE = [' '.join(tokens['tokens']) for tokens in complete if tokens['product_id'] in six]

In [265]:
vectorizer = CountVectorizer(min_df=1, binary=True)
X_TRAIN = vectorizer.fit_transform(text_per_item_pn_TRAIN)

In [266]:
vocab = vectorizer.vocabulary_
vectorizer = CountVectorizer(min_df=1, vocabulary=vocab)
X_VALIDATE = vectorizer.fit_transform(text_per_item_pn_VALIDATE)

Y

In [267]:
Y_TRAIN = [x['usage'] for x in complete if x['product_id'] not in six]
Y_VALIDATE = [x['usage'] for x in complete if x['product_id'] in six]

In [268]:
labels_TRAIN = []
for x in enumerate(labels_6):
    if x[1] != 6:
        labels_TRAIN.append(x[1])

In [269]:
print X_VALIDATE.shape
print len(Y_VALIDATE)
print 
print X_TRAIN.shape
print len(labels_TRAIN)
print len(Y_TRAIN)

(16277, 23776)
16277

(93180, 23776)
93180
93180


Cross_Validation

In [270]:
cv = LeaveOneLabelOut(labels_TRAIN)

clf = LinearSVC(random_state = 2, verbose = 1)
scores = cross_val_score(clf, X_TRAIN, Y_TRAIN, cv=cv, scoring='recall_macro')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [271]:
print "Recall Macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

Recall Macro: 0.51 (+/- 0.02)


Validate

In [272]:
clf.fit(X_TRAIN, Y_TRAIN)
predictions = clf.predict(X_VALIDATE)

[LibLinear]

In [273]:
def classifaction_report_csv(report):
    report_data = []
    lines = report.encode('ascii', 'ignore').split('\n')
    for line in (lines[2:-3] + [lines[-2]]):
        row = {}
        row_data = line.strip().split('  ')
        row_data = [x for x in row_data if x != '']
        row['class'] = row_data[0]
        row['precision'] = row_data[1]
        row['recall'] = row_data[2]
        row['f1_score'] = row_data[3]
        row['support'] = row_data[4]
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    #dataframe.to_csv('classification_report.csv', index = False)
    return dataframe

report = classification_report(Y_VALIDATE, predictions)
df = classifaction_report_csv(report)

In [290]:
print 'macro recall score: ',recall_score(Y_VALIDATE, predictions, average = 'macro')
print 'micro recall score: ',recall_score(Y_VALIDATE, predictions, average = 'micro')

macro recall score:  0.551120267316
micro recall score:  0.719420040548


In [284]:
print precision_score(Y_VALIDATE, predictions, average = 'macro')
print precision_score(Y_VALIDATE, predictions, average = 'micro')
print precision_score(Y_VALIDATE, predictions, average = 'weighted')

0.502468973882
0.719420040548
0.739626941404


In [292]:
from sklearn.metrics import accuracy_score

In [295]:
print accuracy_score(Y_VALIDATE, predictions)

0.719420040548


In [277]:
df

Unnamed: 0,class,f1_score,precision,recall,support
0,Aardappel - Friet/patat,0.90,0.85,0.96,23
1,Aardappel - hele,0.75,0.67,0.86,7
2,Aardappel - zoete,1.00,1.00,1.00,1
3,Aardappelpuree,0.00,0.00,0.00,4
4,Aardappelschijfjes/krieltjes/partjes gekruid,0.68,0.56,0.88,17
5,Aardappelschijfjes/krieltjes/partjes ongekruid,0.31,0.67,0.20,10
6,Aardappelschotel,1.00,1.00,1.00,3
7,Abdijkaas,0.00,0.00,0.00,0
8,Abrikoos (gedroogd),0.00,0.00,0.00,5
9,Afslankmaaltijden,0.00,0.00,0.00,0


- Reliable Recall Macro Score

In [326]:
report_data = []
lines = report.encode('ascii', 'ignore').split('\n')
for line in (lines[2:-3] + [lines[-2]]):
    row = {}
    row_data = line.strip().split('  ')
    row_data = [x for x in row_data if x != '']
    row['class'] = row_data[0]
    row['precision'] = row_data[1]
    row['recall'] = row_data[2]
    row['f1_score'] = row_data[3]
    row['support'] = row_data[4]
    report_data.append(row)
    
recall = []
for d in report_data:
    if d['support'].strip() != '0':
        recall.append(d['recall'])
        
n_recall = [float(x) for x in recall]
print 'macro recall score: ',sum(n_recall) / len(recall)

macro recall score:  0.611822660099
