Implementation on new data 

In [1]:
import re
import sys
import csv
import time
import json

import numpy as np
import pandas as pd

from featurize import tokenize_dict
from tqdm import tqdm_notebook
from unidecode import unidecode

from collections import defaultdict
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import LeaveOneLabelOut
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# JSON-lines input file: the loading cell opens this path and parses one JSON document per line.
source = 'data/product_nuts_with_product_info.jsonl'

In [3]:
# Read the input file line by line and decode every line as one JSON document,
# keeping the records in file order.
with open(source) as jsonl_file:
    data_pn_usage = [json.loads(raw_line) for raw_line in tqdm_notebook(jsonl_file)]




In [4]:
# Run every raw record through tokenize_dict, preserving input order.
# Later cells read the 'tokens', 'usage' and 'product_id' keys of each result.
id_tokens = list(map(tokenize_dict, data_pn_usage))




What needs to be done

- remove duplicates from the nuts products
- only include nuts products whose usage is linked to at least three products

In [5]:
# Drop duplicates: two records count as the same when their space-joined
# token strings are equal. The first occurrence is kept, later ones are skipped.
known = set()
no_doubles = []

for record in tqdm_notebook(id_tokens):
    joined_tokens = ' '.join(record['tokens'])
    if joined_tokens not in known:
        known.add(joined_tokens)
        no_doubles.append(record)




In [6]:

# Group product ids by usage: dd maps each usage to the list of product_ids
# (input order, repeats kept) of the de-duplicated records carrying that usage.
k = [{record['usage']: record['product_id']} for record in no_doubles]
dd = defaultdict(list)

for usage_to_product in tqdm_notebook(k):
    # every mapping in k holds exactly one usage -> product_id pair
    for usage, product_id in usage_to_product.items():
        dd[usage].append(product_id)




In [7]:
# A usage is allowed only when at least three distinct product ids are linked to it.
allowed_usage = [
    usage
    for usage, product_ids in tqdm_notebook(dd.items())
    if len(set(product_ids)) >= 3
]
set_allowed_usage = set(allowed_usage)

# Keep only the de-duplicated records whose usage is in the allowed set.
complete = [
    record
    for record in tqdm_notebook(no_doubles)
    if record['usage'] in set_allowed_usage
]





X

In [8]:
# One space-joined token string per kept record; this is the text the vectorizer sees.
text_per_item_pn = [' '.join(record['tokens']) for record in complete]

In [9]:
# Bag-of-words features over the joined token strings.
# binary=True records term presence (1) rather than term counts;
# min_df=1 keeps every term that appears in at least one document.
vectorizer = CountVectorizer(min_df=1, binary=True)
# X gets one row per entry of text_per_item_pn, in the same order.
X = vectorizer.fit_transform(text_per_item_pn)

Y

In [10]:
# Target labels: the usage of each kept record, in the same order as the rows of X.
Y = [record['usage'] for record in complete]

Cross-Validation

In [11]:
# Hold out 20% of the rows as a validation set; random_state=7 makes the split repeatable.
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.2, random_state=7)

In [12]:
print 'X: ', X.shape, 'Y: ', len(Y)
print 'X: ', X_train.shape, 'Y: ', len(Y_train)
print 'X: ', X_validation.shape, ' Y: ', len(Y_validation)

X:  (140071, 29395) Y:  140071
X:  (112056, 29395) Y:  112056
X:  (28015, 29395)  Y:  28015


In [13]:
# Linear SVM with a fixed seed; verbose=1 produces the "[LibLinear]" progress output.
clf = LinearSVC(random_state = 2, verbose = 1)
# 5-fold cross-validation on the training split only, scored with macro-averaged recall.
scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='recall_macro') 



[LibLinear][LibLinear][LibLinear]

  'recall', 'true', average, warn_for)


[LibLinear][LibLinear]

In [14]:
print "recall macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

recall macro: 0.77 (+/- 0.02)


Validate

In [15]:
# Fit on the whole training split, then predict the held-out validation rows.
clf.fit(X_train, Y_train)
predictions = clf.predict(X_validation)

[LibLinear]

In [16]:
def classifaction_report_csv(report):
    """Parse the text of sklearn's ``classification_report`` into a DataFrame.

    Parameters
    ----------
    report : str or unicode
        Report text. The slicing below assumes this layout: a header line,
        a blank line, one row per class, a blank line, one summary row and
        a trailing newline. Non-ASCII characters are dropped before parsing.

    Returns
    -------
    pandas.DataFrame
        One row per class followed by the summary row, with the columns
        'class', 'precision', 'recall', 'f1_score' and 'support'. Every
        value is kept as the string found in the report. An empty report
        yields an empty DataFrame.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than five whitespace-separated fields.
    """
    ascii_report = report.encode('ascii', 'ignore')
    if not isinstance(ascii_report, str):
        # Python 3: encode() returned bytes; go back to text so the str
        # methods below accept str separators. On Python 2 this is a no-op.
        ascii_report = ascii_report.decode('ascii')
    lines = ascii_report.split('\n')

    report_data = []
    # lines[2:-3] are the per-class rows; lines[-2:-1] is the summary row
    # (a slice rather than lines[-2] so an empty report does not raise).
    for line in (lines[2:-3] + lines[-2:-1]):
        line = line.strip()
        if not line:
            # blank separator rows carry no data
            continue
        # The four metric columns are the last four whitespace-separated
        # fields; whatever precedes them is the class label, which may
        # itself contain single or repeated spaces.
        row_data = line.rsplit(None, 4)
        row = {}
        row['class'] = row_data[0]
        row['precision'] = row_data[1]
        row['recall'] = row_data[2]
        row['f1_score'] = row_data[3]
        row['support'] = row_data[4]
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    # Writing to disk is left to the caller, e.g.
    # dataframe.to_csv('classification_report.csv', index=False)
    return dataframe

# Text report of the per-class metrics on the validation split; `report` is
# parsed again by the macro-recall cell further down.
report = classification_report(Y_validation, predictions)
# The same report as a DataFrame of string-valued columns.
df = classifaction_report_csv(report)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [17]:
print 'macro recall score: ', recall_score(Y_validation, predictions, average = 'macro')
print 'micro recall score: ', recall_score(Y_validation, predictions, average = 'micro')

macro recall score:  0.77511104049
micro recall score:  0.859539532393


- Reliable macro recall score (classes with no samples in the validation set are left out)

In [18]:
report_data = []
lines = report.encode('ascii', 'ignore').split('\n')
for line in (lines[2:-3] + [lines[-2]]):
    row = {}
    row_data = line.strip().split('  ')
    row_data = [x for x in row_data if x != '']
    row['class'] = row_data[0]
    row['precision'] = row_data[1]
    row['recall'] = row_data[2]
    row['f1_score'] = row_data[3]
    row['support'] = row_data[4]
    report_data.append(row)
    
    
recall = []
for d in report_data:
    if d['support'].strip() != '0':
        recall.append(d['recall'])
        
n_recall = [float(x) for x in recall]
print 'macro recall score: ', sum(n_recall) / len(recall)

macro recall score:  0.788133086876
