Implementation on new data with Bubble

In [1]:
import re
import sys
import csv
import time
import json

import numpy as np
import pandas as pd

from featurize import tokenize_dict
from tqdm import tqdm_notebook
from unidecode import unidecode

from collections import defaultdict
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import LeaveOneLabelOut
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
source = 'data/product_nuts_with_product_info.jsonl'

In [3]:
data_pn_usage = []

with open(source) as f:
    for line in tqdm_notebook(f):
        data_pn_usage.append(json.loads(line))




In [4]:
id_tokens = [tokenize_dict(d) for d in data_pn_usage]

  _warn_if_not_unicode(string)


- Remove doubles
- Include only those nut products if the usage appears in at least 3 products

In [5]:
# removes doubles
known = set()
no_doubles = []

for d in tqdm_notebook(id_tokens):
    tok = ' '.join(d['tokens'])
    if tok in known: 
        continue
    no_doubles.append(d)
    known.add(tok)




In [6]:
# makes a dict where each tuple has one usage and all the product_ids linked to it
k = [{x['usage']: x['product_id']} for x in no_doubles]
dd = defaultdict(list)

for d in tqdm_notebook(k): 
    for key, value in d.iteritems():
        dd[key].append(value)




In [7]:
# checks if a usage is linked to at least 3 products
allowed_usage = []
for x in tqdm_notebook(dd.items()):
    if len(set(x[1])) > 2:
        allowed_usage.append(x[0])
        
set_allowed_usage = set(allowed_usage)

complete = []
for x in tqdm_notebook(no_doubles):
    if x['usage'] in set_allowed_usage:
        complete.append(x)





LABELS

In [8]:
labels = [x['product_id'] for x in complete]

In [9]:
K_fold = int(len(set(labels)) / 6)

one = set(list(set(labels))[:K_fold])
two = set(list(set(labels))[K_fold:(K_fold * 2)])
three = set(list(set(labels))[(2 * K_fold):(K_fold * 3)])
four = set(list(set(labels))[(3 * K_fold):(K_fold * 4)])
five = set(list(set(labels))[(4 * K_fold):(K_fold * 5)])
six = set(list(set(labels))[(5 * K_fold):])

labels_6 = []
for label in tqdm_notebook(labels):
    if label in one:
        labels_6.append(1)
    elif label in two:
        labels_6.append(2)
    elif label in three:
        labels_6.append(3)
    elif label in four:
        labels_6.append(4)
    elif label in five:
        labels_6.append(5)
    elif label in six:
        labels_6.append(6)




X

In [10]:
text_per_item_pn_TRAIN = [' '.join(tokens['tokens']) for tokens in complete if tokens['product_id'] not in six]
text_per_item_pn_VALIDATE = [' '.join(tokens['tokens']) for tokens in complete if tokens['product_id'] in six]

In [11]:
vectorizer = CountVectorizer(min_df=1, binary=True)
X_TRAIN = vectorizer.fit_transform(text_per_item_pn_TRAIN)

In [12]:
vocab = vectorizer.vocabulary_
vectorizer = CountVectorizer(min_df=1, vocabulary=vocab)
X_VALIDATE = vectorizer.fit_transform(text_per_item_pn_VALIDATE)

Y

In [13]:
Y_TRAIN = [x['usage'] for x in complete if x['product_id'] not in six]
Y_VALIDATE = [x['usage'] for x in complete if x['product_id'] in six]

In [14]:
labels_TRAIN = []
for x in enumerate(labels_6):
    if x[1] != 6:
        labels_TRAIN.append(x[1])

In [15]:
print 'X: ', X_VALIDATE.shape, 'Y: ', len(Y_VALIDATE)
print 'X: ', X_TRAIN.shape, 'Y: ',len(Y_TRAIN), 'Labels: ', len(labels_TRAIN)

X:  (21021, 26314) Y:  21021
X:  (119050, 26314) Y:  119050 Labels:  119050


Cross_Validation

In [16]:
cv = LeaveOneLabelOut(labels_TRAIN)

clf = LinearSVC(random_state = 2, verbose = 1)
scores = cross_val_score(clf, X_TRAIN, Y_TRAIN, cv=cv, scoring='recall_macro')

[LibLinear]

  'recall', 'true', average, warn_for)


[LibLinear][LibLinear][LibLinear][LibLinear]

In [17]:
print "Recall Macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

Recall Macro: 0.51 (+/- 0.01)


Validate

In [18]:
clf.fit(X_TRAIN, Y_TRAIN)
predictions = clf.predict(X_VALIDATE)

[LibLinear]

In [19]:
def classifaction_report_csv(report):
    report_data = []
    lines = report.encode('ascii', 'ignore').split('\n')
    for line in (lines[2:-3] + [lines[-2]]):
        row = {}
        row_data = line.strip().split('  ')
        row_data = [x for x in row_data if x != '']
        row['class'] = row_data[0]
        row['precision'] = row_data[1]
        row['recall'] = row_data[2]
        row['f1_score'] = row_data[3]
        row['support'] = row_data[4]
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    #dataframe.to_csv('classification_report.csv', index = False)
    return dataframe

report = classification_report(Y_VALIDATE, predictions)
df = classifaction_report_csv(report)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [20]:
print 'macro recall score: ',recall_score(Y_VALIDATE, predictions, average = 'macro')
print 'micro recall score: ',recall_score(Y_VALIDATE, predictions, average = 'micro')

macro recall score:  0.540332692954
micro recall score:  0.728652300081


- Reliable Recall Macro Score

In [21]:
report_data = []
lines = report.encode('ascii', 'ignore').split('\n')
for line in (lines[2:-3] + [lines[-2]]):
    row = {}
    row_data = line.strip().split('  ')
    row_data = [x for x in row_data if x != '']
    row['class'] = row_data[0]
    row['precision'] = row_data[1]
    row['recall'] = row_data[2]
    row['f1_score'] = row_data[3]
    row['support'] = row_data[4]
    report_data.append(row)
    
recall = []
for d in report_data:
    if d['support'].strip() != '0':
        recall.append(d['recall'])
        
n_recall = [float(x) for x in recall]
print 'macro recall score: ',sum(n_recall) / len(recall)

macro recall score:  0.613273809524
