## W266 Transfer learning Project by Arunima Kayath, Anamika Sinha

In [1]:
import pandas as pd
import gzip
import time
# Install a few python packages using pip
from common import utils
utils.require_package('nltk')
utils.require_package("wget")      # for fetching dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_curve, auc
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

In [2]:
# Standard python helper libraries.
from __future__ import print_function
from __future__ import division
import os, sys, time
import collections
import itertools

# Numerical manipulation libraries.
import numpy as np
from scipy import stats, optimize

import nltk
nltk.download('punkt')
from nltk import word_tokenize

# Helper libraries
from common import utils, vocabulary
#import segment

[nltk_data] Downloading package punkt to
[nltk_data]     /home/reachanamikasinha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#Function to read the amazon review data files
def parse(path):
  print('start parse')
  start_parse = time.time()
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)
  end_parse = time.time()
  print('end parse with time for parse',end_parse - start_parse)

def getDF(path):
  print('start getDF')
  start = time.time()
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  print('end getDF')
  end = time.time()
  print('time taken to load data = ',end-start)
  return pd.DataFrame.from_dict(df, orient='index')
#df = getDF('reviews_Toys_and_Games.json.gz') #old def function corresponding to the step bt step vectorization

In [4]:
df_vid = getDF('reviews_Video_Games.json.gz')
df_toys = getDF('reviews_Toys_and_Games.json.gz')

start getDF
start parse
end parse with time for parse 70.40680766105652
end getDF
time taken to load data =  70.40703272819519
start getDF
start parse
end parse with time for parse 107.14671397209167
end getDF
time taken to load data =  107.14689898490906


In [5]:
df_aut = getDF('reviews_Automotive.json.gz')

start getDF
start parse
end parse with time for parse 61.8460009098053
end getDF
time taken to load data =  61.84651064872742


In [6]:
df_hnk = getDF('reviews_Home_and_Kitchen.json.gz')

start getDF
start parse
end parse with time for parse 209.0273153781891
end getDF
time taken to load data =  209.0276162624359


### Preparing data for modeling. 
Train,dev,test split.
Create similar sized data subsets.

In [7]:
#Create train,dev,test split
from sklearn.model_selection import train_test_split
train_toys,devtest = train_test_split(df_toys, test_size=0.4, random_state=42)
dev_toys,test_toys = train_test_split(devtest,test_size = 0.5,random_state=42)
print('Toy reviews train, dev and test set dataframe shape:',train_toys.shape,dev_toys.shape,test_toys.shape)

#For Video games reviews
train_vid,devtest = train_test_split(df_vid, test_size=0.4, random_state=42)
dev_vid,test_vid = train_test_split(devtest,test_size = 0.5, random_state=42)
print('Video games reviews train, dev and test set dataframe shape:',train_vid.shape,dev_vid.shape,test_vid.shape)

#For Auto reviews
train_aut,devtest = train_test_split(df_aut, test_size=0.4, random_state=42)
dev_aut,test_aut = train_test_split(devtest,test_size = 0.5, random_state=42)
print('Auto reviews train, dev and test set dataframe shape:',train_aut.shape,dev_aut.shape,test_aut.shape)

#For Home and Kitchen reviews
train_hnk,devtest = train_test_split(df_hnk, test_size=0.4, random_state=42)
dev_hnk,test_hnk = train_test_split(devtest,test_size = 0.5, random_state=42)
print('Home and Kitchen reviews train, dev and test set dataframe shape:',train_hnk.shape,dev_hnk.shape,test_hnk.shape)

Toy reviews train, dev and test set dataframe shape: (1351662, 9) (450554, 9) (450555, 9)
Video games reviews train, dev and test set dataframe shape: (794851, 9) (264951, 9) (264951, 9)
Auto reviews train, dev and test set dataframe shape: (824260, 9) (274754, 9) (274754, 9)
Home and Kitchen reviews train, dev and test set dataframe shape: (2552355, 9) (850785, 9) (850786, 9)


In [8]:
#Function to create a smaller sized train and dev data set. Enables testing accuracy for different sizes.
#Also binarizes the labels. Ratings of 1,2 and to 0; Ratings of 4,5 to 1.

def set_df_size(size,data_train,data_dev):
    size_train = size
    len_max_train = data_train[data_train.overall!=3].shape[0] #max possible length of train data set taking out the 3 ratings.
    #print("Number of reviews with ratings != 3 in train set",len_max_train)
    temp_size_train = min(len_max_train,size_train)

    len_max_dev = data_dev[data_dev.overall!=3].shape[0]
    #print("Number of reviews with ratings != 3 in dev set",len_max_dev)
    temp_size_dev = min(len_max_dev,int(0.3*temp_size_train)) #making the dev set about 0.3 times the train set.

    temp_train_data = data_train[data_train.overall != 3][:temp_size_train]
    #print('Size of train data',temp_train_data.shape)
    #print(temp_train_data.groupby('overall').count())
    #print(temp_train_toys[:5])

    temp_dev_data = data_dev[data_dev.overall!=3][:temp_size_dev]
    #print('Size of dev data',temp_dev_data.shape)
    #print(temp_dev_data.groupby('overall').count())
    #print(temp_dev_data[:2])
    
    #Binarize ratings
    temp_train_y = np.zeros(temp_size_train)
    temp_train_y[temp_train_data.overall > 3] = 1
    temp_dev_y = np.zeros(temp_size_dev)
    temp_dev_y[temp_dev_data.overall>3] = 1
    #print('binarized y shape',temp_train_y.shape,temp_dev_y.shape)
    #print(temp_dev_y[:20],data_dev.overall[:20])
    return temp_train_data,temp_dev_data,temp_train_y,temp_dev_y

# Straight go to Cell 13 if you want to run on different sample sizes 

In [10]:
#Create smaller dataframes of desired size = size_train for each dataset, and binarize the ratings.
list_df = ['toys','vid','aut','hnk'] #list of keys that refer to each dataframe. Adding a new dataframe would require updating this list
size_train = 150000 #Set size of train set here. This is a hyperparameter.

dict_train_df = {} #Dict to store train input data frame for each domain, can be accessed by using domain name as key
dict_dev_df = {} #Dict to store dev input data frame for each domain, can be accessed by using domain name as key
dict_train_y = {} #Dict to store binarized train data label for each domain
dict_dev_y = {} #Dict to store binarized dev data label for each domain
key = list_df[0]
print('Toys reviews\n')
dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_toys,dev_toys)
print('\n Video games reviews\n')
key = list_df[1]
dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_vid,dev_vid)
print('\n Auto reviews\n')
key = list_df[2]
dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_aut,dev_aut)
print('\n Home and Kitchen reviews\n')
key = list_df[3]
dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_hnk,dev_hnk)

Toys reviews


 Video games reviews


 Auto reviews


 Home and Kitchen reviews



### Naive Bayes Baseline.
Note : First the reviews are converted to word id sparse vectors using CountVectorizer. Then we apply the Naive Bayes model to this.   
The vocabulary,words ids (using CountVectorizer) and the Naive Bayes model is created using the source domain, and then applied to the target domain.    
There is an argument for creating the vocabulary, word ids using both domains since that only uses the unlabeled data, and then fit the Naive Bayes model using only the source domain.   
We may try that in the future. Would love input on whether it is worth trying that.   

In [12]:
#Converting reviews to sparse matrix of word ids with count vectorizer, and using Naive Bayes to make the prediction.
#This section also creates the count_vectorizer and Naive Bayes models for each domain to be used to test transfer learning

from sklearn.naive_bayes import MultinomialNB
dict_vectorizers = {} #Dict to store the count_vectorizer model developed on each domain
dict_train_ids = {} #Dict to store train data reviews as sparse matrix of word ids
dict_dev_ids = {} #Dict to store dev data reviews as sparse matrix of word ids
dict_nb = {} #Dict to store naive bayes model developed on each domain. Assumes input features are developed using the corresponding count_vectorizer
dict_dev_ypred = {} #Dict to store dev predictions
for key in list_df:
    
    #Converting ratings to tokenized word id counts as a sparse matrix using count_vectorizer
    dict_vectorizers[key] = CountVectorizer()
    dict_train_ids[key] = dict_vectorizers[key].fit_transform(dict_train_df[key].reviewText)
    dict_dev_ids[key] = dict_vectorizers[key].transform(dict_dev_df[key].reviewText)
    print("Number words in training corpus for",key,len(dict_vectorizers[key].get_feature_names()))
    print(key,'dataset id shapes',dict_train_ids[key].shape, dict_dev_ids[key].shape)
    
    #Building a Naive Bayes model to predict the ratings
    dict_nb[key] = MultinomialNB()
    dict_nb[key].fit(dict_train_ids[key],dict_train_y[key])
    dict_dev_ypred[key] = dict_nb[key].predict(dict_dev_ids[key])
    acc = accuracy_score(dict_dev_y[key], dict_dev_ypred[key])
    print("Accuracy on",key,"dev set for binary prediction with toys naive bayes model: {:.02%}".format(acc))
    print('Corresponding classification report\n',classification_report(dict_dev_y[key], dict_dev_ypred[key]))

Number words in training corpus for toys 63968
toys dataset id shapes (100000, 63968) (30000, 63968)
Accuracy on toys dev set for binary prediction with toys naive bayes model: 92.34%
Corresponding classification report
              precision    recall  f1-score   support

        0.0       0.74      0.75      0.74      4465
        1.0       0.96      0.95      0.95     25535

avg / total       0.92      0.92      0.92     30000

Number words in training corpus for vid 99366
vid dataset id shapes (100000, 99366) (30000, 99366)
Accuracy on vid dev set for binary prediction with toys naive bayes model: 89.32%
Corresponding classification report
              precision    recall  f1-score   support

        0.0       0.72      0.70      0.71      5642
        1.0       0.93      0.94      0.93     24358

avg / total       0.89      0.89      0.89     30000

Number words in training corpus for aut 59113
aut dataset id shapes (100000, 59113) (30000, 59113)
Accuracy on aut dev set for bina

# Next two cells are Naive bayes transfer learning  in baseline version without count vectorizer min_df=5, max_df=0.8, stop_words='english'

In [47]:

#Accuracy of transfer learning

# dict_transfer_ids = {} #Dictionary to store the dev vector ids for dataframe A(df) using the count_vectorizer of dataframe B(vect)
# transfer_results = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store accuracy on transfer. Col = Model, row = dataframe
# for vectKey in list_df:
#     dict_transfer_ids[vectKey] = {}
#     #print('vectKey',vectKey)
#     for dfKey in list_df:
#         #print('dfKey',dfKey)
#         dict_transfer_ids[vectKey][dfKey] = dict_vectorizers[vectKey].transform(dict_dev_df[dfKey].reviewText)
#         #print(dfKey,'dataset using ',vectKey,' count vectorizer, id shapes',dict_transfer_ids[vectKey][dfKey].shape)
#         dict_dev_ypred = dict_nb[vectKey].predict(dict_transfer_ids[vectKey][dfKey])
#         acc = accuracy_score(dict_dev_y[dfKey], dict_dev_ypred)
#         #print("Accuracy on ",dfKey," dev set for binary prediction with ", vectKey," naive bayes model: {:.02%}".format(acc))
#         transfer_results[vectKey][dfKey] = acc

# print("Effectiveness of transfer learning with Naive Bayes:")
# print("Accuracy of rating predictions")
# print("Colums = source domain, Rows = target domain\n")
# print(transfer_results.to_string(float_format = '{:.01%}'.format))

Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 92.3% 91.4% 91.2% 91.3%
vid  87.0% 89.3% 87.8% 87.3%
aut  75.6% 79.1% 92.1% 84.0%
hnk  84.9% 85.8% 91.1% 91.6%


In [48]:
# #Calculating and displaying as transfer loss
# transfer_loss = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store loss in accuracy on transfer. Col = Model, row = dataframe
# for A in list_df:
#     for B in list_df:
#         transfer_loss[A][B] = transfer_results[B][B] - transfer_results[A][B]
# print("Transfer loss on rating predictions")
# print("Colums = source domain, Rows = target domain\n")
# print(transfer_loss.to_string(float_format = '{:.01%}'.format))

Transfer loss on rating predictions
Colums = source domain, Rows = target domain

      toys   vid  aut  hnk
toys  0.0%  0.9% 1.1% 1.0%
vid   2.3%  0.0% 1.5% 2.1%
aut  16.4% 12.9% 0.0% 8.0%
hnk   6.7%  5.8% 0.5% 0.0%


# Start here if you want to run different sample sizes. Below code is for error analysis  with different sample sizes. It does not have min_df=5, max_df=0.8, stop_words='english'

In [11]:
list_df = ['toys','vid','aut','hnk'] #list of keys that refer to each dataframe. Adding a new dataframe would require updating this list
dict_train_df = {} #Dict to store train input data frame for each domain, can be accessed by using domain name as key
dict_dev_df = {} #Dict to store dev input data frame for each domain, can be accessed by using domain name as key
dict_train_y = {} #Dict to store binarized train data label for each domain
dict_dev_y = {} #Dict to store binarized dev data label for each domain
#print(len(dict_train_df))

def create_sized_data(size = 100000):
    size_train = size #Set size of train set here. This is a hyperparameter.
    key = list_df[0]
    #print('Toys reviews\n')
    dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_toys,dev_toys)
    #print('\n Video games reviews\n')
    key = list_df[1]
    dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_vid,dev_vid)
    #print('\n Auto reviews\n')
    key = list_df[2]
    dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_aut,dev_aut)
    #print('\n Home and Kitchen reviews\n')
    key = list_df[3]
    dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_hnk,dev_hnk)
    
create_sized_data(150000)
#print(len(dict_train_df))

In [15]:
#Converting reviews to sparse matrix of word ids with count vectorizer, and using Naive Bayes to make the prediction.
#This section also creates the count_vectorizer and Naive Bayes models for each domain to be used to test transfer learning

from sklearn.naive_bayes import MultinomialNB
dict_vectorizers = {} #Dict to store the count_vectorizer model developed on each domain
dict_train_ids = {} #Dict to store train data reviews as sparse matrix of word ids
dict_dev_ids = {} #Dict to store dev data reviews as sparse matrix of word ids
dict_nb = {} #Dict to store naive bayes model developed on each domain. Assumes input features are developed using the corresponding count_vectorizer
dict_dev_ypred = {} #Dict to store dev predictions
dict_dev_ypred_proba = {} ##Dict to store dev predictions probablities
def create_base_NB_models():
    for key in list_df:

        #Converting ratings to tokenized word id counts as a sparse matrix using count_vectorizer
        dict_vectorizers[key] = CountVectorizer(min_df=5, max_df=0.8, stop_words='english')
        dict_train_ids[key] = dict_vectorizers[key].fit_transform(dict_train_df[key].reviewText)
        dict_dev_ids[key] = dict_vectorizers[key].transform(dict_dev_df[key].reviewText)
        #print("Number words in training corpus for",key,len(dict_vectorizers[key].get_feature_names()))
        #print(key,'dataset id shapes',dict_train_ids[key].shape, dict_dev_ids[key].shape)

        #Building a Naive Bayes model to predict the ratings
        dict_nb[key] = MultinomialNB()
        dict_nb[key].fit(dict_train_ids[key],dict_train_y[key])
        dict_dev_ypred[key] = dict_nb[key].predict(dict_dev_ids[key])
        acc = accuracy_score(dict_dev_y[key], dict_dev_ypred[key])
        #print("Accuracy on",key,"dev set for binary prediction with toys naive bayes model: {:.02%}".format(acc))

def print_base_NB_details():
    for key in list_df:
      print('Classification report for',key,'\n',classification_report(dict_dev_y[key], dict_dev_ypred[key]))  
        
#create_base_NB_models()

In [12]:
#Define Sample size and create dataframes to store  transfer metrics results

#Sample_size = [500000, 100000]
Sample_size = [10000, 50000,100000, 200000]
num_classes = [0,1]
list_df = ['toys','vid','aut','hnk']
list_auc_df = deepcopy(list_df)
list_auc_df.insert(0,"Sample_size")
list_auc_df.insert(1,"Target_Domain")


list_acc_df = deepcopy(list_df)
list_acc_df.insert(0,"Sample_size")
list_acc_df.insert(1,"Target_Domain")


list_f1_df = deepcopy(list_df)
list_f1_df.insert(0,"Sample_size")
list_f1_df.insert(1,"Target_Domain")
list_f1_df.insert(2,"Class")

print(list_auc_df )
print(list_acc_df )
print(list_f1_df )
    
len_auc_df = len(Sample_size)*len(list_df)
index_auc_df = list(range(1, len_auc_df+1))
print(index_auc_df)
    
len_acc_df = len(Sample_size)*len(list_df)
index_acc_df = list(range(1, len_acc_df+1))
print(index_acc_df)

len_f1_df = len(Sample_size)*len(list_df)*len(num_classes)
index_f1_df = list(range(1, len_f1_df+1))
print(index_f1_df)

transfer_auc_results = pd.DataFrame(index=index_auc_df,columns=list_auc_df) #Dataframe to store auc on transfer. Col = Model, row = dataframe
transfer_acc_results = pd.DataFrame(index=index_acc_df,columns=list_acc_df) #Dataframe to store accuracy on transfer. 
transfer_f1_results = pd.DataFrame(index=index_f1_df,columns=list_f1_df) #Dataframe to store accuracy on transfer. 
#print(transfer_auc_results)
#print(transfer_f1_results)

transfer_auc_loss = pd.DataFrame(index=index_auc_df,columns=list_auc_df) #Dataframe to store auc on transfer. Col = Model, row = dataframe
transfer_acc_loss = pd.DataFrame(index=index_acc_df,columns=list_acc_df) #Dataframe to store acc on transfer. 
transfer_f1_loss = pd.DataFrame(index=index_f1_df,columns=list_f1_df) #Dataframe to store f1 on transfer. 


    


['Sample_size', 'Target_Domain', 'toys', 'vid', 'aut', 'hnk']
['Sample_size', 'Target_Domain', 'toys', 'vid', 'aut', 'hnk']
['Sample_size', 'Target_Domain', 'Class', 'toys', 'vid', 'aut', 'hnk']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]


In [13]:
#Transfer learning error analysis using AUC, Accuracy and f1 metrics

dict_transfer_ids = {} #Dictionary to store the dev vector ids for dataframe A(df) using the count_vectorizer of dataframe B(vect)




def estimate_transfer_accuracy(size=5000):
    
#     for n in range(1,len(list_df)+1):
#         transfer_auc_results.set_value(index_auc_df[n], list_auc_df[0], size)
    
    for vectKey in list_df:
        
        dict_transfer_ids[vectKey] = {}
        #dict_transfer_results[vectKey] = {}
        

        n = Sample_size.index(size)*len(list_df)
        j = Sample_size.index(size)*len(list_df)*len(num_classes)
                
        #print('vectKey',vectKey)
        for dfKey in list_df:
        #print('dfKey',dfKey)
        
            
            dict_transfer_ids[vectKey][dfKey] = dict_vectorizers[vectKey].transform(dict_dev_df[dfKey].reviewText)
            #print(dfKey,'dataset using ',vectKey,' count vectorizer, id shapes',dict_transfer_ids[vectKey][dfKey].shape)
            dict_dev_ypred = dict_nb[vectKey].predict(dict_transfer_ids[vectKey][dfKey])
            dict_dev_ypred_proba = dict_nb[vectKey].predict_proba(dict_transfer_ids[vectKey][dfKey])
            
            ##AUC Calculations
            #false_pos_rate, true_pos_rate, _ = roc_curve(dict_dev_y[dfKey], dict_dev_ypred)
            
            #roc_auc = auc(false_pos_rate, true_pos_rate)
            
            false_pos_rate, true_pos_rate, _ = roc_curve(dict_dev_y[dfKey], dict_dev_ypred_proba[:,1])
            roc_auc = auc(false_pos_rate, true_pos_rate)
            
            transfer_auc_results.set_value(index_auc_df[n], list_auc_df[0], size)
            transfer_auc_results.set_value(index_auc_df[n], list_auc_df[1], dfKey)
            transfer_auc_results.set_value(index_auc_df[n], vectKey, roc_auc)
            
            
            ##Accuracy calculations
            acc = accuracy_score(dict_dev_y[dfKey], dict_dev_ypred)
            
            transfer_acc_results.set_value(index_auc_df[n], list_auc_df[0], size)
            transfer_acc_results.set_value(index_auc_df[n], list_auc_df[1], dfKey)
            transfer_acc_results.set_value(index_auc_df[n], vectKey, acc)
            
            ##f1 score calcullations
            f1_val_pos = (f1_score(dict_dev_y[dfKey], dict_dev_ypred,  average=None))[1]#pos class
        
            f1_val_neg = (f1_score(dict_dev_y[dfKey], dict_dev_ypred,  average=None))[0]#neg class
            transfer_f1_results.set_value(index_f1_df[j], list_f1_df[0], size)
            transfer_f1_results.set_value(index_f1_df[j], list_f1_df[1], dfKey)
            transfer_f1_results.set_value(index_f1_df[j], list_f1_df[2], "1")
            transfer_f1_results.set_value(index_f1_df[j], vectKey, f1_val_pos)
            j +=1
            transfer_f1_results.set_value(index_f1_df[j], list_f1_df[0], size)
            transfer_f1_results.set_value(index_f1_df[j], list_f1_df[1], dfKey)
            transfer_f1_results.set_value(index_f1_df[j], list_f1_df[2], "0")
            transfer_f1_results.set_value(index_f1_df[j], vectKey, f1_val_neg)
            j +=1
            #print(n)
            n += 1
        

In [14]:
#Calculating and displaying as transfer loss with Naive Bayes

def estimate_transfer_loss(size=5000):
   
    for vectKey in list_df:
        
        

        x = Sample_size.index(size)*len(list_df)
        y = Sample_size.index(size)*len(list_df)*len(num_classes)
                
        #print('vectKey',vectKey)
        for dfKey in list_df:
        #print('dfKey',dfKey)
        
            

            #AUC loss calculation
            pos_source = transfer_auc_results.columns.get_loc(dfKey)
            pos_target = transfer_auc_results.columns.get_loc(vectKey)
            transfer_loss = transfer_auc_results.iloc[index_auc_df[x]-1,pos_source] - transfer_auc_results.iloc[index_auc_df[x]-1,pos_target]

            transfer_auc_loss.set_value(index_auc_df[x], list_auc_df[0], size)
            transfer_auc_loss.set_value(index_auc_df[x], list_auc_df[1], dfKey)
            transfer_auc_loss.set_value(index_auc_df[x], vectKey, transfer_loss)
            
            
            #ACC loss calculation
            pos_source = transfer_acc_results.columns.get_loc(dfKey)
            pos_target = transfer_acc_results.columns.get_loc(vectKey)
            transfer_loss = transfer_acc_results.iloc[index_acc_df[x]-1,pos_source] - transfer_acc_results.iloc[index_acc_df[x]-1,pos_target]

            transfer_acc_loss.set_value(index_acc_df[x], list_acc_df[0], size)
            transfer_acc_loss.set_value(index_acc_df[x], list_acc_df[1], dfKey)
            transfer_acc_loss.set_value(index_acc_df[x], vectKey, transfer_loss)
            x += 1
            
            #f1 loss calculation
            pos_source = transfer_f1_results.columns.get_loc(dfKey)
            pos_target = transfer_f1_results.columns.get_loc(vectKey)
            transfer_loss = transfer_f1_results.iloc[index_f1_df[y]-1,pos_source] - transfer_f1_results.iloc[index_f1_df[y]-1,pos_target]
            transfer_f1_loss.set_value(index_f1_df[y], list_f1_df[0], size)
            transfer_f1_loss.set_value(index_f1_df[y], list_f1_df[1], dfKey)
            transfer_f1_loss.set_value(index_f1_df[y], list_f1_df[2], "1")
            transfer_f1_loss.set_value(index_f1_df[y], vectKey, transfer_loss)
            y +=1
            transfer_loss = transfer_f1_results.iloc[index_f1_df[y]-1,pos_source] - transfer_f1_results.iloc[index_f1_df[y]-1,pos_target]
            transfer_f1_loss.set_value(index_f1_df[y], list_f1_df[0], size)
            transfer_f1_loss.set_value(index_f1_df[y], list_f1_df[1], dfKey)
            transfer_f1_loss.set_value(index_f1_df[y], list_f1_df[2], "0")
            transfer_f1_loss.set_value(index_f1_df[y], vectKey, transfer_loss)
            y +=1
            
            #print(y)
          

# estimate_transfer_loss(5000)
# print(transfer_auc_results.to_string(float_format = '{:.01%}'.format))


In [16]:
for size in Sample_size:
    print("\n Train data_set size =",size)
    create_sized_data(size = size)
    create_base_NB_models()
    estimate_transfer_accuracy(size)
    #print(transfer_auc_results.to_string(float_format = '{:.01%}'.format))
    estimate_transfer_loss(size)
    
print(" ")        
print(" ")        
print(" ")   
print("Effectiveness of transfer learning with Naive Bayes:")
print("Area Under Curve values of rating predictions")
print("Columns = source domain, Rows = target domain\n")
print(transfer_auc_results.to_string(float_format = '{:.01%}'.format))
print(" ")        
print(" ")        
print(transfer_auc_loss.to_string(float_format = '{:.01%}'.format))
print(" ")        
print(" ") 
print("Effectiveness of transfer learning with Naive Bayes:")
print("Accuracy values of rating predictions")
print("Columns = source domain, Rows = target domain\n")
print(transfer_acc_results.to_string(float_format = '{:.01%}'.format))
print(" ")        
print(" ") 
print("Effectiveness of transfer learning with Naive Bayes:")
print("Accuracy values of rating predictions")
print("Columns = source domain, Rows = target domain\n")
print(transfer_acc_loss.to_string(float_format = '{:.01%}'.format))


print(" ")        
print(" ")  
print("Effectiveness of transfer learning with Naive Bayes:")
print("f1 values of rating predictions")
print("Columns = source domain, Rows = target domain\n")
print(transfer_f1_results.to_string(float_format = '{:.01%}'.format))

print(" ")        
print(" ")  
print("Effectiveness of transfer learning with Naive Bayes:")
print("f1 transfer loss of rating predictions")
print("Columns = source domain, Rows = target domain\n")
print(transfer_f1_loss.to_string(float_format = '{:.01%}'.format))
        
    #estimate_transfer_loss()


 Train data_set size = 10000

 Train data_set size = 50000

 Train data_set size = 100000

 Train data_set size = 200000
 
 
 
Effectiveness of transfer learning with Naive Bayes:
Area Under Curve values of rating predictions
Columns = source domain, Rows = target domain

   Sample_size Target_Domain  toys   vid   aut   hnk
1        10000          toys 91.5% 88.2% 87.6% 89.4%
2        10000           vid 84.9% 87.5% 86.3% 85.5%
3        10000           aut 85.6% 85.3% 91.4% 89.1%
4        10000           hnk 89.5% 88.2% 90.3% 91.2%
5        50000          toys 93.3% 91.2% 91.5% 91.4%
6        50000           vid 84.3% 89.4% 87.2% 85.6%
7        50000           aut 86.3% 84.4% 92.1% 88.0%
8        50000           hnk 90.7% 89.7% 92.2% 92.7%
9       100000          toys 93.7% 91.8% 91.9% 92.0%
10      100000           vid 85.0% 89.7% 87.9% 86.0%
11      100000           aut 86.1% 84.8% 92.4% 88.1%
12      100000           hnk 90.9% 90.0% 92.7% 93.2%
13      200000          toys 93.7% 91

### SVM Baseline
Similar approach to Naive Bayes, except we used SVM with linear kernel here. One more important distiction - we found that the SVM with full vocabulary was taking too long to run. Hence we added min_df = 5, and max_df = 0.8, and stop words = English as parameters to CountVectorizer to reduce the number of vocabulary words being considered to the most relevant and hence reduce the SVM dimensions.

In [12]:
# We use the tfid to give less weigthage to words occuring more frequently. At the same time, to get 
#rid of prepositions which are not helping with context, we use stop words.
from sklearn import svm

dict_vectorizers = {} #Dict to store the count_vectorizer model developed on each domain
dict_train_ids = {} #Dict to store train data reviews as sparse matrix of word ids
dict_dev_ids = {} #Dict to store dev data reviews as sparse matrix of word ids
dict_svm = {} #Dict to store svm model developed on each domain. Assumes input features are developed using the corresponding count_vectorizer
dict_dev_ypred = {} #Dict to store dev predictions
for key in list_df:
    
    #Converting ratings to tokenized word id counts as a sparse matrix using count_vectorizer
    dict_vectorizers[key] = TfidfVectorizer(min_df=5, max_df=0.8, stop_words='english')
    dict_train_ids[key] = dict_vectorizers[key].fit_transform(dict_train_df[key].reviewText)
    dict_dev_ids[key] = dict_vectorizers[key].transform(dict_dev_df[key].reviewText)
    print("Number words in training corpus for",key,len(dict_vectorizers[key].get_feature_names()))
    print(key,'dataset id shapes',dict_train_ids[key].shape, dict_dev_ids[key].shape)
    
    start =time.time()
    #Building an SVM model to predict the ratings
    dict_svm[key] = svm.SVC(kernel='linear')
    dict_svm[key].fit(dict_train_ids[key],dict_train_y[key])
    print(key, " Training done")
    
    dict_dev_ypred[key] = dict_svm[key].predict(dict_dev_ids[key])
    print(key, " Prediction done")
    
    acc = accuracy_score(dict_dev_y[key], dict_dev_ypred[key])
    print("Accuracy on",key,"dev set for binary prediction with ", key, "SVM model: {:.02%}".format(acc))
    print('Corresponding classification report\n',classification_report(dict_dev_y[key], dict_dev_ypred[key]))
    stop = time.time()
    print("Time for ", key, ': ',stop-start)
    
    


Number words in training corpus for toys 21335
toys dataset id shapes (150000, 21335) (45000, 21335)
toys  Training done
toys  Prediction done
Accuracy on toys dev set for binary prediction with  toys SVM model: 93.77%
Corresponding classification report
              precision    recall  f1-score   support

        0.0       0.85      0.71      0.77      6730
        1.0       0.95      0.98      0.96     38270

avg / total       0.94      0.94      0.94     45000

Time for  toys :  3775.540815591812
Number words in training corpus for vid 30301
vid dataset id shapes (150000, 30301) (45000, 30301)
vid  Training done
vid  Prediction done
Accuracy on vid dev set for binary prediction with  vid SVM model: 91.99%
Corresponding classification report
              precision    recall  f1-score   support

        0.0       0.84      0.71      0.77      8450
        1.0       0.93      0.97      0.95     36550

avg / total       0.92      0.92      0.92     45000

Time for  vid :  6111.884172

In [14]:
#Accuracy of transfer learning with svm

dict_transfer_ids = {} #Dictionary to store the dev vector ids for dataframe A(df) using the count_vectorizer of dataframe B(vect)
transfer_results = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store accuracy on transfer. Col = Model, row = dataframe
transfer_results_f1_pos = pd.DataFrame(index=list_df,columns=list_df)
transfer_results_f1_neg = pd.DataFrame(index=list_df,columns=list_df)

for vectKey in list_df:
    dict_transfer_ids[vectKey] = {}
    #print('vectKey',vectKey)
    for dfKey in list_df:
        #print('dfKey',dfKey)
        dict_transfer_ids[vectKey][dfKey] = dict_vectorizers[vectKey].transform(dict_dev_df[dfKey].reviewText)
        #print(dfKey,'dataset using ',vectKey,' count vectorizer, id shapes',dict_transfer_ids[vectKey][dfKey].shape)
        dict_dev_ypred = dict_svm[vectKey].predict(dict_transfer_ids[vectKey][dfKey])
        acc = accuracy_score(dict_dev_y[dfKey], dict_dev_ypred)
        #print("Accuracy on ",dfKey," dev set for binary prediction with ", vectKey," naive bayes model: {:.02%}".format(acc))
        transfer_results[vectKey][dfKey] = acc
        
        #Adding f1 score 
        f1_val_pos = (f1_score(dict_dev_y[dfKey],dict_dev_ypred,  average=None))[1]#pos class
       
        f1_val_neg = (f1_score(dict_dev_y[dfKey],dict_dev_ypred,  average=None))[0]#neg class
        transfer_results_f1_pos[vectKey][dfKey] = f1_val_pos
        transfer_results_f1_neg[vectKey][dfKey] = f1_val_neg


print("Effectiveness of transfer learning with SVM:")
print("Accuracy of rating predictions")
print("Colums = source domain, Rows = target domain\n")
print(transfer_results.to_string(float_format = '{:.01%}'.format))

Effectiveness of transfer learning with SVM:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 93.8% 92.5% 92.3% 92.7%
vid  89.7% 92.0% 87.6% 88.7%
aut  90.5% 90.0% 92.5% 91.3%
hnk  91.6% 90.9% 91.8% 93.0%


In [15]:
print(transfer_results_f1_pos.to_string(float_format = '{:.01%}'.format))
print(transfer_results_f1_neg.to_string(float_format = '{:.01%}'.format))

      toys   vid   aut   hnk
toys 96.4% 95.7% 95.6% 95.8%
vid  93.8% 95.2% 92.8% 93.4%
aut  94.5% 94.2% 95.7% 94.9%
hnk  95.0% 94.7% 95.2% 95.8%
      toys   vid   aut   hnk
toys 77.3% 71.2% 70.6% 73.3%
vid  67.7% 76.9% 55.3% 62.4%
aut  66.3% 63.6% 71.0% 68.5%
hnk  73.0% 70.1% 71.8% 77.6%


In [19]:
#Calculating and displaying as transfer loss with svm
transfer_loss = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store loss in accuracy on transfer. Col = Model, row = dataframe
transfer_loss_f1_pos = pd.DataFrame(index=list_df,columns=list_df) 
transfer_loss_f1_neg = pd.DataFrame(index=list_df,columns=list_df) 
for A in list_df:
    for B in list_df:
        transfer_loss[A][B] = transfer_results[B][B] - transfer_results[A][B]
        transfer_loss_f1_pos[A][B] = transfer_results_f1_pos[B][B] - transfer_results_f1_pos[A][B]
        transfer_loss_f1_neg[A][B] = transfer_results_f1_neg[B][B] - transfer_results_f1_neg[A][B]
print("Transfer loss on rating predictions with SVM")
print("Colums = source domain, Rows = target domain\n")
print(transfer_loss.to_string(float_format = '{:.01%}'.format))
print("")
print("f1 pos")
print(transfer_loss_f1_pos.to_string(float_format = '{:.01%}'.format))
print("")
print("f1 neg")
print(transfer_loss_f1_neg.to_string(float_format = '{:.01%}'.format))

Transfer loss on rating predictions with SVM
Colums = source domain, Rows = target domain

     toys  vid  aut  hnk
toys 0.0% 1.3% 1.5% 1.1%
vid  2.3% 0.0% 4.4% 3.2%
aut  2.0% 2.5% 0.0% 1.3%
hnk  1.4% 2.1% 1.2% 0.0%

f1 pos
     toys  vid  aut  hnk
toys 0.0% 0.7% 0.8% 0.6%
vid  1.3% 0.0% 2.3% 1.8%
aut  1.2% 1.5% 0.0% 0.8%
hnk  0.8% 1.2% 0.6% 0.0%

f1 neg
     toys  vid   aut   hnk
toys 0.0% 6.1%  6.7%  4.0%
vid  9.2% 0.0% 21.6% 14.5%
aut  4.7% 7.4%  0.0%  2.4%
hnk  4.6% 7.5%  5.8%  0.0%


### Naive Bayes Baseline updated for CountVectorizer parameters to make them similar to SVM.

ie set min_df = 5, max_df = 0.8, stop_words = english in CountVectorizer.

In [14]:
#Converting reviews to sparse matrix of word ids with count vectorizer, and using Naive Bayes to make the prediction.
#This section also creates the count_vectorizer and Naive Bayes models for each domain to be used to test transfer learning

from sklearn.naive_bayes import MultinomialNB
dict_vectorizers = {} #Dict to store the count_vectorizer model developed on each domain
dict_train_ids = {} #Dict to store train data reviews as sparse matrix of word ids
dict_dev_ids = {} #Dict to store dev data reviews as sparse matrix of word ids
dict_nb = {} #Dict to store naive bayes model developed on each domain. Assumes input features are developed using the corresponding count_vectorizer
dict_dev_ypred = {} #Dict to store dev predictions
for key in list_df:
    
    #Converting ratings to tokenized word id counts as a sparse matrix using count_vectorizer
    dict_vectorizers[key] = CountVectorizer(min_df=5, max_df=0.8, stop_words='english')
    dict_train_ids[key] = dict_vectorizers[key].fit_transform(dict_train_df[key].reviewText)
    dict_dev_ids[key] = dict_vectorizers[key].transform(dict_dev_df[key].reviewText)
    print("Number words in training corpus for",key,len(dict_vectorizers[key].get_feature_names()))
    print(key,'dataset id shapes',dict_train_ids[key].shape, dict_dev_ids[key].shape)
    
    #Building a Naive Bayes model to predict the ratings
    dict_nb[key] = MultinomialNB()
    dict_nb[key].fit(dict_train_ids[key],dict_train_y[key])
    dict_dev_ypred[key] = dict_nb[key].predict(dict_dev_ids[key])
    acc = accuracy_score(dict_dev_y[key], dict_dev_ypred[key])
    print("Accuracy on",key,"dev set for binary prediction with toys naive bayes model: {:.02%}".format(acc))
    print('Corresponding classification report\n',classification_report(dict_dev_y[key], dict_dev_ypred[key]))

Number words in training corpus for toys 17651
toys dataset id shapes (100000, 17651) (30000, 17651)
Accuracy on toys dev set for binary prediction with toys naive bayes model: 91.59%
Corresponding classification report
              precision    recall  f1-score   support

        0.0       0.70      0.76      0.73      4465
        1.0       0.96      0.94      0.95     25535

avg / total       0.92      0.92      0.92     30000

Number words in training corpus for vid 25014
vid dataset id shapes (100000, 25014) (30000, 25014)
Accuracy on vid dev set for binary prediction with toys naive bayes model: 88.54%
Corresponding classification report
              precision    recall  f1-score   support

        0.0       0.68      0.73      0.70      5642
        1.0       0.94      0.92      0.93     24358

avg / total       0.89      0.89      0.89     30000

Number words in training corpus for aut 15599
aut dataset id shapes (100000, 15599) (30000, 15599)
Accuracy on aut dev set for bina

In [16]:
#Accuracy of transfer learning with Naive Bayes

dict_transfer_ids = {} #Dictionary to store the dev vector ids for dataframe A(df) using the count_vectorizer of dataframe B(vect)
transfer_results = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store accuracy on transfer. Col = Model, row = dataframe
transfer_results_f1_pos = pd.DataFrame(index=list_df,columns=list_df)
transfer_results_f1_neg = pd.DataFrame(index=list_df,columns=list_df)
for vectKey in list_df:
    dict_transfer_ids[vectKey] = {}
    #print('vectKey',vectKey)
    for dfKey in list_df:
        #print('dfKey',dfKey)
        dict_transfer_ids[vectKey][dfKey] = dict_vectorizers[vectKey].transform(dict_dev_df[dfKey].reviewText)
        #print(dfKey,'dataset using ',vectKey,' count vectorizer, id shapes',dict_transfer_ids[vectKey][dfKey].shape)
        dict_dev_ypred = dict_nb[vectKey].predict(dict_transfer_ids[vectKey][dfKey])
        acc = accuracy_score(dict_dev_y[dfKey], dict_dev_ypred)
        #print("Accuracy on ",dfKey," dev set for binary prediction with ", vectKey," naive bayes model: {:.02%}".format(acc))
        transfer_results[vectKey][dfKey] = acc
        
        

print("Effectiveness of transfer learning with Naive Bayes:")
print("Accuracy of rating predictions")
print("Colums = source domain, Rows = target domain\n")
print(transfer_results.to_string(float_format = '{:.01%}'.format))


Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 91.6% 91.0% 90.8% 91.3%
vid  86.0% 88.5% 88.1% 87.2%
aut  74.1% 78.5% 91.1% 82.6%
hnk  83.9% 85.5% 90.8% 90.8%


In [17]:
#Calculating and displaying as transfer loss with svm
transfer_loss = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store loss in accuracy on transfer. Col = Model, row = dataframe
for A in list_df:
    for B in list_df:
        transfer_loss[A][B] = transfer_results[B][B] - transfer_results[A][B]
print("Transfer loss on rating predictions with Naive Bayes")
print("Colums = source domain, Rows = target domain\n")
print(transfer_loss.to_string(float_format = '{:.01%}'.format))

Transfer loss on rating predictions with Naive Bayes
Colums = source domain, Rows = target domain

      toys   vid   aut  hnk
toys  0.0%  0.6%  0.8% 0.3%
vid   2.5%  0.0%  0.5% 1.3%
aut  17.0% 12.7%  0.0% 8.5%
hnk   6.9%  5.3% -0.0% 0.0%


### Calculating similarity / difference between domains

This section calculates similarity / distance based on two possible metrics: JS Divergence, Cosine Similarity.
In order to calculate this, we first created a word id index for all 4 datasets' reviews combined. Then we calculated the distribution of each domain on this integrated word id index to estimate the divergence.

In [45]:
#Create a function to calculate JS Divergence using two discrete distributions.
from scipy.stats import entropy
from scipy import spatial
#from scipy.sparse.linalg import norm
from numpy.linalg import norm

def JSD(P, Q):
   _P = P / norm(P, ord=1)
   _Q = Q / norm(Q, ord=1)
   _M = 0.5 * (_P + _Q)
   return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

In [41]:
#Create a vocabulary on the reviewText of all dataframes for the sake of comparing their distributions on the same baseline.
all_df_reviews = pd.DataFrame(columns = dict_train_df[list_df[0]].columns)
for key in list_df:
    #print(dict_train_df[key].shape)
    all_df_reviews = pd.concat([dict_train_df[key],all_df_reviews])
print(all_df_reviews.shape)
#print(type(all_df_reviews))
#print(all_df_reviews.columns)

all_vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words='english')
all_ids = all_vectorizer.fit_transform(all_df_reviews.reviewText)
print("Number words in training corpus for",key,len(all_vectorizer.get_feature_names()))

#Create a word if distribution of each df on the integrated vocabulary ids.
dict_allVocab_ids = {}
for key in list_df:
    dict_allVocab_ids[key] = all_vectorizer.transform(dict_train_df[key].reviewText)
    print(key,dict_allVocab_ids[key].shape)

(400000, 9)
Number words in training corpus for hnk 41262
toys (100000, 41262)
vid (100000, 41262)
aut (100000, 41262)
hnk (100000, 41262)


In [50]:
JSD_results = pd.DataFrame(index=list_df,columns=list_df)
cosine_results = pd.DataFrame(index=list_df,columns=list_df)
for key1 in list_df:
   for key2 in list_df:
       dict_train_ids_1 = dict_allVocab_ids[key1].sum(axis=0).T
       dict_train_ids_2 = dict_allVocab_ids[key2].sum(axis=0).T
       #print(dict_allVocab_ids[key1].shape,dict_train_ids_1.shape,dict_train_ids_2.shape)
       JSD_results[key1][key2] = JSD(dict_train_ids_1,dict_train_ids_2)
       cosine_results[key1][key2] = spatial.distance.cosine(dict_train_ids_1,dict_train_ids_2)
       
print('JS Divergence')
print(JSD_results)
print('\nCosine Distance')
print(cosine_results)

JS Divergence
                  toys               vid               aut               hnk
toys             [0.0]  [0.121900206869]  [0.149253766723]  [0.129181255531]
vid   [0.121900206869]             [0.0]  [0.197747372379]  [0.200978538535]
aut   [0.149253766723]  [0.197747372379]             [0.0]  [0.118183647432]
hnk   [0.129181255531]  [0.200978538535]  [0.118183647432]             [0.0]

Cosine Distance
          toys       vid          aut          hnk
toys         0  0.345511     0.287423     0.236629
vid   0.345511         0     0.547328     0.526993
aut   0.287423  0.547328  2.22045e-16     0.144065
hnk   0.236629  0.526993     0.144065 -2.22045e-16


In [44]:
cosine_results = pd.DataFrame(index=list_df,columns=list_df)
from scipy import spatial
dict_train_ids_1 = dict_allVocab_ids['toys'].sum(axis=0).T
dict_train_ids_2 = dict_allVocab_ids['aut'].sum(axis=0).T
result = spatial.distance.cosine(dict_train_ids_1,dict_train_ids_2)
print(result)

0.287422911472


### Selected results from past test runs

#### Impact of sample size on accuracy (analysis done with Naive Bayes model, on toys dataset)

Toys data set only, with bottoms up word id creation using process similar to assignment 2.
Impact of changing size of train data set with Naive Bayes.

With number in train set = 10000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 88.74%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 67.16%
    Vocab Size : 38696
    
With number in train set = 50000 (excl 3 ratings)   
    Accuracy on dev set for binary prediction: 91.33%   
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 69.33% 
    Vocab Size : ~ ..
    
With number in train set = 100000 (excl 3 ratings)
    Accuracy on dev set for binary prediction: 91.56%   
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.42%
    Vocab Size : 105304

With number in train set = 500000, dev set = 150000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 91.73%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.95%
    vocab size 307822
    
With number in train set = 1200000, dev set = 360000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 91.92%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 71.24%
    vocab size 674074 (not repeated with correction for vocab)
    
#### Conclusion: There isn't a material increase in accuracy with Naive Bayes after 100000 data points in the train set.
    
### Output from trying different pre-processing with the toys review set.
 
 Accuracy on dev set for binary prediction: 91.69%
classification report naive bayes binary classification 
              precision    recall  f1-score   support

        0.0       0.70      0.77      0.74     22472
        1.0       0.96      0.94      0.95    127528

avg / total       0.92      0.92      0.92    150000

Accuracy on dev set for binary prediction with count vectorizer (no min, max df,stop words set): 91.92%
classification report naive bayes binary classification with count vectorizer 
              precision    recall  f1-score   support

        0.0       0.71      0.79      0.75     22472
        1.0       0.96      0.94      0.95    127528

avg / total       0.92      0.92      0.92    150000

Accuracy on dev set for binary prediction with tfidf (no min, max df,stop words set): 90.13%
classification report naive bayes binary classification with tfidf 
              precision    recall  f1-score   support

        0.0       0.90      0.38      0.54     22472
        1.0       0.90      0.99      0.94    127528

avg / total       0.90      0.90      0.88    150000

Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.91%
classification report naive bayes multinomial classification with tfidf 
              precision    recall  f1-score   support

          1       0.60      0.74      0.66     13975
          2       0.32      0.05      0.09      8497
          4       0.42      0.34      0.37     29733
          5       0.80      0.87      0.83     97795

avg / total       0.68      0.71      0.68    150000

#### Conclusion: CountVectorizer pre-prcocessing gives the best results with Naive Bayes.
#### Also note that the accuracy for 4 level (1,2,4,5) prediction is much worse than for binary prediction.


### Past test runs Detailed Results for transfer learning from toys to video games
number words in training corpus for toys: 63984    
toys dataset id shapes (100000, 63984) (30000, 63984)    
number words in training corpus for video games: 98899    
videos dataset id shapes (100000, 98899) (30000, 98899)    
number words in training corpus for automobiles: 59468    
automobile dataset id shapes (100000, 59468) (30000, 59468)    
number words in training corpus for home and kitchen: 57884    
home and kitchen dataset id shapes (100000, 57884) (30000, 57884)    

Accuracy on toys dev set for binary prediction with toys naive bayes model: 92.23%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.74      0.74      0.74      4503
        1.0       0.95      0.95      0.95     25497

avg / total       0.92      0.92      0.92     30000

Accuracy on video games dev set for binary prediction with video games naive bayes model: 89.16%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.72      0.71      0.71      5725
        1.0       0.93      0.93      0.93     24275

avg / total       0.89      0.89      0.89     30000

Accuracy on autos dev set for binary prediction with autos naive bayes model: 91.93%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.78      0.61      0.69      4323
        1.0       0.94      0.97      0.95     25677

avg / total       0.91      0.92      0.92     30000

Accuracy on home and kitchen dev set for binary prediction with home and kitchen naive bayes model: 91.37%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.76      0.71      0.73      5072
        1.0       0.94      0.96      0.95     24928

avg / total       0.91      0.91      0.91     30000

### Transfer learning:

Accuracy on video games dev set for binary prediction with toys naive bayes model: 86.99%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.66      0.65      0.66      5725
        1.0       0.92      0.92      0.92     24275

avg / total       0.87      0.87      0.87     30000

Accuracy on automobiles dev set for binary prediction with toys naive bayes model: 76.06%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.36      0.88      0.51      4323
        1.0       0.97      0.74      0.84     25677

avg / total       0.88      0.76      0.79     30000

Accuracy on home and kitchen dev set for binary prediction with toys naive bayes model: 85.78%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.55      0.85      0.67      5072
        1.0       0.97      0.86      0.91     24928

avg / total       0.90      0.86      0.87     30000

Accuracy on toys dev set for binary prediction with video games naive bayes model: 91.53%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.76      0.63      0.69      4503
        1.0       0.94      0.97      0.95     25497

avg / total       0.91      0.92      0.91     30000

Accuracy on automobiles dev set for binary prediction with video games naive bayes model: 80.50%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.41      0.77      0.53      4323
        1.0       0.96      0.81      0.88     25677

avg / total       0.88      0.81      0.83     30000