In [1]:
import numpy as np
import pandas as pd

import scipy
import re
import string
from collections import Counter

import gzip

In [2]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn import metrics

from imblearn.under_sampling import RandomUnderSampler

from nltk import word_tokenize          
from nltk import pos_tag
# from nltk.corpus import stopwords

  from numpy.core.umath_tests import inner1d


In [3]:
# import sys 
# import tensorflow as tf
# from tensorflow import keras
# print ('Is GPU available ? ', tf.test.is_gpu_available())


# print(sys.version)

- reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin - ID of the product, e.g. 0000013714
- reviewerName - name of the reviewer
- helpful - helpfulness rating of the review, e.g. 2/3
- reviewText - text of the review
- overall - rating of the product
- summary - summary of the review
- unixReviewTime - time of the review (unix time)
- reviewTime - time of the review (raw)

### *"just based on text"*
- does that mean just use 'reviewText'? or including 'summary' as well?
    - 'reviewerID'; how to check bias, scoring standards (, and maybe fake/multiple accounts?)
    - 'reviewerName; ???
    - 'asin'/product; group by products, price (from another file, not public)
    - 'helpful'; check 'helpful' and 'overall' (target) relationship
        - 'helpful' score will be lower for lower than the 'asin's average 'overall' scores
        
#### Will just be doing 'reviewText' and 'summary', see which translates to a better score predictor

In [4]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path, size=False):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
    if i == 500:
        if size==True:
            break
        
    
  return pd.DataFrame.from_dict(df, orient='index')


In [5]:
def undersample(X, y, undersampler):
    
    # undersampling using index for X
    X_index = pd.DataFrame(X.index.values)
    
    X_res, y_res = undersampler.fit_sample(X_index, y)
    
    
    # creating X using index(=X_res)
    res_ind = [ind for line in X_res for ind in line]

    X_res1 = X.loc[res_ind, :]
    
    
    return X_res1, y_res

In [6]:
def get_split_data():
    
    df = getDF('reviews_Movies_and_TV_5.json.gz')
    
    y = df.overall
    X = df[['reviewText', 'summary']]

    # taking out punctuations
    X.loc[:, 'reviewText'] = X.reviewText.apply(lambda x: re.sub(r'[^\w\s]','', x))
    X.loc[:, 'summary'] = X.summary.apply(lambda x: re.sub(r'[^\w\s]','', x))
    
    # holdout set
    X_df, X_holdout, y_df, y_holdout = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)
    
    
    # undersampling, using index for X
    us = RandomUnderSampler(ratio='not minority', random_state=42, )  

    X_res, y_res = undersample(X_df, y_df, us)
    
    
    
    
    # train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=.3, stratify=y_res, random_state=42)
    
    y_bin_train = np.array(list(map(lambda x: 1 if x>=4 else 0, y_train)))
    y_bin_test = np.array(list(map(lambda x: 1 if x>=4 else 0, y_test)))
    y_bin_holdout = np.array(list(map(lambda x: 1 if x>=4 else 0, y_holdout)))
    
    
    X_dict = {'train':X_train, 'test':X_test, 'holdout':X_holdout}
    
    y_dict = {'train':y_train, 'test':y_test, 'holdout':y_holdout,
              'bin_train':y_bin_train, 'bin_test':y_bin_test, 'bin_holdout':y_bin_holdout}
    
    return X_dict, y_dict

In [7]:
hash_mnb = Pipeline([('count', HashingVectorizer(stop_words='english', alternate_sign=False, n_features=2**18)), 
                     ('mnb', MultinomialNB())])

hash_bnb = Pipeline([('binary', HashingVectorizer(stop_words='english', alternate_sign=False, n_features=2**18, binary=True)), 
                     ('bnb', BernoulliNB())])

hash_lr = Pipeline([('count', HashingVectorizer(stop_words='english', alternate_sign=False, n_features=2**18)), 
                     ('lr', LogisticRegression(penalty='l2', solver='sag', multi_class='multinomial',
                                                random_state=42, tol=0.0002))])


tfidf_mnb = Pipeline([('tfidf', TfidfVectorizer(stop_words='english', sublinear_tf=True)),
                       ('tfidf_mnb', MultinomialNB())])

tfidf_lr = Pipeline([('tfidf', TfidfVectorizer(stop_words='english', sublinear_tf=True)),
                       ('tfidf_lr', MultinomialNB())])

In [8]:
class Bunch(object):
  def __init__(self, adict):
    self.__dict__.update(adict)

In [9]:
def models_basic(models, X_dict, y_dict, i=1, review='reviewText'):
    
    X = Bunch(X_dict)
    y = Bunch(y_dict)
    
    for model in models:
        
        print('----'*4, model.steps[i][0].upper(), '----'*4)
        model2 = clone(model)
        model.fit(X.train[review], y.train)

        print('\n')
        print('----'*9)
        print('Test set')

        y_ = model.predict(X.test[review])
        y_bin_ = np.array(list(map(lambda x: 1 if x>=4 else 0, y_)))

        print('Raw prediction score: ', model.score(X.test[review], y.test))
        print('Raw-to-good/bad score: ', np.mean(y_bin_ == y.bin_test) )

        model2.fit(X.train[review], y.bin_train)

        print('Good/bad prediction score: ', model2.score(X.test[review], y.bin_test))

        print('\n')
        print('----'*9)
        print('Holdout set')

        y_h_ = model.predict(X.holdout[review])
        y_h_bin_ = np.array(list(map(lambda x: 1 if x>=4 else 0, y_h_)))

        print('Raw prediction score: ', model.score(X.holdout[review], y.holdout))
        print('Raw-to-good/bad score: ', np.mean(y_h_bin_ == y.bin_holdout) )
        print('Good/bad prediction score: ', model2.score(X.holdout[review], y.bin_holdout))
        print('\n\n')
    


***

In [10]:
models = [hash_mnb, hash_bnb, hash_lr, tfidf_mnb, tfidf_lr]

In [11]:
X_dict, y_dict = get_split_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [12]:
models_basic(models, X_dict, y_dict)

---------------- MNB ----------------


------------------------------------
Test set
Raw prediction score:  0.46835534595744066
Raw-to-good/bad score:  0.7974886802551825
Good/bad prediction score:  0.7169123497330999


------------------------------------
Holdout set
Raw prediction score:  0.4646370630441698
Raw-to-good/bad score:  0.7783910835679866
Good/bad prediction score:  0.522349988807333



---------------- BNB ----------------


------------------------------------
Test set
Raw prediction score:  0.39037568533279327
Raw-to-good/bad score:  0.7193281930360061
Good/bad prediction score:  0.7023738915329756


------------------------------------
Holdout set
Raw prediction score:  0.5154399896320558
Raw-to-good/bad score:  0.8023846271663702
Good/bad prediction score:  0.7850477750156108



---------------- LR ----------------


------------------------------------
Test set
Raw prediction score:  0.5094463812981903
Raw-to-good/bad score:  0.8137486076352222
Good/bad prediction s

In [None]:
# df = getDF('reviews_Movies_and_TV_5.json.gz', size=True)
# X = df[['summary']]
# y = df.overall
# us = RandomUnderSampler(ratio='not minority', random_state=42, )  

# X_res, y_res = us.fit_sample(X, y)
# tfidf_mnb.fit(X['summary'], y)