In [1]:
import boto3
import json
import pandas as pd
import numpy as np
import gensim
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re
np.random.seed(0)
import io
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle
from scipy.stats import randint as sp_randint
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB



In [2]:
## Importing the dataset from an S3 storage location
# create the s3 client (credentials are resolved by boto3's default chain; none are hard-coded here)
s3 = boto3.client('s3')

# location of the review CSV on S3
bucket='yelpreviewsdata' # put your S3 bucket name here
prefix = 'data'
obj = s3.get_object(Bucket=bucket, Key=f'{prefix}/yelp_review-0000.csv')
# the whole object body is read into memory and parsed from an in-memory buffer
df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding="iso-8859-15", low_memory=True)
# keep only the attributes needed for the classifier; take an explicit copy so the
# column assignment below writes to an independent frame instead of a slice of the
# raw frame (which triggers pandas' chained-assignment warning)
df = df[['stars', 'text']].copy()
# binarizing the sentiment to 0s and 1s
# NOTE(review): with this threshold only 1-star reviews become class 0 and 2-star
# reviews are labelled positive -- confirm this is the intended cut-off.
df['stars'] = np.where(df['stars'] >= 2, 1, 0)
print(df.head())
print(df.shape)


   stars                                               text
0      1  Super simple place but amazing nonetheless. It...
1      1  Small unassuming place that changes their menu...
2      1  Lester's is located in a beautiful neighborhoo...
3      1  Love coming here. Yes the place always needs t...
4      1  Had their chocolate almond croissant and it wa...
(296227, 2)


In [3]:
# sampling from the dataset due to limitations preventing us from vectorizing the full set
def sampling_dataset(df, count=150000):
    """Draw a class-balanced random sample of ``df``.

    For every distinct value in ``df.stars``, ``count`` row labels are drawn
    *with replacement* from the rows of that class, so classes with fewer than
    ``count`` rows are up-sampled (rows repeat) and larger classes are
    down-sampled.  Draws use the global NumPy RNG (``np.random.choice``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a ``stars`` column.
    count : int, default 150000
        Number of rows to draw per class.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of every class stacked in order of first appearance
        of the class.  Original index labels are kept (and may repeat); the
        caller is expected to reset the index.  An empty frame with columns
        ``["stars", "text"]`` is returned when ``df`` has no rows.
    """
    samples = []
    for c in df.stars.unique():
        class_indexes = df[df.stars == c].index
        random_indexes = np.random.choice(class_indexes, count, replace=True)
        samples.append(df.loc[random_indexes])

    if not samples:
        # pd.concat refuses an empty list; mirror the schema of the normal result.
        return pd.DataFrame(columns=["stars", "text"])

    # Concatenate once: avoids repeatedly copying the growing frame and keeps the
    # original column dtypes instead of degrading them to object.
    return pd.concat(samples, axis=0)

# Replace the full frame with the balanced sample and renumber the rows 0..n-1;
# the sampled rows keep their original (repeating) labels until the index is reset.
df = sampling_dataset(df).reset_index(drop=True)

In [4]:
# Labelling the review text so it can be fed into the Doc2Vec model.
# NOTE(review): `lmtzr` is not referenced by any other cell visible in this
# notebook -- confirm whether lemmatization was meant to be applied to the tokens.
lmtzr = WordNetLemmatizer()
# Word tokenizer: runs of word characters.  Written as a raw string so that "\w"
# reaches the regex engine verbatim instead of being parsed as an (invalid)
# string escape sequence.
w = re.compile(r"\w+", re.I)

def label_sentences(df):
    """Turn every row of ``df`` into a tagged document for Doc2Vec.

    Each review text is lower-cased and split into word tokens with the
    module-level regex ``w``; the document is tagged ``'SENT_<index label>'``
    so its vector can be looked up again by the row's index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    list of LabeledSentence
        One entry per row, in row order.  Rows whose text is missing produce a
        document with an empty word list, so every row still gets a tag.
    """
    labeled_sentences = []
    # Iterate over labels and texts directly; building a Series per row with
    # iterrows() is needlessly slow for hundreds of thousands of reviews.
    for index, text in zip(df.index, df["text"]):
        if text is None or (isinstance(text, float) and text != text):
            # Missing value (None / NaN, e.g. an empty CSV cell): no .lower() available.
            text = ""
        elif not isinstance(text, str):
            text = str(text)
        tokenized_words = w.findall(text.lower())
        labeled_sentences.append(LabeledSentence(words=tokenized_words, tags=['SENT_%s' % index]))
    return labeled_sentences

def train_doc2vec_model(labeled_sentences):
    """Build and train a Doc2Vec model on the given tagged documents.

    The vocabulary is built from ``labeled_sentences`` and the model is then
    trained on the same documents for the number of iterations it was
    configured with.  Returns the trained model.
    """
    # NOTE(review): min_alpha is set equal to alpha, which presumably keeps the
    # learning rate constant over all epochs -- confirm this is intentional.
    hyperparams = dict(
        size=300,
        window=10,
        min_count=5,
        workers=11,
        alpha=0.025,
        min_alpha=0.025,
        iter=20,
    )
    d2v = gensim.models.Doc2Vec(**hyperparams)
    d2v.build_vocab(labeled_sentences)
    d2v.train(
        labeled_sentences,
        total_examples=d2v.corpus_count,
        epochs=d2v.iter,
    )
    return d2v

# Convert every sampled review into a tagged document ('SENT_<row label>').
sen = label_sentences(df)
# Train the Doc2Vec model on those documents; `model` is used by the following
# cell to look up one vector per review.
model = train_doc2vec_model(sen)




In [5]:
#pre-defined function to vectorize the text provided for each rating using the doc2vec model trained earlier
def vectorize_ratings(df,d2v_model):
    """Attach the trained document vector of every review to ``df``.

    Looks up ``d2v_model.docvecs['SENT_<index label>']`` for each row and
    stores the results in a new ``vectorized_ratings`` column.  The lookup uses
    the row's index label (the same key the documents were tagged with) rather
    than the row position, so it also works when the index is not 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels match the document tags.  Modified in place.
    d2v_model
        Object exposing a ``docvecs`` mapping from tag to vector.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the added column.

    Raises
    ------
    KeyError
        If a row has no matching tag in ``d2v_model.docvecs``.
    """
    df['vectorized_ratings'] = [d2v_model.docvecs['SENT_%s' % label] for label in df.index]

    return df

# Add the 'vectorized_ratings' column (one Doc2Vec vector per review) and show
# the first five rows as a sanity check.
df = vectorize_ratings(df,model)
print (df.head(5))

  stars                                               text  \
0     1  The VIP has one of the best chicken wings in t...   
1     1  We so wanted this new restaurant and brew hous...   
2     1  I wasn't sure if I'd like Pilates on the refor...   
3     1  This place is awesome. Lots of great rolls at ...   
4     1  Super delicious Hainanese chicken and rice joi...   

                                  vectorized_ratings  
0  [0.7370991, 0.1282141, 0.88223875, 0.7988065, ...  
1  [0.4672933, -0.61399937, 0.10503226, 0.8451672...  
2  [0.72146267, 0.8445171, -1.6559016, 1.0426472,...  
3  [-0.16004893, 0.5494206, 0.1510869, 1.3756956,...  
4  [-0.43464002, 0.74114853, -0.5584688, 0.765369...  


In [6]:
# Hold out 2% of the sampled reviews as a test set (a single hold-out split, not
# cross-validation).  Uses train_test_split from sklearn.model_selection, which is
# already imported above; sklearn.cross_validation is the deprecated location.
# NOTE(review): the sample was drawn with replacement, so identical reviews may
# presumably land in both the train and the test partition -- confirm whether
# that leakage is acceptable for the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    df["vectorized_ratings"].tolist(), df["stars"], test_size=0.02, random_state=17)
# make sure the labels are integer-typed for the classifiers and metrics below
y_train = y_train.astype('int')
y_test = y_test.astype('int')


In [7]:
# Baseline: logistic regression on the document vectors, scored by the fraction
# of held-out reviews whose predicted label matches the true label.
logreg = linear_model.LogisticRegression().fit(X_train, y_train)
preds = logreg.predict(X_test)
print("Logistic Classifier score", (preds == y_test).mean())

Logistic Classifier score 0.7076666666666667


In [8]:
def train_classifier(X,y):
    """Grid-search a random forest over a small hyper-parameter grid.

    Fits ``GridSearchCV`` (4-fold) around a ``RandomForestClassifier`` using
    4 parallel jobs, trying 200 and 400 trees with ``min_samples_split=2`` and
    ``min_samples_leaf=1``.

    Parameters
    ----------
    X : array-like
        Feature vectors.
    y : array-like
        Class labels.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # The previously defined-but-unused `bootstrap` local was dropped; it never
    # reached the grid, so the searched candidates are unchanged.
    parameters = {'n_estimators': [200,400], 'min_samples_leaf': [1],
                  'min_samples_split': [2]}

    clf = GridSearchCV(RFC(verbose=1,n_jobs=4), cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf


In [9]:
# Randomized search for model selection: `clf` is the unfitted decision tree
# that is handed to RandomizedSearchCV further down in this cell.
clf = tree.DecisionTreeClassifier()
# Utility function to report best scores
def report(results, n_top=5):
    """Print the ``n_top`` best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Search results with the keys ``'rank_test_score'``,
        ``'mean_test_score'``, ``'std_test_score'`` and ``'params'`` (the
        layout of a search object's ``cv_results_``).
    n_top : int, default 5
        Highest rank to report.  Every candidate sharing a rank is printed.

    The score printed is the mean cross-validated test score and its standard
    deviation; it is labelled "validation score" (it is not an out-of-bag
    estimate).
    """
    # Coerce once so plain lists work too (list == int would not broadcast).
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
# (lists are sampled uniformly; sp_randint(low, high) draws integers with `high` excluded)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 4),
              "min_samples_leaf": sp_randint(1, 5),
              "criterion": ["gini", "entropy"]}


# run randomized search
# NOTE(review): no random_state is passed, so the sampled candidates presumably
# depend on the global NumPy RNG state at the time this cell runs -- confirm.
n_iter_search = 30
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

random_search.fit(X_train, y_train)
#this might take a minute to run
print("RandomizedSearchCV examined %d candidate parameter settings." % (n_iter_search))
# print the top-ranked candidates with their mean/std test scores
report(random_search.cv_results_)



RandomizedSearchCV examined 30 candidate parameter settings.
Model with rank: 1
Mean OOB score: 0.583 (std: 0.001)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1}

Model with rank: 2
Mean OOB score: 0.579 (std: 0.001)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 2, 'min_samples_leaf': 1}

Model with rank: 3
Mean OOB score: 0.579 (std: 0.001)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 2, 'min_samples_leaf': 1}

Model with rank: 4
Mean OOB score: 0.578 (std: 0.001)
Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1}

Model with rank: 5
Mean OOB score: 0.578 (std: 0.002)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 2, 'min_samples_leaf': 1}



In [10]:
# Refit a single decision tree with the top-ranked settings from the randomized
# search above, then evaluate it on the held-out reviews.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=3,
    min_samples_leaf=1,
)
clf = clf.fit(X_train, y_train)

# hard class predictions and per-class probabilities for the test set
y_pred = clf.predict(X_test)
y_scores = clf.predict_proba(X_test)

# confusion matrix as a cross-tabulation of true vs. predicted labels, with totals
print ('\nconfusion matrix')
print(pd.crosstab(
    y_test,
    y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
))

# ROC curve from the probability of class 1, and the area under it
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test, y_scores[:,1]
)
print('\nauc score '+str(auc(false_positive_rate, true_positive_rate)))

# TODO: show a trade-off curve for precision vs recall (not implemented in this cell)


confusion matrix
Predicted     0     1   All
True                       
0          1847  1167  3014
1          1178  1808  2986
All        3025  2975  6000

auc score 0.6091495992579394


In [11]:
# Gaussian Naive Bayes on the same split; the score is the fraction of held-out
# reviews classified correctly.
gnb = GaussianNB().fit(X_train, y_train)
preds = gnb.predict(X_test)
print("Gaussian NB score", (preds == y_test).mean())

Gaussian NB score 0.5608333333333333


In [12]:
# Bernoulli Naive Bayes on the same split; the score is the fraction of held-out
# reviews classified correctly.
bnb = BernoulliNB().fit(X_train, y_train)
preds = bnb.predict(X_test)
print("Bernoulli NB score", (preds == y_test).mean())

Bernoulli NB score 0.6348333333333334


In [13]:
# References:
# Alexander Andrews, Content Based Text Classification with Doc2Vec and TensorFlow, https://blog.francium.tech/content-based-text-classification-with-doc2vec-and-tensorflow-efd1dd4f02a8
# Tushar Joshi, Sentiment Classification in Doc2Vec, https://www.kaggle.com/tj2552/sentiment-classification-in-5-classes-doc2vec
# David Batista, Document Classification, http://www.davidsbatista.net/blog/2017/04/01/document_classification/