In [4]:
import os 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
#os.getcwd()
# Work from the directory that holds yelp.csv so the relative read below resolves.
# The location can be overridden with the YELP_DATA_DIR environment variable;
# when it is unset the original hard-coded path is used, so behaviour is unchanged.
YELP_DATA_DIR = os.environ.get(
    "YELP_DATA_DIR", "C:\\Users\\Mommy\\Desktop\\Self_Learning\\yelp"
)
os.chdir(YELP_DATA_DIR)

In [8]:
# Load the reviews from yelp.csv, resolved relative to the current working directory.
yelp_data = pd.read_csv("yelp.csv")

In [9]:
# Inspect the dimensions (rows, columns) of the loaded frame.
#yelp_data.head(10)
yelp_data.shape

(10000, 10)

In [10]:
# List the column labels of the loaded frame.
yelp_data.columns

Index([u'business_id', u'date', u'review_id', u'stars', u'text', u'type',
       u'user_id', u'cool', u'useful', u'funny'],
      dtype='object')

In [137]:
#yelp_data.head(10)

In [11]:
# Length of the 'stars' column (one entry per row of yelp_data).
yelp_data.stars.shape

(10000L,)

In [12]:
# Binary task: only 1-star and 5-star reviews are kept, as the two classes.
# This cell selects the rows whose rating is exactly 1.
yelp_data1 = yelp_data.loc[yelp_data['stars'].eq(1)]

In [13]:
# Rows whose rating is exactly 5 (the second class of the binary task).
yelp_data2 = yelp_data.loc[yelp_data['stars'].eq(5)]

In [14]:
# Size of the 1-star subset (rows, columns).
yelp_data1.shape

(749, 10)

In [15]:
# Size of the 5-star subset (rows, columns).
yelp_data2.shape

(3337, 10)

In [16]:
# Stack the 1-star and 5-star subsets into one frame (749 + 3337 rows expected).
# pd.merge(..., on='stars') is an inner join on the rating, and the two subsets
# share no rating value, so it yields no rows; row-wise concatenation is what is
# needed here. The original index labels are kept.
yelp_final = pd.concat([yelp_data1, yelp_data2])

In [17]:
# Expected row count of the combined subset: 5-star rows plus 1-star rows.
3337+749

4086

In [18]:
# Keep only the rows rated 1 or 5, selected directly with a boolean mask.
# Applying the mask once to the whole frame avoids re-filtering every column
# separately through DataFrame.apply, and leaves row order and index unchanged.
yelp_final = yelp_data[yelp_data['stars'].isin([1, 5])]

In [138]:
# Next, yelp_final can be reduced to just two columns: the review text and the stars.
# yelp_final.head(10)

In [19]:
# Two-column view of the filtered reviews: the rating and the review text.
yelp_final2 = yelp_final.loc[:, ['stars', 'text']]

In [20]:
# Split the 1-star / 5-star subset into training and testing sets.
# X holds the review text and Y the star rating, both as pandas Series.
X = yelp_final.text
Y = yelp_final.stars

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only when running on a pre-0.18 installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# random_state pins the shuffle so the split is reproducible. No test_size is
# given; the recorded run put 3064 rows in train and 1022 in test.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)



In [21]:
# Check the sizes of the train and test sets, starting with the training text.
X_train.shape

(3064L,)

In [22]:
# Training labels: the length should match X_train.
Y_train.shape

(3064L,)

In [56]:
# Number of held-out review texts.
X_test.shape

(1022L,)

In [23]:
# Held-out labels: the length should match X_test.
Y_test.shape

(1022L,)

In [24]:
# Confirm that the training text is a pandas Series.
type(X_train)

pandas.core.series.Series

In [25]:
# Same check for the held-out text.
type(X_test)

pandas.core.series.Series

In [26]:
# Import the CountVectorizer class from scikit-learn.
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the vectorizer with its default settings.
count_vec = CountVectorizer()

In [27]:
# The vectorizer first has to learn the vocabulary of the training reviews.
# Once fitted, it can build a document-term matrix: one row per review, one
# column per vocabulary term, each cell counting that term's occurrences.
# Only the training text is used here, so the test reviews do not influence
# the vocabulary.

count_vec.fit(X_train)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [139]:
# get feature names or the trained vocab of the review section : 
# count_vec.get_feature_names()

In [28]:
# Build the document-term matrices by calling transform on the fitted
# vectorizer. Both splits are transformed with the vocabulary learned from
# the training text.

yelp_dtm_train =  count_vec.transform(X_train)
yelp_dtm_test = count_vec.transform(X_test)
#help(count_vec.transform)

In [29]:
# Dense view of the training document-term matrix: one column per vocabulary
# term, one row per training review. pandas is already imported in the first
# cell of the notebook, so the repeated import is dropped.
# NOTE: .toarray() materialises every zero of the sparse matrix, so this frame
# is large; it is kept because the model-fitting cells read yelp_vect_df_train.
yelp_vect_df_train = pd.DataFrame(yelp_dtm_train.toarray(), columns=count_vec.get_feature_names())
yelp_vect_df_train.head(3)

Unnamed: 0,00,000,00a,00am,00pm,01,02,03,03342,04,...,zucchini,zuchinni,zumba,zupa,zuzu,zwiebel,zzed,éclairs,école,ém
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Vectorise the held-out reviews with the already-fitted vectorizer.
yelp_dtm_test = count_vec.transform(X_test)

# Dense frame: columns are the trained vocabulary, rows are the test reviews.
yelp_vect_df_test = pd.DataFrame(
    data=yelp_dtm_test.toarray(),
    columns=count_vec.get_feature_names(),
)

# (rows, columns) of the test frame
yelp_vect_df_test.shape

(1022, 16825)

In [96]:
# Now ML begins !!!

In [31]:
# The document-term matrix gives an integer feature representation of each
# review. It is used, together with the star rating, to train a classifier.

from sklearn.naive_bayes import MultinomialNB

# Instantiate the model with default settings.
mult_nb = MultinomialNB()

# Fit on the training features and their star labels.
mult_nb.fit(yelp_vect_df_train,Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# Predict the star class for every review in the held-out test frame.
Y_pred_mult_nb = mult_nb.predict(yelp_vect_df_test)

In [33]:
# Show the predicted labels (the recorded output contains only 1s and 5s).
print(Y_pred_mult_nb)

[5 5 5 ... 5 1 5]


In [35]:
# Accuracy of the Naive Bayes predictions on the held-out test set.
from sklearn import metrics

metrics.accuracy_score(Y_test,Y_pred_mult_nb)

# Recorded result: 0.9187866927592955

0.9187866927592955

In [36]:
# Confusion matrix for the Naive Bayes predictions on the test set.
# In the recorded output the row sums (184 and 838) equal the per-class support
# for stars 1 and 5, so rows are the true classes and columns the predictions.
# (The ROC AUC score is not computed in this cell.)

metrics.confusion_matrix(Y_test,Y_pred_mult_nb)

array([[126,  58],
       [ 25, 813]], dtype=int64)

In [106]:
#help(metrics.classification_report)
#metrics.classification_report(Y_test,Y_pred_mult_nb)

u'             precision    recall  f1-score   support\n\n          1       0.83      0.68      0.75       184\n          5       0.93      0.97      0.95       838\n\navg / total       0.92      0.92      0.92      1022\n'

In [None]:
# get the roc_auc score : 
# to get the roc_auc score I need the predicted class probabilities, which
# predict_proba computes from the test feature matrix (not from the predicted labels) : 

#Y_pred_mult_nb_prob = mult_nb.predict_proba(yelp_vect_df_test)

In [37]:
# Second model on the same features: logistic regression.
from sklearn.linear_model import LogisticRegression

# Instantiate the model with default settings.
logreg = LogisticRegression()

# Fit on the dense training document-term frame; %time reports the wall time.
%time logreg.fit(yelp_vect_df_train,Y_train)

Wall time: 670 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
# Predict the star class for every review in the held-out test frame.
Y_pred_logreg = logreg.predict(yelp_vect_df_test)

In [39]:
# Accuracy of the logistic regression predictions on the held-out test set.
metrics.accuracy_score(Y_test,Y_pred_logreg)

# Recorded result: 0.9256360078277887

0.9256360078277887

In [40]:
# Confusion matrix for logistic regression. By accuracy it does slightly
# better than Naive Bayes on this split (0.9256 vs 0.9188 in the recorded runs).
metrics.confusion_matrix(Y_test,Y_pred_logreg)

array([[140,  44],
       [ 32, 806]], dtype=int64)

In [113]:
#5-class classification problem

#Define X and y using the original DataFrame. (y should contain 5 different classes.)
#Split X and y into training and testing sets.
#Create document-term matrices using CountVectorizer.
#Calculate the testing accuracy of a Multinomial Naive Bayes model.
#Compare the testing accuracy with the null accuracy, and comment on the results.
#Print the confusion matrix, and comment on the results. (This Stack Overflow answer explains how to read a multi-class confusion matrix.)
#Print the classification report, and comment on the results. 
#If you are unfamiliar with the terminology it uses, research the terms, and then try to figure out how to 
#calculate these metrics manually from the confusion matrix!

In [114]:
# Column labels of the filtered 1-star / 5-star frame.
yelp_final.columns

Index([u'business_id', u'date', u'review_id', u'stars', u'text', u'type',
       u'user_id', u'cool', u'useful', u'funny'],
      dtype='object')

In [3]:
# 5-class task: define X and Y from the full, unfiltered frame.
# This rebinds the X and Y names used for the 1-star / 5-star task.
# NOTE(review): the recorded NameError shows this cell was executed as In [3],
# before yelp_data had been loaded; the loading cells must run first.
X = yelp_data.text
Y = yelp_data.stars

NameError: name 'yelp_data' is not defined

In [116]:
# Split the 5-class data into train and test sets with the same fixed seed.
# This rebinds X_train / X_test / Y_train / Y_test from the binary task and
# relies on train_test_split having been imported in an earlier cell.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state=1)

In [117]:
# A separate vectorizer for the 5-class task, so that its vocabulary is learned
# from the new training split rather than reused from the binary task.
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the new vectorizer with default settings.
countvec_new = CountVectorizer()

# Learn the vocabulary from the training text only.
countvec_new.fit(X_train)   

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [127]:
# get the feature names 
#countvec_new.get_feature_names()

# Listing every feature name takes a lot of memory, so the call above and its output are commented out. 


In [120]:
# Build the document-term matrix for the training text with the fitted
# vectorizer (the test text is transformed in a later cell).
X_train_dtm_2 = countvec_new.transform(X_train)

In [125]:
# Peek at the first five rows of the document-term matrix.
# toarray() takes `order` as its first positional argument, so toarray(5) does
# not mean "first 5 rows" and converts the whole matrix to dense form. Slice
# the sparse matrix first, then densify only that slice.
X_train_dtm_2[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [128]:
# Dense DataFrame: columns are the learned vocabulary, rows are the training texts.
# NOTE: .toarray() materialises the full matrix. At the recorded shape of
# (7500, 25797) with int64 counts that is roughly 1.5 GB of memory.

X_train_dtm_df_2 = pd.DataFrame(X_train_dtm_2.toarray(), columns=countvec_new.get_feature_names())


In [129]:
# (rows, columns) of the dense training frame: reviews by vocabulary terms.
X_train_dtm_df_2.shape

(7500, 25797)

In [133]:
# Document-term matrix for the held-out text, using the fitted vectorizer.
X_test_dtm_2 = countvec_new.transform(X_test)

# Dense pandas frame with one column per learned vocabulary term.
X_test_dtm_df_2 = pd.DataFrame(
    data=X_test_dtm_2.toarray(),
    columns=countvec_new.get_feature_names(),
)

X_test_dtm_df_2.head(5)


Unnamed: 0,00,000,007,00a,00am,00pm,01,02,04,05,...,zur,zuzu,zuzus,zweigel,zwiebel,zy,zzed,zzzzzzzzzzzzzzzzz,école,òc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# create the ML model 
from sklearn.naive_bayes import MultinomialNB

mult_nb_2 = MultinomialNB()

%time mult_nb_2.fit(X_train_dtm_df_2, Y_train)
# cannot run 5 class prediction problem - the computer hangs - too resource intensive. 

In [None]:
# Predict star ratings for the test reviews straight from the sparse
# document-term matrix. It has the same columns as X_test_dtm_df_2 and avoids
# that frame's dense copy.
Y_pred_mult_nb_multiclass = mult_nb_2.predict(X_test_dtm_2)

NameError: name 'X_train_dtm_df_2' is not defined