# Estimating Ratings from Reviews Using Naive Bayes Classifier

In [26]:
# Load the full review dataset from disk into a DataFrame
import pandas as pd
df_orig = pd.read_csv("Reviews.csv", engine="python")

In [27]:
# Preview the first three rows to check which columns are available
df_orig.head(n=3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


## (a) NB Classifier with Unigrams

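Each review is treated as a bag of words, and a Naive Bayes (NB) classifier is trained to predict the review's rating. Because the ratings are not uniformly distributed (5-star reviews far outnumber the others), each sample is given a class weight inversely proportional to the frequency of its rating, so that the estimated ratings are not skewed towards the majority class.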

In [48]:
# Keep only the rating (Score) and the review body (Text), dropping rows whose
# review text is missing. The selection is done in one .loc step and copied so
# that df is an independent frame: the 'weights' column added later is then
# written to a real DataFrame rather than to a slice of df_orig (which would
# raise pandas' SettingWithCopyWarning).
df = df_orig.loc[pd.notnull(df_orig['Text']), ['Score', 'Text']].copy()
# Show the number of reviews available for each score
print(df.groupby('Score').count())
# df_final is a second name for the same frame; later cells add columns through it
df_final = df

         Text
Score        
1       52268
2       29769
3       42640
4       80655
5      363122


In [49]:
# Count the reviews available for each score
weights = df.groupby('Score').count()
# Score 5 has the most samples, so its count is used as the reference: every
# class weight is (count of score 5) / (count of that score), giving score 5 a
# weight of 1 and rarer scores proportionally larger weights.
class_weights = weights.loc[5, 'Text'] / weights['Text']


def add_weight(row):
    """Return the class weight for a single row, looked up from its 'Score'.

    Scores 1-4 use their own entry in ``class_weights``; any other value falls
    back to the score-5 weight.
    """
    score = row['Score']
    if score in (1, 2, 3, 4):
        return class_weights.loc[score]
    return class_weights.loc[5]


# Add a 'weights' column holding the class weight of every sample. This is the
# vectorised equivalent of applying add_weight row by row: map() looks each
# score up in class_weights, and fillna() reproduces add_weight's fallback to
# the score-5 weight for anything that is not a known score.
df_final['weights'] = df_final['Score'].map(class_weights).fillna(class_weights.loc[5])

In [57]:
# Display the per-score class weights (score-5 count divided by each score's count)
class_weights

Score
1     6.947310
2    12.197991
3     8.515994
4     4.502164
5     1.000000
Name: Text, dtype: float64

In [50]:
# Import the required libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
# Hold out part of the data for testing. Texts, labels and sample weights are
# passed to the same call so the three splits stay aligned; the seed is fixed.
X_train, X_test, y_train, y_test, W_train, W_test = train_test_split(
    df_final['Text'],
    df_final['Score'].astype(float),
    df_final['weights'],
    random_state=0
)
# Bag-of-words term counts for the training reviews (English stop words removed, min_df=20)
count_vect = CountVectorizer(stop_words='english', min_df=20)
X_train_counts = count_vect.fit_transform(X_train)
# Convert the raw counts into tf-idf features
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Fit a multinomial Naive Bayes model, weighting each training sample by its class weight
clf = MultinomialNB().fit(
    X_train_tfidf,
    y_train,
    sample_weight=W_train
)

In [51]:
from sklearn.metrics import confusion_matrix
# Vectorise the held-out reviews with the already-fitted count vectoriser and tf-idf transformer
xtest = count_vect.transform(X_test)
xidftext = tfidf_transformer.transform(xtest)
# Predicted rating for every test review
y_pred = clf.predict(xidftext)
# Confusion matrix of true ratings against predicted ratings
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix for the predicted samples')
conf_mat

Confusion Matrix for the predicted samples


array([[ 8563,  2464,  1038,   406,   504],
       [ 1711,  3361,  1493,   621,   435],
       [ 1445,  1714,  4522,  1833,  1023],
       [ 1424,  1328,  3212,  8980,  5400],
       [ 4555,  3339,  5489, 16528, 60726]])

In [52]:
# Calculate Performance Metrics
from sklearn.metrics import precision_recall_fscore_support

# Accuracy: correctly classified samples (the diagonal of the confusion matrix)
# divided by all samples. trace() sums the diagonal for any number of classes
# instead of assuming there are exactly five.
corr_pred = conf_mat.trace()
accuracy = corr_pred / float(conf_mat.sum())

# Prediction error: absolute difference between predicted and true rating
y_true = y_test.to_numpy()
d = {'y_true': y_true, 'y_pred': y_pred}
results = pd.DataFrame(data=d)


def find_diff(row):
    """Return the absolute difference between a row's predicted and true rating."""
    return abs(row['y_pred'] - row['y_true'])


# Vectorised equivalent of applying find_diff to every row
results['diff'] = (results['y_pred'] - results['y_true']).abs()
mean = results['diff'].mean()
variance = results['diff'].var()

# Macro-averaged scores; element 0 is the precision and element 1 the recall
average_precision = precision_recall_fscore_support(y_test, y_pred, average='macro')

In [53]:
# Report the metrics. Each line is built as one string and passed to print()
# as a single argument, so the cell runs and prints the same text on both
# Python 2 and Python 3 (a bare print statement is a SyntaxError on Python 3).
print('Performace Metrics')
print('Accuracy =  {}'.format(accuracy))
print('Mean Prediction Error = {}'.format(results['diff'].mean()))
print('Variance of prediction Error =  {}'.format(results['diff'].var()))
print('Precision =  {}'.format(average_precision[0]))
print('Recall = {}'.format(average_precision[1]))

Performace Metrics
Accuracy =  0.6062175436621304
Mean Prediction Error = 0.656451862589
Variance of prediction Error =  1.0433243214
Precision =  0.4509327320709585
Recall = 0.5283066363397099


## (b) NB Classifier with Unigrams & Bigrams

In [54]:
# Represent each review as a bag of unigrams (single words) and bigrams
# (pairs of adjacent words), then convert the counts to tf-idf features
from sklearn.metrics import confusion_matrix
count_vect = CountVectorizer(min_df=20, stop_words='english', ngram_range=(1, 2))
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Train a MultinomialNB classifier on the train data, weighting each sample by its class weight
clf = MultinomialNB().fit(X_train_tfidf, y_train, sample_weight=W_train)
# Find the tfidf representation of the test data using the fitted vectoriser and transformer
xtest = count_vect.transform(X_test)
xidftext = tfidf_transformer.transform(xtest)
# Predict the ratings for the test data
y_pred = clf.predict(xidftext)
# Generate a confusion matrix of true ratings against predicted ratings
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion Matrix for the prediction samples')
# print() with a single argument works on both Python 2 and Python 3
print(conf_mat)

Confusion Matrix for the prediction samples
[[ 9591  1672   922   378   412]
 [ 1492  3979  1250   524   376]
 [ 1134  1234  5660  1668   841]
 [ 1035   868  2312 11123  5006]
 [ 3150  1972  3807 13711 67997]]


In [55]:
# Calculate Performance Metrics
from sklearn.metrics import precision_recall_fscore_support

# Accuracy: correctly classified samples (the diagonal of the confusion matrix)
# divided by all samples. trace() sums the diagonal for any number of classes
# instead of assuming there are exactly five.
corr_pred = conf_mat.trace()
accuracy = corr_pred / float(conf_mat.sum())

# Prediction error: absolute difference between predicted and true rating
y_true = y_test.to_numpy()
d = {'y_true': y_true, 'y_pred': y_pred}
results = pd.DataFrame(data=d)


def find_diff(row):
    """Return the absolute difference between a row's predicted and true rating."""
    return abs(row['y_pred'] - row['y_true'])


# Vectorised equivalent of applying find_diff to every row
results['diff'] = (results['y_pred'] - results['y_true']).abs()
mean = results['diff'].mean()
variance = results['diff'].var()

# Macro-averaged scores; element 0 is the precision and element 1 the recall
average_precision = precision_recall_fscore_support(y_test, y_pred, average='macro')

In [56]:
# Report the metrics. Each line is built as one string and passed to print()
# as a single argument, so the cell runs and prints the same text on both
# Python 2 and Python 3 (a bare print statement is a SyntaxError on Python 3).
print('Performace Metrics')
print('Accuracy =  {}'.format(accuracy))
print('Mean Prediction Error = {}'.format(results['diff'].mean()))
print('Variance of prediction Error =  {}'.format(results['diff'].var()))
print('Precision =  {}'.format(average_precision[0]))
print('Recall = {}'.format(average_precision[1]))

Performace Metrics
Accuracy =  0.6920500443306078
Mean Prediction Error = 0.493040798232
Variance of prediction Error =  0.823454696964
Precision =  0.5433179771015136
Recall = 0.6190827707312356


# Estimating Ratings from Review Summary Using Naive Bayes Classifier

In [44]:
# Import the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

# Read the csv file which has the summaries of the reviews
df = pd.read_csv("Reviews_summaries.csv", engine='python')
# Keep only the rating and the summary text, dropping rows with no summary.
# The explicit .copy() makes this an independent frame, so the 'weights' column
# added below is not written into a slice of the loaded data (which would raise
# pandas' SettingWithCopyWarning). Rows are not shuffled here; the train/test
# split below does the shuffling with a fixed seed.
df = df.loc[pd.notnull(df['predicted_summary']), ['Score', 'predicted_summary']].copy()
# df_final is a second name for the same frame
df_final = df

# Count the samples for each score and derive class weights relative to score 5:
# (count of score 5) / (count of that score)
weights = df_final.groupby('Score').count()
class_weights = weights.loc[5, 'predicted_summary'] / weights['predicted_summary']


def add_weight(row):
    """Return the class weight for a single row, looked up from its 'Score'.

    Scores 1-4 use their own entry in ``class_weights``; any other value falls
    back to the score-5 weight.
    """
    score = row['Score']
    if score in (1, 2, 3, 4):
        return class_weights.loc[score]
    return class_weights.loc[5]


# Add a 'weights' column holding the class weight of every sample. This is the
# vectorised equivalent of applying add_weight row by row: map() looks each
# score up in class_weights, and fillna() reproduces add_weight's fallback to
# the score-5 weight for anything that is not a known score.
df_final['weights'] = df_final['Score'].map(class_weights).fillna(class_weights.loc[5])

# Split the data into train and test; texts, labels and sample weights are
# passed to the same call so the three splits stay aligned
X_train, X_test, y_train, y_test, W_train, W_test = train_test_split(
    df_final['predicted_summary'],
    df_final['Score'].astype(float),
    df_final['weights'],
    random_state=0
)
# Represent each review summary as a bag of unigrams (single words) and
# bigrams (pairs of adjacent words), then convert the counts to tf-idf features
count_vect = CountVectorizer(min_df=20, stop_words='english', ngram_range=(1, 2))
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Train a MultinomialNB classifier on the train data, weighting each sample by its class weight
clf = MultinomialNB().fit(X_train_tfidf, y_train, sample_weight=W_train)
# Find the tfidf representation of the test data using the fitted vectoriser and transformer
xtest = count_vect.transform(X_test)
xidftext = tfidf_transformer.transform(xtest)
# Predict the ratings for the test data
y_pred = clf.predict(xidftext)
# Generate a confusion matrix of true ratings against predicted ratings
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion Matrix for the predicted samples')
# print() with a single argument works on both Python 2 and Python 3
print(conf_mat)

Confusion Matrix for the predicted samples
[[ 7415  1789  1249   401  2121]
 [ 2647  1388  1525   448  1613]
 [ 2389  1144  3062  1032  2910]
 [ 1717  1013  4146  2962 10506]
 [ 4422  2466  5496  7833 70420]]


In [45]:
# Calculate Performance Metrics
from sklearn.metrics import precision_recall_fscore_support

# Accuracy: correctly classified samples (the diagonal of the confusion matrix)
# divided by all samples. trace() sums the diagonal for any number of classes
# instead of assuming there are exactly five.
corr_pred = conf_mat.trace()
accuracy = corr_pred / float(conf_mat.sum())

# Prediction error: absolute difference between predicted and true rating
y_true = y_test.to_numpy()
d = {'y_true': y_true, 'y_pred': y_pred}
results = pd.DataFrame(data=d)


def find_diff(row):
    """Return the absolute difference between a row's predicted and true rating."""
    return abs(row['y_pred'] - row['y_true'])


# Vectorised equivalent of applying find_diff to every row
results['diff'] = (results['y_pred'] - results['y_true']).abs()
mean = results['diff'].mean()
variance = results['diff'].var()

# Macro-averaged scores; element 0 is the precision and element 1 the recall
average_precision = precision_recall_fscore_support(y_test, y_pred, average='macro')

In [47]:
# Report the metrics. Each line is built as one string and passed to print()
# as a single argument, so the cell runs and prints the same text on both
# Python 2 and Python 3 (a bare print statement is a SyntaxError on Python 3).
print('Performace Metrics')
print('Accuracy =  {}'.format(accuracy))
print('Mean Prediction Error = {}'.format(results['diff'].mean()))
print('Variance of prediction Error =  {}'.format(results['diff'].var()))
print('Precision =  {}'.format(average_precision[0]))
print('Recall = {}'.format(average_precision[1]))

Performace Metrics
Accuracy =  0.5998494166654939
Mean Prediction Error = 0.720513109194
Variance of prediction Error =  1.20556219476
Precision =  0.36249497146719023
Recall = 0.39334962972340304
