In [1]:
import pandas as pd
import numpy as np

In [2]:
reviews_sst = pd.read_csv('preprocessed_reviews_SST.csv')

In [3]:
reviews_sst[:4]

Unnamed: 0,Score,Summary,Text
0,5,perfect dark chocol experi,well dark chocol creami lavend absolut perfect...
1,5,fresh qualiti,swear cupcak ate fresh one local groceri store...
2,5,wrong,origin given 1 star sinc cake best shape got s...
3,5,cat like juic food,cat love sauc wet food feed product sauci winn...


In [4]:
dataset = pd.DataFrame()

In [5]:
# Combine the summary and the review body into a single text field.
# Missing values are replaced with '' first: otherwise a missing Summary is
# stringified to the literal token 'nan', and a missing Text turns the whole
# concatenation into NaN, which the vectorizer cannot tokenise.
dataset['smry_txt'] = (
    reviews_sst['Summary'].fillna('').astype(str)
    + ' '
    + reviews_sst['Text'].fillna('').astype(str)
)

In [6]:
dataset['score'] = reviews_sst['Score']

In [7]:
dataset.head()

Unnamed: 0,smry_txt,score
0,perfect dark chocol experi well dark chocol cr...,5
1,fresh qualiti swear cupcak ate fresh one local...,5
2,wrong origin given 1 star sinc cake best shape...,5
3,cat like juic food cat love sauc wet food feed...,5
4,list mislead cocoa butter found reason thought...,1


In [8]:
dataset.shape

(568454, 2)

In [9]:
# Keep only reviews that are clearly positive (4 or 5) or negative (1 or 2);
# neutral 3-star reviews are dropped.
# .copy() makes the result an independent frame, so later column assignments
# on `dataset` do not trigger SettingWithCopyWarning.
dataset = dataset[dataset.score != 3].copy()

In [10]:
print(dataset.shape)
# Count the 3-star rows from the raw scores instead of hard-coding the two
# totals, so the message stays correct if the input file changes.
print('{} rows has review as 3'.format((reviews_sst['Score'] == 3).sum()))

(525814, 2)
42640 rows has review as 3


In [11]:
# Collapse star ratings into binary sentiment labels.
SCORE_TO_LABEL = {1: 'negative', 2: 'negative', 4: 'positive', 5: 'positive'}
# Values that are not keys of the mapping (e.g. labels produced by a previous
# run of this cell) are passed through unchanged, so re-running the cell does
# not overwrite the column with None. assign() returns a new frame rather than
# writing into what may be a slice, which avoids SettingWithCopyWarning.
dataset = dataset.assign(
    score=dataset['score'].map(lambda s: SCORE_TO_LABEL.get(s, s))
)

In [12]:
dataset.head()

Unnamed: 0,smry_txt,score
0,perfect dark chocol experi well dark chocol cr...,positive
1,fresh qualiti swear cupcak ate fresh one local...,positive
2,wrong origin given 1 star sinc cake best shape...,positive
3,cat like juic food cat love sauc wet food feed...,positive
4,list mislead cocoa butter found reason thought...,negative


## Split the data into training (70%) and testing (30%) sets

In [13]:
import random
def getTrainAndTestIndices(size, trn_pcnt = 0.70, tst_pcnt = 0.30, seed = None):
    """Shuffle the positions 0..size-1 and split them into train and test lists.

    Args:
        size: Number of datapoints; the positions range(size) are partitioned.
        trn_pcnt: Fraction of the positions placed in the training list.
            Must lie in [0, 1].
        tst_pcnt: Accepted for backward compatibility but not used in the
            computation: the testing list always receives every position
            that is not in the training list.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            performs the shuffle, so the split is reproducible and the
            module-level random state is left untouched. When None, the
            module-level ``random.shuffle`` is used.

    Returns:
        A ``(train_indices, test_indices)`` tuple of two lists that together
        contain every position in range(size) exactly once.

    Raises:
        ValueError: If ``size`` is negative or ``trn_pcnt`` is outside [0, 1].
    """
    if size < 0:
        raise ValueError("size must be non-negative, got {}".format(size))
    if not 0 <= trn_pcnt <= 1:
        raise ValueError("trn_pcnt must be within [0, 1], got {}".format(trn_pcnt))

    print("\n we have {} datapoints".format(size), end = ". ")
    train_size = int(size*trn_pcnt)
    # The test split is whatever remains, so the two parts always add up to size.
    test_size = size - train_size
    print("Out of which, we will get {} points in our training data and {} points in our testing data. \n".format(train_size, test_size))

    ind = list(range(size))
    if seed is None:
        random.shuffle(ind)
    else:
        random.Random(seed).shuffle(ind)
    return ind[:train_size], ind[train_size:]

In [14]:
 # Seed the module-level RNG used by the shuffle inside the split helper, so
 # the same train/test partition is produced on every Restart & Run All.
 random.seed(42)
 trn_ind, tst_ind = getTrainAndTestIndices(dataset.shape[0])


 we have 525814 datapoints. Out of which, we will get 368069 points in our training data and 157745 points in our testing data. 



In [15]:
len(trn_ind), len(tst_ind)

(368069, 157745)

In [16]:
# Materialise both partitions by positional lookup of the shuffled indices,
# then show their shapes as the cell output.
train_data, test_data = dataset.iloc[trn_ind], dataset.iloc[tst_ind]
(train_data.shape, test_data.shape)

((368069, 2), (157745, 2))

In [17]:
train_data.head(2)

Unnamed: 0,smry_txt,score
480023,never drink oolong tea cost plu world market t...,negative
277261,like homemad tast almost good real thing textu...,positive


## Creating a sparse word-count vector representation of the training data

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()

In [19]:
# Learn the vocabulary from the training texts and build their document-term
# count matrix in one step.
word_counts = count_vectorizer.fit_transform(train_data['smry_txt'].values)

In [20]:
type(word_counts), word_counts

(scipy.sparse.csr.csr_matrix,
 <368069x70671 sparse matrix of type '<class 'numpy.int64'>'
 	with 12282365 stored elements in Compressed Sparse Row format>)

In [21]:
# Let's see what a single row of the sparse matrix looks like: each printed
# entry is (row, column index) followed by the stored count.
print(word_counts[2])

  (0, 53223)	1
  (0, 11945)	1
  (0, 51911)	1
  (0, 30991)	1
  (0, 28910)	1
  (0, 8542)	1
  (0, 55572)	1
  (0, 17132)	1
  (0, 10893)	1
  (0, 40745)	1
  (0, 65541)	1
  (0, 54244)	1
  (0, 28246)	1
  (0, 64650)	1
  (0, 66165)	1
  (0, 59651)	1
  (0, 29172)	1
  (0, 60558)	1
  (0, 2425)	1
  (0, 43863)	1
  (0, 49712)	2
  (0, 13264)	1
  (0, 28537)	1
  (0, 3084)	1
  (0, 15894)	1
  :	:
  (0, 18635)	2
  (0, 67679)	1
  (0, 40878)	1
  (0, 50586)	1
  (0, 45410)	2
  (0, 36510)	1
  (0, 47068)	1
  (0, 14546)	1
  (0, 18439)	1
  (0, 12304)	1
  (0, 37394)	1
  (0, 55523)	1
  (0, 19102)	1
  (0, 5990)	1
  (0, 46169)	1
  (0, 14125)	2
  (0, 24468)	3
  (0, 47323)	3
  (0, 16737)	5
  (0, 52297)	1
  (0, 28372)	1
  (0, 28189)	1
  (0, 8111)	3
  (0, 47367)	3
  (0, 27576)	1


In [22]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
# Raw test texts and their true sentiment labels as plain arrays.
x_test = test_data['smry_txt'].values
y_test = test_data['score'].values

In [25]:
# Train a multinomial Naive Bayes classifier on the training word counts,
# using the 'score' column of the training frame as the target labels.
multi_nb_classifier = MultinomialNB()
multi_nb_classifier.fit(X=word_counts, y=train_data['score'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
pred = multi_nb_classifier.predict(count_vectorizer.transform(x_test))

In [35]:
pred[:10], y_test[:10]

(array(['positive', 'positive', 'positive', 'positive', 'positive',
        'negative', 'positive', 'positive', 'positive', 'positive'], 
       dtype='<U8'),
 array(['positive', 'positive', 'positive', 'positive', 'positive',
        'negative', 'positive', 'positive', 'positive', 'positive'], dtype=object))

In [31]:
# Fraction of test reviews whose predicted label equals the true label.
# np.mean over the boolean match array is vectorised, whereas the builtin
# sum() walks the elements one at a time in Python; y_test is already an
# array, so no extra np.array() conversion is needed.
accuracy = np.mean(pred == y_test)
print('accuracy : {}'.format(accuracy))

accuracy : 0.9182668230371802


In [37]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

   negative       0.73      0.75      0.74     24588
   positive       0.95      0.95      0.95    133157

avg / total       0.92      0.92      0.92    157745



In [39]:
from sklearn.metrics import precision_recall_fscore_support
# Micro-averaged precision / recall / F-score over the test predictions.
# NOTE(review): in the recorded output all three values equal the accuracy
# printed earlier (0.9182...), so this cell adds no new information here.
precision_recall_fscore_support(y_pred=pred, y_true=y_test, average='micro')

(0.91826682303718021, 0.91826682303718021, 0.91826682303718021, None)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
# The true labels are held in `y_test`. `y_true` is only the keyword name of
# the parameter and is not defined as a variable in this notebook, so passing
# it as the value raised NameError.
precision_recall_fscore_support(y_pred=pred, y_true=y_test, average='micro')

In [None]:
\