In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist

dataset = pd.read_csv('kindle_reviews.csv', na_filter=False)
newdf = dataset[:10000]

In [20]:
newdf.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [21]:
newdf.columns

Index(['Unnamed: 0', 'asin', 'helpful', 'overall', 'reviewText', 'reviewTime',
       'reviewerID', 'reviewerName', 'summary', 'unixReviewTime'],
      dtype='object')

In [22]:
newdf.dtypes

Unnamed: 0         int64
asin              object
helpful           object
overall            int64
reviewText        object
reviewTime        object
reviewerID        object
reviewerName      object
summary           object
unixReviewTime     int64
dtype: object

In [23]:
print ("Shape of the dataset - ", newdf.shape)
#check for the missing values
newdf.apply(lambda x: sum(x.isnull()))

Shape of the dataset -  (10000, 10)


Unnamed: 0        0
asin              0
helpful           0
overall           0
reviewText        0
reviewTime        0
reviewerID        0
reviewerName      0
summary           0
unixReviewTime    0
dtype: int64

In [24]:
print ("Shape of the dataset - ", newdf.shape)
#check for the missing values
newdf.apply(lambda x: sum(x.isnull()))

Shape of the dataset -  (10000, 10)


Unnamed: 0        0
asin              0
helpful           0
overall           0
reviewText        0
reviewTime        0
reviewerID        0
reviewerName      0
summary           0
unixReviewTime    0
dtype: int64

In [25]:
newdf['overall'].value_counts()

5    4631
4    2963
3    1403
2     578
1     425
Name: overall, dtype: int64

In [26]:
# Remove neutral rated
newdf = newdf[newdf['overall'] != 3]
newdf['Positively Rated'] = np.where(newdf['overall'] > 3, 1, 0)

# 22 rows from reviewText are blank. Lets add sample review for it
#newdf['reviewText']=newdf['reviewText'].fillna("No Review", inplace=True)
#newdf = newdf.replace(np.nan, '', regex=True)
#newdf.apply(lambda x: sum(x.isnull()))
#print (newdf['reviewText'].head(10))

In [27]:
#Number of rating which are positively rated 
newdf['Positively Rated'].mean()

0.8833313946725602

In [28]:
from  sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(newdf['reviewText'],newdf['Positively Rated'], random_state=0)
print('X_train first entry: ', X_train.iloc[1])
print('\nX_train shape: ', X_train.shape)

X_train first entry:  There's a lot of misspellings & it takes a while for the book to really get started. The book gets pretty good about 30% in & it has a decent ending. Its worth the read.

X_train shape:  (6447,)


In [29]:
from  sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from  sklearn.metrics import roc_auc_score

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)
# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())
# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()
# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

AUC:  0.7596766584690579
Smallest Coefs:
['idea' 'waste' 'delete' 'weak' 'deleted' 'not' 'pay' 'whole' 'violent'
 'nothing']

Largest Coefs: 
['enjoyed' 'loved' 'fun' 'wait' 'overall' 'well' 'hot' 'mind' 'erotic'
 'good']


In [30]:
df = newdf.groupby('asin', as_index=False).agg({'Positively Rated': 'sum'})
#df.sort_values(by=['Positively Rated'], ascending=False)
print ("PRODUCT HAVING THE LARGEST POSTIVE RATING - ",df.loc[df['Positively Rated'].idxmax()][0])

PRODUCT HAVING THE LARGEST POSTIVE RATING -  B000JMLBHU


In [33]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [34]:
# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

Training data: 
(8597,)
(8597,)


In [35]:
# Summarize number of classes
print("Classes: ")
print(np.unique(y))

Classes: 
[0 1]


In [36]:
# Summarize number of words
print("Number of words: ")
print(len(np.unique(np.hstack(X))))

Number of words: 
8597


In [None]:
# Summarize review length
print("Review length: ")
result = map(len, X)
print("Mean %.2f words (%f)" % (np.mean(result), np.std(result)))
# plot review length as a boxplot and histogram
pyplot.subplot(121)
pyplot.boxplot(result)
pyplot.subplot(122)
pyplot.hist(result)
pyplot.show()