In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist

dataset = pd.read_csv('kindle_reviews.csv', na_filter=False)
newdf = dataset[:10000]

In [4]:
newdf.head()

Unnamed: 0,overall,reviewText
0,5,I enjoy vintage books and movies so I enjoyed ...
1,4,This book is a reissue of an old one; the auth...
2,4,This was a fairly interesting read. It had ol...
3,5,I'd never read any of the Amy Brewster mysteri...
4,4,"If you like period pieces - clothing, lingo, y..."


In [5]:
newdf.columns

Index(['overall', 'reviewText'], dtype='object')

In [6]:
newdf.dtypes

overall        int64
reviewText    object
dtype: object

In [7]:
print ("Shape of the dataset - ", newdf.shape)
#check for the missing values
newdf.apply(lambda x: sum(x.isnull()))

Shape of the dataset -  (10000, 2)


overall       0
reviewText    0
dtype: int64

In [8]:
print ("Shape of the dataset - ", newdf.shape)
#check for the missing values
newdf.apply(lambda x: sum(x.isnull()))

Shape of the dataset -  (10000, 2)


overall       0
reviewText    0
dtype: int64

In [9]:
newdf['overall'].value_counts()

5    4631
4    2963
3    1403
2     578
1     425
Name: overall, dtype: int64

In [10]:
# Remove neutral rated
newdf = newdf[newdf['overall'] != 3]
newdf['Positively Rated'] = np.where(newdf['overall'] > 3, 1, 0)

# 22 rows from reviewText are blank. Lets add sample review for it
#newdf['reviewText']=newdf['reviewText'].fillna("No Review", inplace=True)
#newdf = newdf.replace(np.nan, '', regex=True)
#newdf.apply(lambda x: sum(x.isnull()))
#print (newdf['reviewText'].head(10))

In [12]:
#Number of rating which are positively rated 
newdf['Positively Rated'].mean()

0.8833313946725602

In [13]:
from  sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(newdf['reviewText'],newdf['Positively Rated'], random_state=0)
print('X_train first entry: ', X_train.iloc[1])
print('\nX_train shape: ', X_train.shape)

X_train first entry:  There's a lot of misspellings & it takes a while for the book to really get started. The book gets pretty good about 30% in & it has a decent ending. Its worth the read.

X_train shape:  (6447,)


In [15]:
from  sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from  sklearn.metrics import roc_auc_score

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)
# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())
# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()
# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

AUC:  0.7596766584690579
Smallest Coefs:
['idea' 'waste' 'delete' 'weak' 'deleted' 'not' 'pay' 'whole' 'violent'
 'nothing']

Largest Coefs: 
['enjoyed' 'loved' 'fun' 'wait' 'overall' 'well' 'hot' 'mind' 'erotic'
 'good']


In [17]:
df = newdf.groupby('asin', as_index=False).agg({'Positively Rated': 'sum'})
#df.sort_values(by=['Positively Rated'], ascending=False)
print ("PRODUCT HAVING THE LARGEST POSTIVE RATING - ",df.loc[df['Positively Rated'].idxmax()][0])

KeyError: 'asin'

In [None]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [None]:
# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

In [None]:
# Summarize number of classes
y = np.concatenate((y_train, y_test), axis=0)
print("Classes: ")
print(np.unique(y))

In [None]:
# Summarize number of words
print("Number of words: ")
print(len(np.unique(np.hstack(X))))

In [None]:
# Summarize review length
print("Review length: ")
result = map(len, X)
print("Mean %.2f words (%f)" % (np.mean(result), np.std(result)))
# plot review length as a boxplot and histogram
pyplot.subplot(121)
pyplot.boxplot(result)
pyplot.subplot(122)
pyplot.hist(result)
pyplot.show()