In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing all the important libraries for the dataset
%matplotlib inline
import pandas as pd
import nltk
import sqlite3
import string
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


In [None]:
# Read the reviews CSV into a DataFrame and preview the first rows.
reviews_csv_path = '../input/amazon-fine-food-reviews/Reviews.csv'
review = pd.read_csv(reviews_csv_path)
review.head()

In [None]:
# Report how many rows the DataFrame holds.
print(f"The number of entries from the dataframe: {review.shape[0]}")

In [None]:
# Number of distinct product ids in the reviews.
review.loc[:, 'ProductId'].nunique()

In [None]:
# Number of distinct user ids in the reviews.
review.loc[:, 'UserId'].nunique()

Check for the Null Values

In [None]:
# Count missing values per column (isna is the pandas alias of isnull).
review.isna().sum()

In [None]:
# Drop every row that contains at least one missing value.
# Reassign rather than mutate with inplace=True, so the step is an explicit
# "new frame from old frame" transformation.
review = review.dropna()

In [None]:
# Re-check the per-column missing-value counts after dropping null rows.
review.isna().sum()

NEUTRAL REVIEWS
> We drop the rows where Score = 3 because neutral reviews do not provide useful signal for predicting positive versus negative sentiment.

In [None]:
# Keep only non-neutral reviews (Score != 3).
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
review = review[review['Score'] != 3].copy()

TARGET VARIABLE
> next we create a column called positive where any score above 3 is encoded as 1 otherwise 0.


In [None]:
# Binary target: 1 when Score is above 3, otherwise 0.
# .assign returns a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
review = review.assign(positive=np.where(review["Score"] > 3, 1, 0))
review.head()

In [None]:
# Bar chart of how many reviews fall into each target class.
# The column is passed explicitly as `x`: recent seaborn releases no longer
# interpret a single positional Series as the x variable.
sns.countplot(x=review['positive'])
plt.show()

MEMORY USAGE

In [None]:
# Summarize the frame; 'deep' also measures the memory held by object (string) columns.
review.info(memory_usage="deep")

LOW MEMORY
> Drop the columns that are not needed for modelling in order to reduce memory usage.

In [None]:
# Remove the columns that are no longer needed after the target has been derived.
unused_columns = [
    'ProductId',
    'UserId',
    'ProfileName',
    'Id',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Score',
    'Time',
    'Summary',
]
review = review.drop(columns=unused_columns)

In [None]:
# Check the memory usage again, now that the unused columns are gone.
review.info(memory_usage="deep")

In [None]:
# Split the data into training and testing sets.
# Features: the raw review text. Target: the binary `positive` label.
# random_state is fixed so the split is the same on every run.
texts = review['Text']
labels = review['positive']
x_train, x_test, y_train, y_test = train_test_split(texts, labels, random_state=0)

In [None]:
# Show the first training document and the size of the training set.
# x_train keeps the original DataFrame index labels, shuffled by the split and
# with gaps from the earlier row filtering, so label 0 is not guaranteed to
# exist in it. Use positional access to get the first element.
print('x_train first entry: \n\n', x_train.iloc[0])
print('\n\nx_train shape:', x_train.shape)

TOKENIZATION
> In order to perform machine learning on text documents, we first need to turn the text content into numerical feature vectors that scikit-learn can use.

BAG OF WORDS
> The simplest way to do so is to use a bag-of-words representation. First we convert the text documents into a matrix of token counts.
The default configuration tokenizes the string by extracting words of at least 2 letters or numbers,
separated by word boundaries, converts everything to lowercase, and builds a vocabulary using these tokens.

In [None]:
# Learn the bag-of-words vocabulary from the training reviews, then display the fitted vectorizer.
vect = CountVectorizer()
vect.fit(x_train)
vect

In [None]:
# Collect the vocabulary terms learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Converting to a list keeps `feat` the same type as before.
feat = list(vect.get_feature_names_out())

In [None]:
# Build a word cloud from the vocabulary terms joined into one space-separated string.
vocabulary_text = " ".join(feat)
cloud = WordCloud(width=1440, height=1080).generate(vocabulary_text)

In [None]:
# Render the word cloud without axes.
# NOTE(review): `cloud` is generated from the joined vocabulary, where each term
# occurs once, so word size here likely does not reflect how often a word
# appears in the reviews - confirm before interpreting sizes as frequencies.
fig, ax = plt.subplots(figsize=(20, 15))
ax.imshow(cloud)
ax.axis('off')
plt.show()

 Sparse Matrix
> We now transform the documents into a bag-of-words representation, i.e. a document-term matrix. The result is stored in a sparse matrix, i.e. one with very few non-zero elements.
> Each row represents a document, while each column represents a word in our training vocabulary.

In [None]:
# Encode the training reviews with the fitted vocabulary.
# The dense contents of the matrix can be inspected as follows:
# x_train_vectorized.toarray()
x_train_vectorized = vect.transform(x_train)

In [None]:
# Train a logistic regression classifier (default settings) on the bag-of-words features.
model = LogisticRegression()
model.fit(X=x_train_vectorized, y=y_train)

In [None]:
# Vectorize the held-out reviews with the fitted vectorizer, then predict
# their labels (used for the accuracy computation).
x_test_vectorized = vect.transform(x_test)
predictions = model.predict(x_test_vectorized)

In [None]:
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Area under the ROC curve.
# ROC analysis needs a continuous score per sample. Hard 0/1 labels from
# `predict` collapse the curve to a single operating point and misstate the AUC.
# Column 1 of predict_proba is the probability of the positive class (label 1).
positive_scores = model.predict_proba(vect.transform(x_test))[:, 1]
roc_auc = roc_auc_score(y_test, positive_scores)
print('AUC:', roc_auc)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

In [None]:
# Plot the ROC curve of the bag-of-words model against the chance diagonal.
plt.title('ROC for logistic regression on bag of words', fontsize=20)
plt.plot(fpr, tpr, 'b', label='AUC= %0.2f' % roc_auc)
# Dashed red diagonal: performance of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True positive rate', fontsize=20)
# The x-axis holds `fpr`, i.e. the false POSITIVE rate (it was mislabelled as
# false negative rate).
plt.xlabel('False positive rate', fontsize=20)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Each coefficient is the weight the model gives a word: strongly negative
# weights push towards class 0, strongly positive weights towards class 1.
# Show the 10 most negative and the 10 most positive words.

# Feature names, in the same column order as the model coefficients.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
feature_names = np.asarray(vect.get_feature_names_out())

# argsort: integer indices that would sort the coefficients in ascending order.
sorted_coef_index = model.coef_[0].argsort()

print('Smallest coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
# [:-11:-1] walks backwards from the end, i.e. the 10 largest coefficients, largest first.
print('Largest coefs: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))


TF IDF(term-frequency-inverse-document-frequency).
> This means that we weigh the terms by how uncommon they are, meaning that we care more about rare words than common words.



Why use TF IDF over bag of words?
> In large texts, some words may be repeated often but carry very little meaningful information about the actual contents of the document. If we were to feed the count data directly to a classifier, those very frequent terms would overshadow the frequencies of rarer yet more interesting terms.


TF IDF allows us to weight terms based on how important they are to a document.

In [None]:
# Ignore the terms that appear in fewer than 5 documents.
vect = TfidfVectorizer(min_df=5).fit(x_train)
# Number of features kept. `vocabulary_` maps each term to its column index, so
# its length is the feature count (get_feature_names() was removed in
# scikit-learn 1.2).
len(vect.vocabulary_)

In [None]:
# Prepare the lookup of the top 10 positive and negative features for the
# TF-IDF vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
feature_names = np.asarray(vect.get_feature_names_out())

# NOTE(review): at this point `model` is still the classifier fitted on the
# previous vectorizer (the TF-IDF model is only fitted in a later cell), so these
# indices do not line up with `feature_names`. Re-run this cell after the TF-IDF
# model has been fitted before enabling the prints below.
sorted_coef_index = model.coef_[0].argsort()

# print('Smallest coef: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
# print('Largest coef: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

In [None]:
# Collect the vocabulary terms of the current vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Converting to a list keeps `feat` the same type as before.
feat = list(vect.get_feature_names_out())

In [None]:
# Build a word cloud from the vocabulary terms joined into one space-separated string.
vocabulary_text = " ".join(feat)
cloud = WordCloud(width=1440, height=1080).generate(vocabulary_text)

In [None]:
# Render the word cloud without axes.
# NOTE(review): `cloud` is generated from the joined vocabulary, where each term
# occurs once, so word size here likely does not reflect how often a word
# appears in the reviews - confirm before interpreting sizes as frequencies.
fig, ax = plt.subplots(figsize=(20, 15))
ax.imshow(cloud)
ax.axis('off')
plt.show()

In [None]:
# Encode the training reviews with the currently fitted vectorizer.
x_train_vectorized = vect.transform(x_train)

In [None]:
# Train a fresh logistic regression classifier (default settings) on the re-vectorized training data.
model = LogisticRegression()
model.fit(X=x_train_vectorized, y=y_train)

In [None]:
# Vectorize the held-out reviews with the fitted vectorizer, then predict their labels.
x_test_vectorized = vect.transform(x_test)
predictions = model.predict(x_test_vectorized)

In [None]:
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Area under the ROC curve.
# ROC analysis needs a continuous score per sample. Hard 0/1 labels from
# `predict` collapse the curve to a single operating point and misstate the AUC.
# Column 1 of predict_proba is the probability of the positive class (label 1).
positive_scores = model.predict_proba(vect.transform(x_test))[:, 1]
roc_auc = roc_auc_score(y_test, positive_scores)
print('AUC:', roc_auc)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

In [None]:
# Plot the ROC curve of the TF-IDF model against the chance diagonal.
# Title typo fixed ('regressio' -> 'regression').
plt.title('ROC for logistic regression on TF-IDF', fontsize=25)
# Dashed red diagonal: performance of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc="lower right")
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True positive rate', fontsize=20)
plt.xlabel('False positive rate', fontsize=20)
plt.show()

In [None]:
# Even though the number of features was reduced considerably,
# the AUC did not change much.

# Sanity-check the model on two hand-written reviews.
new_review = ['The food was delicious', 'The food was not good']
new_review_features = vect.transform(new_review)
print(model.predict(new_review_features))

Bigrams
> Since our classifier misclassifies phrases like 'not good', we will use sequences of adjacent words instead of single words only. This method is called n-grams. Here we take sequences of 1 and 2 words (unigrams and bigrams) into consideration.

In [None]:
# Count unigrams and bigrams, ignoring terms that appear in fewer than 5 documents.
vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(x_train)
# The variable name is misspelled ("vactorized") but is kept as-is because the
# model-fitting cell further down reads it under this exact name.
x_train_vactorized = vect.transform(x_train)
# Number of features kept. `vocabulary_` maps each term to its column index, so
# its length is the feature count (get_feature_names() was removed in
# scikit-learn 1.2).
len(vect.vocabulary_)

In [None]:
# Collect the vocabulary terms of the current vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Converting to a list keeps `feat` the same type as before.
feat = list(vect.get_feature_names_out())

In [None]:
# Build a word cloud from the vocabulary terms joined into one space-separated string.
vocabulary_text = " ".join(feat)
cloud = WordCloud(width=1440, height=1080).generate(vocabulary_text)

In [None]:
# Render the word cloud on a large canvas with the axes hidden.
fig, ax = plt.subplots(figsize=(20, 15))
ax.imshow(cloud)
ax.axis('off')
plt.show()

In [None]:
# The number of features has increased again.
# Refit a logistic regression (default settings) on the n-gram features so the AUC can be checked.
model = LogisticRegression()
model.fit(X=x_train_vactorized, y=y_train)

In [None]:
# Vectorize the held-out reviews with the fitted vectorizer, then predict their labels.
x_test_vectorized = vect.transform(x_test)
predictions = model.predict(x_test_vectorized)

In [None]:
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Area under the ROC curve.
# ROC analysis needs a continuous score per sample. Hard 0/1 labels from
# `predict` collapse the curve to a single operating point and misstate the AUC.
# Column 1 of predict_proba is the probability of the positive class (label 1).
positive_scores = model.predict_proba(vect.transform(x_test))[:, 1]
roc_auc = roc_auc_score(y_test, positive_scores)
print('AUC:', roc_auc)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

In [None]:
# Draw the ROC curve for the n-gram model on an explicit Axes object.
fig, ax = plt.subplots()
ax.set_title('ROC for logistic Regression on Bigrams', fontsize=20)
ax.plot(fpr, tpr, 'b', label='AUC=%0.2f' % roc_auc)
ax.legend(loc='lower right')
# Dashed red diagonal: performance of a random classifier.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True positive rate', fontsize=20)
ax.set_xlabel('False positive rate', fontsize=20)
plt.show()