### Importing Libraries & Loading Data

In [7]:
import numpy as np
import pandas as pd
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

df = pd.read_csv('binarized_reviews.csv')
df.sample(5)

Unnamed: 0,reviewText,reviewPositive
125640,My nephew loves them.,1
506030,They aren't very durable.,1
92459,Great shoes. Comfortable as soon as you put th...,1
463213,I got these in need of shoes to lounge around ...,1
460421,This is my third pair of these. I love the fit...,1


### Train-Test Split

In [8]:
features_train, features_test, labels_train, labels_test = train_test_split(df['reviewText'], df['reviewPositive'], test_size=0.3, random_state=42)

### Vectorizing the reviews using CountVectorizer

In [9]:
# Creating a vocabulary (a dictionary of all unique words in the features_train data)
vectorizer = CountVectorizer().fit(features_train)

print('Length of the vocabulary:',len(vectorizer.get_feature_names()), 'words')

# vectorizer.get_feature_names()[20000:20010]

features_train_vectorized = vectorizer.transform(features_train)

Length of the vocabulary:  63967 words


Displaying a single complete vectorized feature input. We can see the bag of words model, set of numbers (frequency) of words.

In [10]:
numpy.set_printoptions(threshold=sys.maxsize)

# features_train_vectorized[1].toarray()

# Changing back the threshold setting
# numpy.set_printoptions(threshold=None)

In [11]:
print('Shape of features_train_vectorized: ', features_train_vectorized.shape)

Shape of features_train_vectorized:  (369259, 63967)


#### Modeling, Prediction & Calculating Accuracy

In [27]:
model = LogisticRegression(C = 0.01, max_iter=150)
model.fit(features_train_vectorized, labels_train)

predictions = model.predict(vectorizer.transform(features_test))

confusion_matrix(labels_test, predictions)

roc_auc_score(labels_test, predictions)

0.8918943739250909

In [13]:
len(features_test)

158255

### Calculating Tf-idf score for a word in document.

 - TF $\Rightarrow$ Term frequency
 - IDF $\Rightarrow$ Inverse Document Frequency
 
 $Tf-idf = Term\;Frequency \times Inverse-Document\;Frequency $ 
 
 where
 
 $Term\;Frequency (TF)=\frac{Count\;of\;that\;word\;appears\;in\;a\;document}{Length\;of\;that\;document} $
 
 $Term\;Frequency = \frac{Count(Word)}{Length(Document)}$
 
 $IDF = \frac{Total\;number\;of\;documents}{Number\;of\;documents\;that\;contain\;the\;word} $


 $$ TF-IDF\;Score = \frac{Count(Word)}{Length(Document)} \times log\Bigg ( \frac{Total\;number\;of\;documents}{Number\;of\;documents\;that\;contain\;the\;word}\Bigg ) $$

 Higher the Tf-idf score, more relevant that word in that document. 
 
 One review $\Rightarrow$ One document

### Vectorizing using Tf-idf Vectorizer

In [14]:
# min_df=5 --> We ignore the words that are repeated in the document >= 5 times. 
vect = TfidfVectorizer(min_df=5, decode_error='replace').fit(features_train)

# Here, we get less features than before. This will lead us to have a smaller sparse matrix.

In [15]:
len(vect.get_feature_names())

18694

- If you're using scikit-learn version <= 0.22.x, you will get an error:
    `Attributeerror: str' object has no attribute 'decode'` at line 7. 
- Updating the scikit-learn (version=0.24.1) would fix the problem.

In [16]:
import sklearn
sklearn.__version__

'0.24.1'

In [21]:
features_train_vectorized_tfidf = vect.transform(features_train)

# The Logistic Regression is now using the data that is generated by Tf-idf vectorizer.

model = LogisticRegression(max_iter=200)

model.fit(features_train_vectorized_tfidf, labels_train)

predictions = model.predict(vect.transform(features_test))

#### Calculating Model Accuracy

In [22]:
confusion_matrix(labels_test, predictions)

roc_auc_score(labels_test, predictions)

0.9022737119553056