In [24]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read the labelled training reviews from the CSV in the working directory.
train_csv_path = 'movie_review_train.csv'
train_data = pd.read_csv(train_csv_path)
# Preview the first five rows to check the `class` / `text` layout.
train_data.head(n=5)

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [3]:
# (rows, columns) of the training frame.
train_data.shape

(1600, 2)

In [4]:
# Print each column's dtype and non-null count.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   1600 non-null   object
 1   text    1600 non-null   object
dtypes: object(2)
memory usage: 25.1+ KB


In [5]:
# Encode the sentiment labels as integers (Neg -> 0, Pos -> 1).
# The dtype guard keeps this cell idempotent: re-running a plain `.map` on the
# already-encoded column would match no dict key and silently turn every label
# into NaN.
if not pd.api.types.is_integer_dtype(train_data['class']):
    train_labels_encoded = train_data['class'].map({'Neg': 0, 'Pos': 1})
    # `.map` yields NaN for any value missing from the dict; fail loudly here
    # instead of handing NaN targets to the classifier later on.
    if train_labels_encoded.isna().any():
        raise ValueError("train_data['class'] contains labels other than 'Neg'/'Pos'")
    train_data['class'] = train_labels_encoded

In [6]:
# Split the frame into raw review text (features) and encoded labels (targets),
# each copied out as a NumPy array.
X_train, y_train = (np.array(train_data[column]) for column in ('text', 'class'))

**Q1)** What is the size of vocabulary after removing the stop words? Note that the vocabulary size depends only on the training set.

In [7]:
# Learn the vocabulary from the training reviews only, dropping English stop words.
# `fit` returns the vectorizer itself, so `voc` and `cv` name the same object.
voc = cv = CountVectorizer(stop_words='english').fit(X_train)
vocabulary_size = len(cv.vocabulary_)
vocabulary_size

35858

**Q2)** Suppose we don't want to consider the (rare) words that appear in fewer than 3% of the documents, or the (extremely common) words that appear in more than 80% of the documents.

Use CountVectorizer(stop_words='english', min_df=.03, max_df=.8) to create a new vocabulary from the training set. What is the size of the new vocabulary?

In [8]:
# Rebuild the vocabulary, additionally discarding terms whose document frequency
# is below 3% (min_df) or above 80% (max_df) of the training reviews.
# `fit` returns the vectorizer itself, so `voc` and `cv` name the same object.
voc = cv = CountVectorizer(stop_words='english', min_df=0.03, max_df=0.8).fit(X_train)
vocabulary_size = len(cv.vocabulary_)
vocabulary_size

1643

**Q3)** Suppose we build the vocabulary from the training data using CountVectorizer(stop_words='english', min_df=.03, max_df=.8) and then transform the test data with that same fitted vectorizer (i.e. call `transform` on the test data, without re-fitting). How many nonzero entries are there in the sparse matrix (corresponding to the test data)?

In [9]:
# Read the labelled test reviews from the CSV in the working directory.
test_csv_path = 'movie_review_test.csv'
test_data = pd.read_csv(test_csv_path)
# Preview the first five rows to check the `class` / `text` layout.
test_data.head(n=5)

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...


In [10]:
# Encode the sentiment labels as integers (Neg -> 0, Pos -> 1).
# The dtype guard keeps this cell idempotent: re-running a plain `.map` on the
# already-encoded column would match no dict key and silently turn every label
# into NaN.
if not pd.api.types.is_integer_dtype(test_data['class']):
    test_labels_encoded = test_data['class'].map({'Neg': 0, 'Pos': 1})
    # `.map` yields NaN for any value missing from the dict; fail loudly here
    # instead of scoring predictions against NaN targets later on.
    if test_labels_encoded.isna().any():
        raise ValueError("test_data['class'] contains labels other than 'Neg'/'Pos'")
    test_data['class'] = test_labels_encoded

In [11]:
# Split the frame into raw review text (features) and encoded labels (targets),
# each copied out as a NumPy array.
X_test, y_test = (np.array(test_data[column]) for column in ('text', 'class'))

In [12]:
# Vectorize the test reviews with the already-fitted `cv` (transform only, no
# re-fit), so the columns follow the vocabulary learned from the training set.
X_test_transform = cv.transform(X_test)

In [14]:
# Count the non-zero entries directly on the sparse matrix. Densifying first with
# `.toarray()` would allocate a full documents x vocabulary array just to count it.
X_test_transform.count_nonzero()

51663

**Q4)** Train a Bernoulli Naive Bayes model on the training set and predict the classes of the test set. Each movie review in the test set has been labelled as 'Pos' or 'Neg'. What is the accuracy of the model?

In [16]:
# Vectorize the training reviews with the same fitted `cv` that produced
# `X_test_transform`, so both matrices share one vocabulary.
X_train_transform = cv.transform(X_train)

In [17]:
# Train a Bernoulli Naive Bayes classifier on the vectorized training reviews.
# `fit` returns the estimator, so construction and training can be chained.
bnb = BernoulliNB().fit(X_train_transform, y_train)
bnb

In [20]:
# Predict a label for every vectorized test review with the trained model.
y_test_pred = bnb.predict(X_test_transform)

In [23]:
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.79

**Q5)** From the confusion matrix, how many reviews are actually negative but have been classified as positive by the model?

In [25]:
# Rows are the true labels and columns the predicted labels, both in sorted label
# order (0 = Neg, 1 = Pos). `confusion[0, 1]` therefore counts reviews that are
# actually negative but were predicted positive.
confusion = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
confusion

array([[177,  23],
       [ 61, 139]], dtype=int64)