In [1]:
import pandas as pd
import numpy as np
import sklearn

In [4]:
# Load the labelled training reviews from the CSV in the working directory.
train_imdb_df = pd.read_csv('movie_review_train.csv')

In [6]:
# Peek at the first five rows of the raw training frame.
train_imdb_df.head(n=5)

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [9]:
# Class balance: number of training reviews per sentiment class.
train_imdb_df.loc[:, "class"].value_counts()

Neg    800
Pos    800
Name: class, dtype: int64

In [10]:
# Encode the sentiment class as a binary target: 'Neg' -> 0, 'Pos' -> 1.
train_imdb_df['label'] = train_imdb_df['class'].map({'Neg':0, 'Pos':1})

# Series.map leaves NaN for any value that is not a key of the dict, so a
# stray class spelling would otherwise surface much later as a cryptic error.
# Fail here, where the cause is obvious.
assert train_imdb_df['label'].notna().all(), \
    "train 'class' column contains values other than 'Neg'/'Pos'"

In [11]:
# Preview the frame again to confirm the new 'label' column.
train_imdb_df.head(n=5)

Unnamed: 0,class,text,label
0,Pos,a common complaint amongst film critics is ...,1
1,Pos,whew this film oozes energy the kind of b...,1
2,Pos,steven spielberg s amistad which is bas...,1
3,Pos,he has spent his entire life in an awful litt...,1
4,Pos,being that it is a foreign language film with...,1


In [12]:
# The string column 'class' is now redundant with the numeric 'label', so drop it.
# errors='ignore' makes this cell safe to re-run: without it, a second run
# raises KeyError because 'class' has already been removed from the frame.
train_imdb_df = train_imdb_df.drop('class', axis=1, errors='ignore')
train_imdb_df.head()

Unnamed: 0,text,label
0,a common complaint amongst film critics is ...,1
1,whew this film oozes energy the kind of b...,1
2,steven spielberg s amistad which is bas...,1
3,he has spent his entire life in an awful litt...,1
4,being that it is a foreign language film with...,1


In [21]:
# Separate the review text (model input) from the binary label (target),
# then print both shapes, one per line, as a quick sanity check.
X_train = train_imdb_df['text']
y_train = train_imdb_df['label']
print(X_train.shape, y_train.shape, sep="\n")

(1600,)
(1600,)


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer configured with scikit-learn's 'english' stop-word list.
vect1 = CountVectorizer(stop_words="english")

In [23]:
# Learn the vocabulary from the training reviews (fit returns the vectorizer itself).
vect1.fit(raw_documents=X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [24]:
# Number of distinct tokens in the vocabulary learned by vect1.
len(vect1.vocabulary_)

35858

In [25]:
# Variant of vect1 that additionally bounds document frequency:
# min_df=0.03 and max_df=0.8 (scikit-learn treats float values as proportions of documents).
vect2 = CountVectorizer(
    stop_words="english",
    min_df=0.03,
    max_df=0.8,
)

In [26]:
# Learn the frequency-filtered vocabulary from the training reviews.
vect2.fit(raw_documents=X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=0.03,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
# Number of distinct tokens kept by vect2 after the min_df / max_df filtering.
len(vect2.vocabulary_)

1643

In [29]:
# Plain CountVectorizer with all defaults: unlike vect1 / vect2, no stop-word
# list and no min_df / max_df arguments are passed. This is the vectorizer
# used to transform the train and test text further down.
vect3 = CountVectorizer()

In [31]:
# Learn the vocabulary from the training reviews, then report how many
# distinct tokens it contains.
vect3.fit(X_train)
len(vect3.vocabulary_)

36162

In [59]:
# Load the held-out test reviews and prepare them the same way as the training set.
test_imdb_df = pd.read_csv("movie_review_test.csv")

# Encode the sentiment class as a binary target: 'Neg' -> 0, 'Pos' -> 1.
test_imdb_df['label'] = test_imdb_df['class'].map({'Neg':0, 'Pos':1})

# Series.map leaves NaN for any value that is not a key of the dict; stop here
# with a clear message instead of evaluating against NaN labels later on.
assert test_imdb_df['label'].notna().all(), \
    "test 'class' column contains values other than 'Neg'/'Pos'"

# The string column 'class' is now redundant with 'label'.
test_imdb_df = test_imdb_df.drop('class', axis=1)
print(test_imdb_df.head())

# Separate the review text (model input) from the binary label (target).
X_test = test_imdb_df.text
y_test = test_imdb_df.label
print(X_test.shape)
print(y_test.shape)

# Turn both text columns into document-term count matrices. vect3 was fitted
# on X_train only, so the test set is encoded with the training vocabulary.
X_train_transformed = vect3.transform(X_train)
X_test_transformed = vect3.transform(X_test)
X_test_transformed

                                                text  label
0   films adapted from comic books have had plent...      1
1   every now and then a movie comes along from a...      1
2   you ve got mail works alot better than it des...      1
3      jaws   is a rare film that grabs your atte...      1
4   moviemaking is a lot like being the general m...      1
(400,)
(400,)


<400x36162 sparse matrix of type '<class 'numpy.int64'>'
	with 125914 stored elements in Compressed Sparse Row format>

In [60]:
# Show the concrete container type returned by vect3.transform for the test set.
print(type(X_test_transformed))
# Printing the matrix contents is left disabled:
#print(X_test_transformed)

<class 'scipy.sparse.csr.csr_matrix'>


In [61]:
# Build a DataFrame view of the test document-term matrix, one column per
# vocabulary token. Note: despite its name, `sparse_matrix` is a dense frame
# (toarray() materialises every cell).
#
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); use the new method when
# it exists and fall back to the old one on older installations.
if hasattr(vect3, "get_feature_names_out"):
    feature_names = vect3.get_feature_names_out()
else:
    feature_names = vect3.get_feature_names()

sparse_matrix = pd.DataFrame(X_test_transformed.toarray(),
             columns=feature_names)
#sparse_matrix

In [56]:
#np.sum(sparse_matrix)

In [64]:
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with default hyper-parameters, trained on the
# vectorized training reviews.
bnb = BernoulliNB()
bnb.fit(X_train_transformed, y_train)

# Hard class predictions and per-class probabilities for the test reviews.
y_pred_class = bnb.predict(X_test_transformed)
y_pred_proba = bnb.predict_proba(X_test_transformed)

# Share of test reviews whose predicted label matches the true label.
metrics.accuracy_score(y_test, y_pred_class)


0.785

In [65]:
# Echo the fitted estimator so its hyper-parameters appear in the output.
bnb

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [66]:
# Confusion matrix for the BernoulliNB predictions. y_pred_class is reused by
# the MultinomialNB cell below, so run this right after the BernoulliNB cell.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[184,  16],
       [ 70, 130]], dtype=int64)

In [70]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, trained on the
# vectorized training reviews. The sparse matrix X_train_transformed is passed
# as-is; the dense form would work as well:
# mnb.fit(X_train_transformed.toarray(), y_train)
mnb = MultinomialNB()
mnb.fit(X_train_transformed, y_train)

# Hard class predictions and per-class probabilities for the test reviews.
# These overwrite the y_pred_class / y_pred_proba set by the BernoulliNB cell.
y_pred_class = mnb.predict(X_test_transformed)
y_pred_proba = mnb.predict_proba(X_test_transformed)

# Share of test reviews whose predicted label matches the true label.
metrics.accuracy_score(y_test, y_pred_class)

0.8075

In [71]:
# Confusion matrix for the MultinomialNB predictions (y_pred_class was
# overwritten by the MultinomialNB cell above).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[164,  36],
       [ 41, 159]], dtype=int64)