In [1]:
import pandas as pd
import numpy as np

In [2]:
#Opening the csv file
df = pd.read_csv('movie_reviews_train.csv')

In [3]:
df.head()

Unnamed: 0,sentiment,review
0,1,"To an entire generation of filmgoers, it just ..."
1,1,Pixar classic is one of the best kids' movies ...
2,1,Apesar de representar um imenso avanço tecnoló...
3,1,"When Woody perks up in the opening scene, it's..."
4,1,Introduced not one but two indelible character...


In [4]:
df.sentiment.value_counts()

1    89658
0    62952
Name: sentiment, dtype: int64

In [5]:
df.loc[1000,'review']

'Poignant and pertinent, perhaps, but ultimately somewhat self-satisfied.'

In [6]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)',text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

# \W =matches any non-alphanumeric character;
# \D = matches any non-digit character
#<[^>]*> :all tag  ex..<br />, <a>
# [^>] :except '>'

In [8]:
#Testing the function preprocessor()
#t = input('Enter some text')
#preprocessor(t)

In [7]:
df['review'] = df['review'].apply(preprocessor)

In [10]:
print(df['review'])

0        this is one of those unfortunate films that su...
1        okay maybe it was because i happen to be in ya...
2        although i love this movie i can barely watch ...
3        a man arrives in a strange beautiful sterile c...
4        i m sitting around going through movie listing...
5        i really enjoyed this i got it thinking it was...
6        some funny lines are all what makes this movie...
7        context is everything when one goes to rate a ...
8        this is what happens when you try to adapt a p...
9        this is not horror as the first part was this ...
10       mr baseball was a fun video rental with my fia...
11       dr marnie bannister magda konopka is a horribl...
12       i m lucky enough to have a good quality copy o...
13       kurosawa really blew it on this one every geni...
14        extreme makeover home edition is yet another ...
15       this movie is a real gem the arc of the the pl...
16       little mosque is one of the most boring cbc co.

In [8]:
#Splitting the dataset into training and testing sets
X_train = df.loc[:2500,'review'].values
y_train = df.loc[:2500,'sentiment'].values
X_test = df.loc[2500:5000,'review'].values
y_test = df.loc[2500:5000,'sentiment'].values

In [14]:
#print(np.bincount(y_test))
#print(np.unique(y_test))
#print(np.bincount(y_train))

In [9]:
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [10]:
kf = KFold(n_splits=5,shuffle=True,random_state=10)
tfidf = TfidfVectorizer(stop_words='english')
params = {'clf__C':[1.0,10,11,15]}

In [11]:
pipe = Pipeline((['vect',tfidf],['clf',LogisticRegression()]))

In [12]:
grid = GridSearchCV(pipe,param_grid=params,cv=kf,verbose=0,scoring='accuracy')

In [13]:
#Checking the steps involved in Pipeline()
pipe.steps 

(['vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                  dtype=<class 'numpy.float64'>, encoding='utf-8',
                  input='content', lowercase=True, max_df=1.0, max_features=None,
                  min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                  smooth_idf=True, stop_words='english', strip_accents=None,
                  sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                  tokenizer=None, use_idf=True, vocabulary=None)],
 ['clf',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False)])

In [14]:
#Fitting GridSearchCV()
grid.fit(X_train,y_train)



GridSearchCV(cv=KFold(n_splits=5, random_state=10, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=(['vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ng

In [15]:
print('Best Parameters :{}'.format(grid.best_params_))
print('Best cross-validation score:{}'.format(grid.best_score_))

Best Parameters :{'clf__C': 10}
Best cross-validation score:0.7089164334266294


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [17]:
nb = Pipeline([('vect',tfidf),('clf',MultinomialNB())])

In [18]:
nb.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [19]:
nb.score(X_train,y_train)

0.913234706117553

In [20]:
nb.score(X_test,y_test)

0.6913234706117553

In [21]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [22]:
cv = CountVectorizer(stop_words='english')

In [23]:
new_data = cv.fit_transform(X_train)
new_test = cv.transform(X_test)

In [24]:
new_data.shape

(2501, 8030)

In [25]:
nb = MultinomialNB()
nb.fit(new_data,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
nb.score(new_test,y_test)

0.7157137145141943

In [27]:
nb.score(new_data,y_train)

0.9568172730907637