### Project: Sentiments analysis on movie reviews

In [1]:
import pandas as pd
import numpy as np

In [2]:
#stop = stopwords.words('english')

In [3]:
df_new = pd.read_csv('movie_reviews.csv')

In [4]:
df_new.head()

Unnamed: 0,review,sentiment
0,This is one of those unfortunate films that su...,1
1,Okay maybe it was because I happen to be in Ya...,1
2,"Although I love this movie, I can barely watch...",1
3,"A man arrives in a strange, beautiful, sterile...",1
4,I'm sitting around going through movie listing...,1


In [5]:
df_new.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [6]:
#df_new.sentiment[:100]

In [7]:
df_new.loc[1000,'review']

"I rented this movie last week. I saw Kevin Spacey and Morgan Freeman were on it, so it seemed promising. And it was, until Justin Timberlake came on scene. He is a really bad actor and shouldn't be allowed to make a movie ever again. I mean, he is one of the most boring, uninspired actors I've ever seen. He puts absolutely no emotion to any of his lines whatsoever. Why the hell was he cast for the role of Josh Pollack? I think Matt Damon would have been a better choice.<br /><br />Kevin Spacey was another big disappointment. His character is so dull, it seems like a bad mix of his character in American Beauty and John Doe in Se7en. It might sound cool, but believe me, it's not.<br /><br />Now, Dylan McDermott's acting is very good. It's about one of the very few good things about this movie. He is just inspired.<br /><br />Morgan Freeman is good but nothing special. He has some really cool lines though.<br /><br />About the story, although it was a bit obvious and exaggerated at times

In [8]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)',text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

# \W =matches any non-alphanumeric character;
# \D = matches any non-digit character
#<[^>]*> :all tag  ex..<br />, <a>
# [^>] :except '>' 

In [9]:
preprocessor("</a>This is a test :-)!</a>")

'this is a test :'

In [10]:
df_new['review'] = df_new['review'].apply(preprocessor)

In [11]:
X_train = df_new.loc[:2500, 'review'].values
y_train = df_new.loc[:2500, 'sentiment'].values
X_test = df_new.loc[2500:5000, 'review'].values
y_test = df_new.loc[2500:5000, 'sentiment'].values

In [12]:
print (np.bincount(y_test))
print (np.unique(y_test))

print (np.bincount(y_train))

[1220 1281]
[0 1]
[1282 1219]


In [13]:
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [14]:
tfidf = TfidfVectorizer(stop_words='english')

param_grid = {'clf__C': [1.0, 10.0, 100.0]}

In [15]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression())])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',)

#lr_tfidf.steps

In [16]:
lr_tfidf.steps

[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words='english', strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('clf',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

In [17]:
gs_lr_tfidf.fit(X_train, y_train)
#print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
#print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__C': [1.0, 10.0, 100.0]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [18]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 10.0} 
CV Accuracy: 0.837


In [19]:
clf = gs_lr_tfidf.best_estimator_

In [20]:
clf.score(X_test,y_test)

0.8548580567772891

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [22]:
nb = Pipeline([('vect', tfidf),
               ('clf', MultinomialNB())])


In [23]:
nb.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [24]:
nb.score(X_train,y_train)

0.9540183926429429

In [25]:
nb.score(X_test,y_test)

0.7896841263494602

In [26]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [27]:
cv = CountVectorizer(stop_words='english')
#cv = TfidfVectorizer(stop_words='english')

In [28]:
new_data = cv.fit_transform(X_train)
new_test = cv.transform(X_test)

In [29]:
new_data.shape

(2501, 28031)

In [30]:
nb = MultinomialNB()
nb.fit(new_data,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
nb.score(new_test,y_test)

0.8208716513394642

In [32]:
nb.score(new_data,y_train)

0.9684126349460216