In [33]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

import pickle

from nltk.stem.porter import PorterStemmer
import nltk
# Fetch the NLTK stop-word corpus; the recorded output shows it reports
# "already up-to-date" when the package is present locally.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop-word list.
# NOTE(review): `sw` is not referenced by any later cell visible here —
# confirm whether it was meant to be passed to the vectorizer.
sw = stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\Saloni
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Load the labelled reviews (columns: 'Unnamed: 0', 'review', 'sentiment').
# A forward slash is used as the separator: the previous backslash formed the
# invalid escape sequence "\m" inside a normal string literal and only resolved
# to a real path on Windows. Forward slashes work on Windows and POSIX alike.
df = pd.read_csv('Movie-Review-Sentiment-Analysis/movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [26]:
# Bag-of-words vectorizer built with default settings.
# NOTE(review): `count` is not used by any cell visible below — the features
# come from `tfidf` instead; confirm whether this cell can be removed.
count = CountVectorizer()

In [27]:
import re

# Patterns are raw strings (so "\W", "\)" and "\(" are not treated as invalid
# string escapes) and compiled once instead of on every call.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
_NON_WORD_RE = re.compile(r'[\W]+')


def preprocessor(text):
    """Normalise one raw review string for vectorisation.

    Steps:
      1. Remove HTML tags.
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         tag-free text (before lower-casing, so ``D``/``P`` must be capitals).
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons, space-separated and with the ``-`` nose
         removed, to the end of the cleaned text.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by any emoticons found.
    """
    text = _HTML_TAG_RE.sub('', text)
    emoticons = _EMOTICON_RE.findall(text)
    cleaned = _NON_WORD_RE.sub(' ', text.lower())

    if not emoticons:
        return cleaned

    # Guarantee a separator: when the text ends in a word character the
    # emoticons used to be glued onto that word ("world:)"), which produced a
    # bogus token once the string was split on whitespace.
    if cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + ' '.join(emoticons).replace('-', '')

In [28]:
# Run `preprocessor` over every review and overwrite the 'review' column with
# the cleaned text, so the raw text is no longer available in `df` afterwards.
# NOTE(review): re-running this cell feeds already-cleaned text back through
# `preprocessor`; reload `df` first if a clean re-run is needed.
df['review'] = df['review'].apply(preprocessor)

In [29]:
# Single Porter stemmer instance; `stemming` reads this module-level name.
porter = PorterStemmer()

In [30]:
def tokenizer(text):
    """Break ``text`` into tokens on runs of whitespace.

    Returns a list of the whitespace-separated pieces; an empty or
    whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

In [31]:
def stemming(text):
    """Split ``text`` on whitespace and stem each token.

    Every token is passed through the module-level ``porter`` stemmer and the
    stems are returned as a list in their original order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [32]:
# TF-IDF configuration. Lower-casing and the built-in preprocessor are turned
# off here; `preprocessor` (applied to df['review'] in an earlier cell) already
# lower-cases the text, and `stemming` is used as the tokenizer.
tfidf_options = dict(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    tokenizer=stemming,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)
tfidf = TfidfVectorizer(**tfidf_options)

# Labels and features for the whole data set.
# NOTE(review): the vectorizer is fitted on every row of df['review'], i.e.
# before any train/test split, so IDF statistics also see the rows that are
# later held out — confirm this is acceptable for the reported score.
y = df['sentiment'].values
x = tfidf.fit_transform(df['review'])

In [38]:
# Hold out half of the rows for evaluation. With shuffle=False the rows keep
# their original order: the first half is used for training and the second
# half for testing. Per the scikit-learn docs `random_state` only controls the
# shuffling, so it has no effect here; it is kept to leave the call unchanged.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    shuffle=False,
    random_state=1,
)

In [35]:
# Fit a logistic-regression classifier with built-in 5-fold cross-validation
# (scored by accuracy) on the training half. n_jobs=-1 runs the folds in
# parallel and verbose=3 prints the joblib progress lines.
clf = LogisticRegressionCV(cv=5,
                          scoring='accuracy',
                          random_state = 0,
                          n_jobs = -1,
                          verbose = 3,
                          max_iter = 300).fit(x_train, y_train)

# Persist the fitted classifier. The context manager closes the file even if
# pickle.dump raises, which the previous manual open()/close() pair did not.
# NOTE(review): only `clf` is written; the fitted `tfidf` vectorizer is not
# saved, so this file alone cannot score raw text — confirm that is intended.
with open('saved.sav', 'wb') as saved:
    pickle.dump(clf, saved)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.8min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.9min finished


In [36]:
filename = 'saved.sav'
# Reload the classifier written by the training cell. The file is opened in a
# `with` block so the handle is closed deterministically (the previous bare
# open() call was never closed).
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load model files that this notebook itself produced or that are otherwise
# trusted.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)

In [39]:
# Score the reloaded classifier on the held-out half and display the result
# as the cell output.
test_accuracy = saved_clf.score(x_test, y_test)
test_accuracy

0.89608