In [66]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

import nltk
from nltk.stem.porter import PorterStemmer

In [6]:
df = pd.read_csv("movie_data.csv")

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


# Bag of words/Bag of N-grams model

In [17]:
# for understanding purpose

docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'THe sun is shining, the weather is sweet, and one and one is two']) 

In [18]:
count = CountVectorizer()

bag = count.fit_transform(docs)

In [21]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [24]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


# Term Frequency-inverse frequency

In [27]:

#for precision
np.set_printoptions(precision=2)

#fitting 
tfidf = TfidfTransformer(use_idf = True, norm='l2', smooth_idf = True)
fitted = tfidf.fit_transform(bag)


In [28]:
print(fitted.toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


# Data Preparation

In [35]:
df.loc[1020, 'review'][-50:]

"s'. If you liked those, you'll probably like this."

In [42]:
df.loc[897, 'review'][-50:]

'm. or go watch boondock saints, it is MUCH better.'

In [39]:
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [47]:
#Writing a preprocessor funciton

import re
def prep(text):
    text = re.sub('<[^>]*>','', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-','')
    return text

In [48]:
prep(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [49]:
# Applying to out dataset

df['review'] = df['review'].apply(prep)

# Tokenization of documents

In [51]:
porter = PorterStemmer()

In [52]:
def tokenizer(text):
    return text.split()

In [55]:
def tokenizer_porter(text):  #tokenizer_stem
    return [porter.stem(word) for word in text.split()]

In [56]:
tokenizer_porter("runners like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [58]:
# downloading the nltk stopwords to protect some words

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Robin
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [59]:
from nltk.corpus import stopwords

stop = stopwords.words('english') # using the words of english
[w for w in tokenizer_porter('a runner like running and thus they runs lots') if w not in stop]

['runner', 'like', 'run', 'thu', 'run', 'lot']

# Trasnsform Text data into TF-IDF Vectors

In [63]:
tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None,
                       tokenizer=tokenizer_porter,
                       use_idf=True,
                       norm='l2',
                       smooth_idf=True)

y = df.sentiment.values
x = tfidf.fit_transform(df.review)

# Document classification using Logistic Regression

In [65]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1,
                                                    test_size=0.5,
                                                   shuffle=False)

In [67]:
import pickle

In [70]:
clf = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           max_iter=300,
                           n_jobs=-1,
                           verbose=3,
                           random_state=0).fit(x_train,y_train)

saved_model = open('saved_model.sav','wb')
pickle.dump(clf, saved_model)
saved_model.close()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.8min remaining:  4.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.0min finished


# Model Evaluation

In [73]:
filename = 'saved_model.sav'
saved_clf = pickle.load(open(filename, 'rb')) # rb - read bytes

In [75]:
saved_clf.score(x_test, y_test)

0.89608

# Checking the custom input

In [81]:
import string

def analyze(text):
    # Prepare the input by removing punctuation characters, converting
    # characters to lower case, and removing words containing numbers
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    text = text.lower().split(' ')
    text = [word for word in text if word.isalpha()]

    input = [1]
    for word in text:
            input.append(word)
    padded_input = sequence.pad_sequences([input], maxlen=max_review_length)

    # Invoke the model and return the result
    result = clf.predict(np.array([padded_input][0]))[0][0]
    return result

In [82]:
analyze("you")

NameError: name 'sequence' is not defined