In [2]:
import pandas as pd
import os 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

#nltk.download('stopwords')
from nltk.corpus import stopwords

In [3]:
basepath = 'aclImdb'

In [4]:
labels = {'pos': 1, 'neg': 0}

In [5]:
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df = df.rename(columns={'0': 'review', '1': 'sentiment'})

In [6]:
print(df.shape)
df.head(3)

(50000, 2)


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [7]:
'''
An example showing how CountVectorizer() creates the bag of words model for
text data
'''
count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet, and one and one is two'])

bag = count.fit_transform(docs)

'''
By calling fit_transform() method, we constructed the vocabulary of the bag
of words model and transformed the following 3 sentences into sparse feature
vectors
'''

'\nBy calling fit_transform() method, we constructed the vocabulary of the bag\nof words model and transformed the following 3 sentences into sparse feature\nvectors\n'

In [8]:
print(count.vocabulary_)
print(bag.toarray())
'''
Indices in the matrix correspond to the value that corresponds to a key in
the vocabulary.
Eg: first index corresponds to 'and' as it has value = 0 in vocabulary
'''

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


"\nIndices in the matrix correspond to the value that corresponds to a key in\nthe vocabulary.\nEg: first index corresponds to 'and' as it has value = 0 in vocabulary\n"

In [9]:
'''
Assessing term frequency-inverse document frequency
[Previously, only term frequency used]

IMPORTANT: tf-idf evaluates the importance of a word in a document 
relative to a collection of documents

Read page 252 of S Raschka book (new one) to understand the concept
'''
tfidf = TfidfTransformer(use_idf=True,
                         smooth_idf=True, #adding 1 to denominator of formula of idf
                         norm='l2')

'''
Sum of squares of vector elements is 1. The cosine similarity between 
two vectors is their dot product when l2 norm has been applied.
'''
tf_arr = count.fit_transform(docs).toarray()
res = tfidf.fit_transform(count.fit_transform(docs)).toarray()
np.set_printoptions(precision=2)
print(f'{tf_arr}\n\n{res}\n')

sum_of_square = [sum(i**2 for i in row) for row in res]
print(sum_of_square)

np.set_printoptions(threshold=None)

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]

[1.0000000000000004, 1.0000000000000004, 1.0]


<b>In the above output, we see that for third vector (third sentence), 'is' is most frequently present (3 times), however, it has a relatively small tf-idf. This is because since it is also present in the first and second document, it is unlikely to contain any useful discriminatory information</b>

In [10]:
'''
Cleaning text data
We see that this contains HTML tags and other things that might not be useful
for sentiment analysis. So we have to remove them
'''
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [11]:
re?

[1;31mType:[0m        module
[1;31mString form:[0m <module 're' from 'c:\\Anaconda\\envs\\ML\\Lib\\re\\__init__.py'>
[1;31mFile:[0m        c:\anaconda\envs\ml\lib\re\__init__.py
[1;31mDocstring:[0m  
Support for regular expressions (RE).

This module provides regular expression matching operations similar to
those found in Perl.  It supports both 8-bit and Unicode strings; both
the pattern and the strings being processed can contain null bytes and
characters outside the US ASCII range.

Regular expressions can contain both special and ordinary characters.
Most ordinary characters, like "A", "a", or "0", are the simplest
regular expressions; they simply match themselves.  You can
concatenate ordinary characters, so last matches the string 'last'.

The special characters are:
    "."      Matches any character except a newline.
    "^"      Matches the start of the string.
    "$"      Matches the end of the string or just before the newline at
             the end of the string.

In [12]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) #removes any HTML tag
    #finding emoticons as these are essential to sentiment analysis
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    #[\W]+ denotes non-alphanumeric characters 
    text = (re.sub('[\W]+', ' ', text.lower()) + 
            ' '.join(emoticons).replace('-', ''))
    
    return text

In [13]:
#checking the preprocessor
print(df.loc[0, 'review'][-50:])
print(preprocessor(df.loc[0, 'review'][-50:]))
print(preprocessor('</a>This :) is :( a test :-)!'))

is seven.<br /><br />Title (Brazil): Not Available
is seven title brazil not available
this is a test :) :( :)


In [14]:
df['review'] = df['review'].apply(preprocessor)

In [16]:
#processing documents into tokens
def tokenizer(text):
    return text.split()

print(tokenizer('runners like running and thus they run'))

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']


In [17]:
'''
word stemming - related to tokenization
It is the process of transforming a word into its root form. Allows us to map
related words to the same stem. Developed by Martin F. Porter and is known as
Porter stemmer algorithm
'''
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

print(tokenizer_porter('runners like running and thus they run'))

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']


In [18]:
stop = stopwords.words('english')
len(stop)

179

In [19]:
#example
[w for w in tokenizer_porter('a runner likes running and runs a lot')
                             if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [20]:
#now classifying using 25000 training and 25000 test examples
x_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
x_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [21]:
#TfidfVectorizer combines CountVectorizer and TfidfTransformer
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None)

docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet, and one and one is two'])

arr = tfidf.fit_transform(docs).toarray()

print(tfidf.vocabulary_)
print(arr)

#same as using CountVectorizer and TfidfTransformer sequentially

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [22]:
#using GridSearchCV to find optimal params for logistic regression model
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

small_param_grid = [
    {
        #using only unigrams
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        #checking both tokenizer and tokenizer_porter (word stemming) cases 
        'lr__penalty': ['l2'],
        'lr__C': [1.0, 10.0]
    },
    {
        'vect__ngram_range': [(1, 1)],
        #removing stop words 
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf': [False],
        'vect__smooth_idf': [False],
        'vect__norm': [None],
        #user_idf = False, smooth_idf = False and norm = None implies that 
        #we are training model based on raw term frequencies (tf)
        'lr__penalty': ['l2'],
        'lr__C': [1.0, 10.0]
    }
]

lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('lr', LogisticRegression(solver='liblinear'))
])

gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
                           param_grid=small_param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

gs_lr_tfidf.fit(x_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




In [23]:
print(gs_lr_tfidf.best_params_)

{'lr__C': 10.0, 'lr__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x000001E2E91DF740>}


In [24]:
print(f'CV Accuracy: {gs_lr_tfidf.best_score_}')

CV Accuracy: 0.8970842631473704


In [25]:
clf = gs_lr_tfidf.best_estimator_
print(f'Test Accuracy: {clf.score(x_test, y_test)}')

Test Accuracy: 0.89876
