In [93]:
import pandas as pd
import numpy as np

In [94]:
# Load the review dataset; the path is relative to the notebook's working directory.
movie_df = pd.read_csv('imdb.csv')

In [95]:
# Peek at the first rows to confirm the file parsed into the expected columns.
movie_df.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [96]:
# Summarise the frame: column labels, a divider line, then (rows, columns).
divider = '***********************************'
print('Colums : ', movie_df.columns)
print(divider)
print('Shape : {}'.format(movie_df.shape))

Colums :  Index(['review', 'sentiment'], dtype='object')
***********************************
Shape : (50000, 2)


# Cleaning text:

In [97]:
import re

In [98]:
# Let's define a function to preprocess the text
def preprocess_doc(text):
    """Clean one raw review so it can be tokenized on whitespace.

    Steps, in order:
      1. Replace HTML-like tags (``<...>``) with a single space.
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` before
         punctuation is stripped (they keep their original case).
      3. Lowercase the text and collapse every run of non-word characters
         into one space.
      4. Append the collected emoticons, space-separated, after the text.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    str
        The cleaned text. When no emoticon is found the result is just the
        lowercased, punctuation-free text (which may end with a space).

    Raises
    ------
    AssertionError
        If ``text`` is not a ``str``.
    """
    assert isinstance(text, str), "preprocess_doc expects a str"
    # A tag may contain neither '<' nor '>'; this stops a stray '>' later in the
    # sentence from extending the match and swallowing real words.
    text = re.sub(r'<[^<>]*>', " ", text)
    # Raw string: the escaped parentheses are regex escapes, not Python escapes.
    pat = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'  # eyes, optional nose, mouth
    emtos = re.findall(pat, text)
    text = re.sub(r'[\W]+', " ", text.lower())  # non-word runs -> one space
    if emtos:
        # Separate the emoticons from the last word explicitly; plain
        # concatenation glued them together when the text ended in a word.
        text = text.rstrip() + ' ' + ' '.join(emtos)
    return text

In [99]:
# Use the last 50 characters of one review as a small sample for the cleaner.
sample_review = movie_df.loc[50, 'review']
txt = sample_review[-50:]
print(txt)

n on screen before. Give them a break!<br /><br />


In [100]:
# Run the sample through the cleaner and show the result.
txt = preprocess_doc(txt)
print(txt)

n on screen before give them a break 


In [101]:
# Check that markup is stripped while emoticons are kept and moved to the end.
demo_text = "</a>This :) is :( a test :-)!"
print(preprocess_doc(demo_text))

 this is a test :) :( :-)


# Tokenizing function:



In [102]:
from nltk.stem.porter import PorterStemmer
porter_stem = PorterStemmer()


def tokenize(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(porter_stem.stem(word=word))
    return stems
        

In [103]:
# Stemming demo on a single sentence.
arr = np.array(['runners like running and thus they run'])
sample_sentence = arr[0]
print(tokenize(sample_sentence))

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']


In [105]:
# Clean every review; the cleaned text overwrites the raw 'review' column.
cleaned_reviews = movie_df['review'].apply(preprocess_doc)
movie_df['review'] = cleaned_reviews

# Stopwords removal:

In [106]:
import nltk
# Fetch the NLTK stop-word corpus; NLTK reports "already up-to-date" when it is present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sravanth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [107]:
from nltk.corpus import stopwords
# English stop-word list.
# NOTE(review): none of the cells shown pass `stop` to the vectorizer (it is built
# with stop_words=None) or to tokenize, so stop words are not actually removed.
stop = stopwords.words('english')

# Data Splitting:

In [108]:
from sklearn.model_selection import train_test_split as tts
# Hold out 20% of the reviews for testing. Stratifying on the label keeps the class
# balance the same in both splits; random_state pins the shuffle so the split (and
# any accuracy computed from it) is reproducible after a kernel restart.
X_train,X_test,y_train,y_test = tts(movie_df['review'],movie_df['sentiment'],
                                    test_size=0.2,stratify = movie_df['sentiment'],
                                    random_state=0)

In [109]:
# Sanity check: the training split should hold 80% of the rows.
X_train.shape

(40000,)

In [110]:
# Sanity check: there must be exactly one label per training review.
y_train.shape

(40000,)

# Pipelining:

In [111]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [115]:
# TF-IDF features over single stemmed tokens. Lowercasing is switched off here because
# preprocess_doc already lowercases the text; splitting and stemming are delegated
# to tokenize.
vectorizer_options = dict(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    stop_words=None,
    tokenizer=tokenize,
    ngram_range=(1, 1),
)
tfidf = TfidfVectorizer(**vectorizer_options)

In [116]:
# L2-penalised logistic regression with a fixed seed for the liblinear solver.
Log_reg = LogisticRegression(
    penalty='l2',
    C=10.0,
    solver='liblinear',
    random_state=0,
)

In [117]:
# Chain vectorisation and classification so both are fitted together on raw text.
pipeline_steps = [
    ('vect', tfidf),
    ('clf', Log_reg),
]
Model_Pipe = Pipeline(pipeline_steps)

In [118]:
# Fit the vectorizer and the classifier on the training split only.
Model_Pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...\\w+\\b',
                                 tokenizer=<function tokenize at 0x000001A47D4E7950>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=10.0, class_weight=None, dual=False,
                                    fit_in

In [121]:
from sklearn.metrics import accuracy_score
# Predict labels for both splits so train and test accuracy can be compared.
train_pred, test_pred = [Model_Pipe.predict(features) for features in (X_train, X_test)]

In [122]:
# accuracy_score is documented as accuracy_score(y_true, y_pred). Accuracy is symmetric,
# so the printed numbers do not change, but keeping the documented order prevents a
# silent error if this is later swapped for an asymmetric metric (precision, recall, ...).
print("Train accuracy : {}".format(accuracy_score(y_train, train_pred)))
print("Test accuracy : {}".format(accuracy_score(y_test, test_pred)))

Train accuracy : 0.973875
Test accuracy : 0.8994


* This model has low bias and somewhat high variance: training accuracy is noticeably higher than test accuracy.
* We could tune the model with grid search, but I am not going to do that here because it is a very time-consuming process.

# List of Parameters we can choose in grid_search :

In [135]:
# Show every pipeline parameter that belongs to the vectorizer or the classifier step.
Get_param = Model_Pipe.get_params()
step_param = re.compile('clf_.*|vect_.*')
for name, value in Get_param.items():
    if step_param.match(name):
        print(name, ':', value)

vect__analyzer : word
vect__binary : False
vect__decode_error : strict
vect__dtype : <class 'numpy.float64'>
vect__encoding : utf-8
vect__input : content
vect__lowercase : False
vect__max_df : 1.0
vect__max_features : None
vect__min_df : 1
vect__ngram_range : (1, 1)
vect__norm : l2
vect__preprocessor : None
vect__smooth_idf : True
vect__stop_words : None
vect__strip_accents : None
vect__sublinear_tf : False
vect__token_pattern : (?u)\b\w\w+\b
vect__tokenizer : <function tokenize at 0x000001A47D4E7950>
vect__use_idf : True
vect__vocabulary : None
clf__C : 10.0
clf__class_weight : None
clf__dual : False
clf__fit_intercept : True
clf__intercept_scaling : 1
clf__l1_ratio : None
clf__max_iter : 100
clf__multi_class : warn
clf__n_jobs : None
clf__penalty : l2
clf__random_state : 0
clf__solver : liblinear
clf__tol : 0.0001
clf__verbose : 0
clf__warm_start : False


* These are the hyperparameters that can be tuned.