# Libraries 

In [1]:
import pandas as pd 
import numpy as np

# Importing data 

In [2]:
# Load the movie-review dataset from a CSV file in the working directory.
# NOTE(review): review text printed further down shows mojibake (e.g. 'clichÃ©s'),
# so the file may have been written with a mismatched encoding upstream — confirm.
df = pd.read_csv('movie_data.csv')

# Checking Data

In [3]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

Unnamed: 0,review,sentiment
0,This movie is just crap. Even though the direc...,0
1,Another detailed work on the subject by Dr Dwi...,1
2,THE CAT O'NINE TAILS (Il Gatto a Nove Code) <b...,0
3,"Like with any movie genre, there are good gang...",0
4,I watched it with my mom and we were like...<b...,0
5,This movie is probably one of 3 worst movies m...,0
6,"this movie is quite bad, aggressive, not playe...",0
7,And a perfect film to watch during the holiday...,1
8,"I like Noel Coward, the wit. I like Noel Cowar...",0
9,"""The Days"" is a typical family drama with a li...",1


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(49969, 2)

In [5]:
# Data type of each column.
df.dtypes

review       object
sentiment     int64
dtype: object

In [6]:
# First look: full text of the first review.
# Explicit label-based lookup with .loc avoids the ambiguity of integer keys
# passed to Series.__getitem__ and matches how later cells read this value.
df.loc[0, 'review']



# Bag of Words 

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Bag-of-words vectorizer with default settings.
count = CountVectorizer()

In [9]:
# Toy corpus of three short sentences used to illustrate the bag-of-words model
sentences = [
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
]
docs = np.array(sentences)

In [10]:
# Learn the vocabulary from the toy corpus and build its term-count matrix.
bag = count.fit_transform(docs)

In [11]:
# Mapping from each learned term to its column index in the count matrix.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [12]:
# Dense view of the counts: one row per document, one column per vocabulary term.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [13]:
from sklearn.feature_extraction.text import TfidfTransformer 

In [14]:
# TF-IDF demo on the toy corpus: term counts are re-weighted with smoothed
# inverse document frequency and every row is L2-normalised.
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True, norm='l2')
term_counts = count.fit_transform(docs)
weighted = tfidf.fit_transform(term_counts)
print(weighted.toarray())

[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]


# Data Preparation 

In [15]:
# Last 500 characters of the first review before cleaning (HTML tags still present).
df.loc[0, 'review'][-500:]

" clichÃ©s, doesn't tell anything new or provocative and (-that's the sad thing about this movie) it's far from any Oi!-Reality.<br /><br />If you wanna see weird but great German films, watch the movies of Michael Haneke, Christoph Schlingensief, Oskar Roehler, Hans Weingartner or Oliver Hirschbiegel:<br /><br />Benny's Video Funny Games Die UnberÃ¼hrbare Mein Letzter Film Das Experiment Das Weisse Rauschen MuxmÃ¤uschenstill ...<br /><br />*** out of ten, because of the topic and the photography"

In [16]:
# Strip HTML tags and punctuation, lowercase the text, and keep emoticons by appending them at the end

import re 
def preprocessor(text):
    """Clean one raw review string for bag-of-words modelling.

    Steps:
      1. Remove HTML tags.
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` nose removed) to the
         end, separated from the text by a space.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text followed by any emoticons that were found.
    """
    # NOTE: tags are dropped without inserting a space, so 'a<br />b' becomes 'ab'.
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons are searched before lowercasing so that ':D' / ':P' keep their case.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued to the last word
    # whenever the cleaned text does not already end in a space.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + suffix

In [17]:
# Same excerpt after cleaning: tags and punctuation removed, text lowercased.
preprocessor(df.loc[0, 'review'][-500:])

' clichã s doesn t tell anything new or provocative and that s the sad thing about this movie it s far from any oi reality if you wanna see weird but great german films watch the movies of michael haneke christoph schlingensief oskar roehler hans weingartner or oliver hirschbiegel benny s video funny games die unberã¼hrbare mein letzter film das experiment das weisse rauschen muxmã uschenstill out of ten because of the topic and the photography'

In [18]:
# Sanity check: the markup is dropped and the emoticons are moved to the end


preprocessor("</a>This :) is a :( test :-)!")

'this is a test :) :( :)'

In [19]:
# Clean every review; this overwrites the raw text in df['review'] in place.
df['review']= df['review'].apply(preprocessor)

# Tokenization of documents 

In [20]:
from nltk.stem.porter  import PorterStemmer

In [21]:
# Shared Porter stemmer instance, used by tokenizer_porter below.
porter = PorterStemmer()

In [22]:
#defining a split function

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

In [23]:
#split function with porter stem 

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every token."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [24]:
# Plain whitespace split: the words are returned exactly as written

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [25]:
# Same sentence with every token reduced to its Porter stem.
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [26]:
import nltk
# Make sure NLTK's stop-word corpus is available locally (downloads it if missing).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rafaelmello/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from nltk.corpus import stopwords

# English stop words from the NLTK corpus.
stop = stopwords.words('english')

In [28]:
# Removing stopwords
#
# Filter on the raw tokens and stem afterwards: the stop list holds unstemmed
# words, so stemming first lets a stop word slip past the membership test
# (e.g. 'this' stems to 'thi', which is not in the list).
[porter.stem(w) for w in tokenizer('a runner like running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

# Transform Text Data into TF-IDF Vectors

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [30]:
# Vectorizer for the already-cleaned reviews: casing, accents and preprocessing
# are left untouched here, and tokenisation plus stemming is delegated to
# tokenizer_porter. Weights use smoothed IDF with L2 row normalisation.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer_porter,
    preprocessor=None,
    lowercase=False,
    strip_accents=None,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)

In [31]:
# Labels as a NumPy array; features as the TF-IDF matrix of the cleaned reviews.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split made further down, so the IDF weights also reflect held-out documents —
# consider fitting on the training portion only.
y = df.sentiment.values
X = tfidf.fit_transform(df.review)

# Document Classification Using Logistic Regression 

In [32]:
from sklearn.model_selection import train_test_split 

In [35]:
# Hold out the second half of the rows for testing. With shuffle=False the split
# is a plain positional cut, so random_state would be ignored and is omitted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=False)

In [37]:
import pickle 
from sklearn.linear_model import LogisticRegressionCV

# Fit a logistic regression whose regularisation strength is selected by
# 5-fold cross-validation on accuracy, using all available cores.
clf = LogisticRegressionCV(cv=5,
                          scoring='accuracy',
                          random_state=0,
                          n_jobs=-1,
                          verbose=3,
                          max_iter=300).fit(X_train, y_train)

# Persist the fitted model. The context manager closes the file even if
# pickling fails part-way, which the manual open()/close() pair did not.
with open('saved_model.sav', 'wb') as saved_model:
    pickle.dump(clf, saved_model)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   41.8s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   43.2s finished


# Model Evaluation 

In [38]:
# Reload the model written by the training cell.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
filename = 'saved_model.sav'
with open(filename, 'rb') as model_file:  # closes the handle after loading
    saved_clf = pickle.load(model_file)

In [39]:
# Accuracy of the reloaded model on the held-out half of the data.
saved_clf.score(X_test, y_test)

0.8969381628977386