In [21]:
import numpy as np
import pandas as pd
import nltk
import string

from bs4 import BeautifulSoup
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib


In [3]:
df = pd. read_csv('test.csv',header=None, delimiter=',', skiprows=1, names=['text','label'])
df.shape

(5000, 2)

In [4]:
ds = df[["text"]]
df.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [5]:
ds["lower"] = ds["text"].str.lower()
ds.head()

Unnamed: 0,text,lower
0,I always wrote this series off as being a comp...,i always wrote this series off as being a comp...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...
2,This movie was so poorly written and directed ...,this movie was so poorly written and directed ...
3,The most interesting thing about Miryang (Secr...,the most interesting thing about miryang (secr...
4,"when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did..."


In [6]:
def remove_html_tags(text):
    return BeautifulSoup(text, "lxml").text

ds["no_tags"] = ds["lower"].apply(lambda text: remove_html_tags(text))
ds.head()

Unnamed: 0,text,lower,no_tags
0,I always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...
2,This movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...
3,The most interesting thing about Miryang (Secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang (secr...
4,"when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did..."


In [7]:
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    return text.translate(str.maketrans(' ', ' ', PUNCT_TO_REMOVE))

ds["no_punct"] = ds["no_tags"].apply(lambda text: remove_punctuation(text))
ds.head()

Unnamed: 0,text,lower,no_tags,no_punct
0,I always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 1272002 3 out of 10dirsteve purce...
2,This movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...
3,The most interesting thing about Miryang (Secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang secre...
4,"when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...",when i first read about berlin am meer i didnt...


In [8]:
def remove_numbers(text):
    
    return " ".join([line for line in str(text).split() if not line.isdigit()])

ds["no_num"] = ds["no_punct"].apply(lambda text: remove_numbers(text))
ds.head()

Unnamed: 0,text,lower,no_tags,no_punct,no_num
0,I always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 1272002 3 out of 10dirsteve purce...,1st watched out of 10dirsteve purcell typical ...
2,This movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...
3,The most interesting thing about Miryang (Secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang secre...,the most interesting thing about miryang secre...
4,"when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...",when i first read about berlin am meer i didnt...,when i first read about berlin am meer i didnt...


In [9]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [10]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

ds["no_stopwords"] = ds["no_num"].apply(lambda text: remove_stopwords(text))
ds.head()

Unnamed: 0,text,lower,no_tags,no_punct,no_num,no_stopwords
0,I always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,always wrote series complete stinkfest jim bel...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 1272002 3 out of 10dirsteve purce...,1st watched out of 10dirsteve purcell typical ...,1st watched 10dirsteve purcell typical mary ka...
2,This movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,movie poorly written directed fell asleep minu...
3,The most interesting thing about Miryang (Secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang secre...,the most interesting thing about miryang secre...,interesting thing miryang secret sunshine acto...
4,"when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...",when i first read about berlin am meer i didnt...,when i first read about berlin am meer i didnt...,first read berlin meer didnt expect much thoug...


In [11]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

ds["lemmatized"] = ds["no_stopwords"].apply(lambda text: lemmatize_words(text))
ds.head()

Unnamed: 0,text,lower,no_tags,no_punct,no_num,no_stopwords,lemmatized
0,I always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,i always wrote this series off as being a comp...,always wrote series complete stinkfest jim bel...,always write series complete stinkfest jim bel...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 12/7/2002 - 3 out of 10(dir-steve ...,1st watched 1272002 3 out of 10dirsteve purce...,1st watched out of 10dirsteve purcell typical ...,1st watched 10dirsteve purcell typical mary ka...,1st watch 10dirsteve purcell typical mary kate...
2,This movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,this movie was so poorly written and directed ...,movie poorly written directed fell asleep minu...,movie poorly write direct fell asleep minute m...
3,The most interesting thing about Miryang (Secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang (secr...,the most interesting thing about miryang secre...,the most interesting thing about miryang secre...,interesting thing miryang secret sunshine acto...,interesting thing miryang secret sunshine acto...
4,"when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...","when i first read about ""berlin am meer"" i did...",when i first read about berlin am meer i didnt...,when i first read about berlin am meer i didnt...,first read berlin meer didnt expect much thoug...,first read berlin meer didnt expect much think...


In [12]:
vectorizer = TfidfVectorizer(min_df=0.0015, ngram_range=(1,3))
fitted_vectorizer = vectorizer.fit(ds["lemmatized"])

In [13]:
len(fitted_vectorizer.vocabulary_)

10734

In [14]:
fitted_vectorizer.vocabulary_

{'always': 358,
 'write': 10630,
 'series': 8417,
 'complete': 1687,
 'jim': 4879,
 'belushi': 851,
 'involve': 4769,
 'heavily': 4275,
 'one': 6675,
 'day': 2033,
 'tragic': 9706,
 'occur': 6617,
 'white': 10395,
 'game': 3700,
 'end': 2661,
 'realize': 7630,
 'remote': 7814,
 'way': 10263,
 'side': 8576,
 'room': 8009,
 'somehow': 8753,
 'could': 1827,
 'get': 3744,
 'across': 94,
 'even': 2800,
 'tv': 9816,
 'turn': 9807,
 'channel': 1352,
 'walk': 10129,
 'country': 1875,
 'watch': 10216,
 'another': 434,
 'state': 8943,
 'nut': 6593,
 'say': 8131,
 'hang': 4184,
 'tight': 9564,
 'couch': 1826,
 'take': 9263,
 'whatever': 10378,
 'fate': 3102,
 'store': 9010,
 'episode': 2757,
 'show': 8528,
 'remember': 7800,
 'little': 5361,
 'except': 2928,
 'make': 5578,
 'broad': 1103,
 'general': 3722,
 'sweep': 9234,
 'judgment': 4921,
 'base': 763,
 'zero': 10729,
 'objective': 6599,
 'evidence': 2900,
 'nothing': 6554,
 'whatsoever': 10382,
 'back': 663,
 'opinion': 6809,
 'completely': 16

In [15]:
train_transform = fitted_vectorizer.transform(ds['text'])
y = df['label']
print(train_transform)
y

  (0, 10729)	0.03186366145776506
  (0, 10659)	0.01620587921784738
  (0, 10548)	0.013165859690184523
  (0, 10486)	0.023268031264045943
  (0, 10461)	0.019279358430766065
  (0, 10438)	0.027112593613534013
  (0, 10395)	0.025066944258413378
  (0, 10382)	0.032945055166674724
  (0, 10378)	0.029222478948662763
  (0, 10310)	0.012585794368521482
  (0, 10263)	0.014566634920180467
  (0, 10247)	0.04298228816021065
  (0, 10216)	0.012564682217361789
  (0, 10129)	0.02619760696224098
  (0, 10072)	0.022952505120751702
  (0, 9821)	0.04204012511161501
  (0, 9816)	0.06328236652790126
  (0, 9807)	0.018630132333408947
  (0, 9706)	0.03178162047181766
  (0, 9680)	0.029330658085709137
  (0, 9564)	0.037181544175310006
  (0, 9402)	0.02278192767714613
  (0, 9391)	0.850071209317462
  (0, 9263)	0.014645120257848681
  (0, 9107)	0.02377413777453477
  :	:
  (4999, 6027)	0.07881562647403094
  (4999, 5948)	0.10429878315154816
  (4999, 5857)	0.08004294842718586
  (4999, 5829)	0.08585461900027652
  (4999, 5478)	0.046059146

0       0
1       0
2       0
3       1
4       0
5       1
6       0
7       1
8       0
9       0
10      0
11      1
12      0
13      0
14      0
15      0
16      0
17      0
18      1
19      1
20      1
21      0
22      1
23      1
24      0
25      1
26      1
27      0
28      0
29      0
       ..
4970    0
4971    0
4972    0
4973    1
4974    1
4975    1
4976    1
4977    1
4978    0
4979    0
4980    1
4981    0
4982    1
4983    1
4984    1
4985    0
4986    0
4987    1
4988    0
4989    1
4990    1
4991    1
4992    0
4993    1
4994    1
4995    1
4996    1
4997    0
4998    0
4999    0
Name: label, Length: 5000, dtype: int64

In [16]:
print(type(train_transform),
  train_transform.get_shape(),
  train_transform.ndim,
  train_transform.size)

<class 'scipy.sparse.csr.csr_matrix'> (5000, 10734) 2 403297


In [17]:
train_array = train_transform.toarray()
print(type(train_array),
  train_array.shape,
  train_array.ndim,
  train_array.size)

<class 'numpy.ndarray'> (5000, 10734) 2 53670000


In [18]:
print(type(y),
    y.shape,
    y.ndim,
  y.size)

<class 'pandas.core.series.Series'> (5000,) 1 5000


In [19]:
class Logistic_Regression:
    
    # declaring learning rate & number of iterations (Hyperparameters)
    def __init__(self, learning_rate, no_of_iterations):

        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations

    def sigmoid(self, z):
        return 1/ (1 + np.exp(-z))
         
    # fit function to train the model with dataset
    def fit(self, X, Y):

        # number of data points in the dataset (number of rows)  -->  m
        # number of input features in the dataset (number of columns)  --> n
        self.m, self.n = X.shape

        #initiating weight & bias value

        self.weights = np.zeros(self.n)
        self.bias = 0
        
        self.X = X
        self.Y = Y     
        
        # implementing Gradient Descent for Optimization

        for i in range(self.no_of_iterations):
            self.update_model_parameters()
            

    def update_model_parameters(self):

        # Y_hat formula (sigmoid function)

        Y_hat = self.sigmoid( self.X.dot(self.weights) + self.bias )  



        # derivaties

        dw = (1/self.m)*np.dot(self.X.T, (Y_hat - self.Y))
        db = (1/self.m)*np.sum(Y_hat - self.Y)

        # updating the weights & bias using gradient descent

        self.weights = self.weights - dw * self.learning_rate
        self.bias = self.bias - self.learning_rate * db
                
            
    # Sigmoid Equation & Decision Boundary

    def predict(self, X):

        Y_pred = 1 / (1 + np.exp( - (X.dot(self.weights) + self.bias ) ))     
        Y_pred = np.where( Y_pred > 0.5, 1, 0)
        return Y_pred

In [20]:
X_test = train_array
y_test = y

In [19]:
X_train, X_test, y_train, y_test = train_test_split(train_array, y,
                                                    test_size=0.3)

In [20]:
model = Logistic_Regression(learning_rate=0.9, no_of_iterations=10000)

In [21]:
model.fit(X_train, y_train)

In [22]:
predictions = model.predict(X_test)

In [24]:
print(accuracy_score(y_test, predictions))

0.852


In [29]:
confusion_matrix(y_test, predictions)

array([[190,  54],
       [ 51, 205]], dtype=int64)

In [30]:
df['label'].value_counts()

1    2505
0    2495
Name: label, dtype: int64

In [32]:
y_test.shape

(500,)

# testing manually

In [48]:
text=["The movie is good. It has interesting plot. Although some of the scenes were gruesome, nevertheless overall the movie is quite satisfying to watch. I would recommend to watch this movie if you're looking for some new plot to see."]

In [49]:
train_transform1 = fitted_vectorizer.transform(text)

In [50]:
text_array = train_transform1.toarray()

In [51]:
predictions = model.predict(text_array)
predictions

array([0])

In [52]:
if predictions[0] == 0:
    print("Negative")
else:
    print("Positive")

Negative


In [58]:
model1 = Logistic_Regression(learning_rate=0.5, no_of_iterations=10000)

In [59]:
model1.fit(X_train, y_train)

In [65]:
text1=["Hi, Everyone, Oh, Boy... This one is a lulu. It has really bad background music whenever they can squeeze it in. There are three bad guys who, I guess, are the stars of this. They beat people up and chop people up and crash trucks and bulldozers into people. Usual stuff.<br /><br />The woman who is sending them on their missions is unable to move her mouth when she speaks. It's sort of like watching a bad ventriloquist who is her own dummy. She walks like she is balancing an egg on her head.<br /><br />The wardrobe is 70s leisure style for the men and blah for the female lead who is supposed to be a good nurse. The bad novocain mouth woman wears red. A silk frock perhaps, or maybe just a poplin windbreaker that is too big.<br /><br />I actually liked the ending even though it did not make a lot of sense. It lets us in on what happened earlier in the film.<br /><br />The police officers are OK. Some bad, some good, all stupid except two. The two bright ones could have worked again in Hollywood.<br /><br />The movie starts interestingly enough and ends with a surprise. The middle sucks. The guy in the diner who gives a free hamburger to the star does a good job. He is like a 1940s character actor. Great voice.<br /><br />This one is a bit too long. The lady with marbles in her mouth could have had just a couple of lines and the rest could have been said by a parrot. It would have been easier to understand a bird.<br /><br />Her scene with a sword could have been handled by a trained woodpecker.<br /><br />Tom Willett"]

In [66]:
predictions1 = model1.predict(text_array)
predictions1

array([1])

In [55]:
if predictions1[0] == 0:
    print("Negative")
else:
    print("Positive")

Positive


In [67]:
train_transform2 = fitted_vectorizer.transform(text1)
text_array2 = train_transform2.toarray()
text_array2

array([[0., 0., 0., ..., 0., 0., 0.]])

In [68]:
predictions2 = model1.predict(text_array2)
predictions2

array([0])

In [69]:
if predictions2[0] == 0:
    print("Negative")
else:
    print("Positive")

Negative
