# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

import nltk
import string
pd.options.mode.chained_assignment = None

from bs4 import BeautifulSoup
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.externals import joblib

# Load Train dataset

In [2]:
train_df = pd. read_csv('train.csv', header=None, delimiter=',', skiprows=1, names=['text','label'])
train_df.shape

(40000, 2)

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
text     40000 non-null object
label    40000 non-null int64
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [4]:
df1 = train_df[["text"]]
df1["text"] = df1["text"].astype(str)
train_df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


# Raw review text train data

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 1 columns):
text    40000 non-null object
dtypes: object(1)
memory usage: 312.6+ KB


In [6]:
df1["text"]

0        I grew up (b. 1965) watching and loving the Th...
1        When I put this movie in my DVD player, and sa...
2        Why do people who do not know what a particula...
3        Even though I have great interest in Biblical ...
4        Im a die hard Dads Army fan and nothing will e...
5        A terrible movie as everyone has said. What ma...
6        Finally watched this shocking movie last night...
7        I caught this film on AZN on cable. It sounded...
8        It may be the remake of 1987 Autumn's Tale aft...
9        My Super Ex Girlfriend turned out to be a plea...
10       I can't believe people are looking for a plot ...
11       If you haven't seen the gong show TV series th...
12       I have always been a huge fan of "Homicide: Li...
13       Greg Davis and Bryan Daly take some crazed sta...
14       A half-hearted attempt to bring Elvis Presley ...
15       If you want a fun romp with loads of subtle hu...
16       I really wanted to be able to give this film a.

# Preprocessing text review from train dataset

### 1. Lowercasing text

In [7]:
df1["text"] = df1["text"].str.lower()
print(df1["text"])

0        i grew up (b. 1965) watching and loving the th...
1        when i put this movie in my dvd player, and sa...
2        why do people who do not know what a particula...
3        even though i have great interest in biblical ...
4        im a die hard dads army fan and nothing will e...
5        a terrible movie as everyone has said. what ma...
6        finally watched this shocking movie last night...
7        i caught this film on azn on cable. it sounded...
8        it may be the remake of 1987 autumn's tale aft...
9        my super ex girlfriend turned out to be a plea...
10       i can't believe people are looking for a plot ...
11       if you haven't seen the gong show tv series th...
12       i have always been a huge fan of "homicide: li...
13       greg davis and bryan daly take some crazed sta...
14       a half-hearted attempt to bring elvis presley ...
15       if you want a fun romp with loads of subtle hu...
16       i really wanted to be able to give this film a.

### 2. Remove HTML tags

In [8]:
def remove_html_tags(text):
    return BeautifulSoup(text, "lxml").text

df1["text"] = df1["text"].apply(lambda text: remove_html_tags(text))
print(df1["text"])

0        i grew up (b. 1965) watching and loving the th...
1        when i put this movie in my dvd player, and sa...
2        why do people who do not know what a particula...
3        even though i have great interest in biblical ...
4        im a die hard dads army fan and nothing will e...
5        a terrible movie as everyone has said. what ma...
6        finally watched this shocking movie last night...
7        i caught this film on azn on cable. it sounded...
8        it may be the remake of 1987 autumn's tale aft...
9        my super ex girlfriend turned out to be a plea...
10       i can't believe people are looking for a plot ...
11       if you haven't seen the gong show tv series th...
12       i have always been a huge fan of "homicide: li...
13       greg davis and bryan daly take some crazed sta...
14       a half-hearted attempt to bring elvis presley ...
15       if you want a fun romp with loads of subtle hu...
16       i really wanted to be able to give this film a.

### 3. Remove stopwords

In [9]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [10]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df1["text"] = df1["text"].apply(lambda text: remove_stopwords(text))
df1["text"]

0        grew (b. 1965) watching loving thunderbirds. m...
1        put movie dvd player, sat coke chips, expectat...
2        people know particular time past like feel nee...
3        even though great interest biblical movies, bo...
4        im die hard dads army fan nothing ever change ...
5        terrible movie everyone said. made laugh cameo...
6        finally watched shocking movie last night, dis...
7        caught film azn cable. sounded like would good...
8        may remake 1987 autumn's tale eleven years, di...
9        super ex girlfriend turned pleasant surprise m...
10       can't believe people looking plot film. laural...
11       seen gong show tv series like movie much all, ...
12       always huge fan "homicide: life street" heard ...
13       greg davis bryan daly take crazed statements t...
14       half-hearted attempt bring elvis presley moder...
15       want fun romp loads subtle humor, enjoy flick....
16       really wanted able give film 10. i've long tho.

### 4. Remove punctuations

In [11]:
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    return text.translate(str.maketrans(' ', ' ', PUNCT_TO_REMOVE))

df1["text"] = df1["text"].apply(lambda text: remove_punctuation(text))
df1["text"]

0        grew b 1965 watching loving thunderbirds mates...
1        put movie dvd player sat coke chips expectatio...
2        people know particular time past like feel nee...
3        even though great interest biblical movies bor...
4        im die hard dads army fan nothing ever change ...
5        terrible movie everyone said made laugh cameo ...
6        finally watched shocking movie last night dist...
7        caught film azn cable sounded like would good ...
8        may remake 1987 autumns tale eleven years dire...
9        super ex girlfriend turned pleasant surprise m...
10       cant believe people looking plot film laural h...
11       seen gong show tv series like movie much all k...
12       always huge fan homicide life street heard reu...
13       greg davis bryan daly take crazed statements t...
14       halfhearted attempt bring elvis presley modern...
15       want fun romp loads subtle humor enjoy flicki ...
16       really wanted able give film 10 ive long thoug.

### 4. Remove numbers

In [12]:
def remove_numbers(text):
    
    return " ".join([line for line in str(text).split() if not line.isdigit()])

df1["text"] = df1["text"].apply(lambda text: remove_numbers(text))
df1["text"]

0        grew b watching loving thunderbirds mates scho...
1        put movie dvd player sat coke chips expectatio...
2        people know particular time past like feel nee...
3        even though great interest biblical movies bor...
4        im die hard dads army fan nothing ever change ...
5        terrible movie everyone said made laugh cameo ...
6        finally watched shocking movie last night dist...
7        caught film azn cable sounded like would good ...
8        may remake autumns tale eleven years director ...
9        super ex girlfriend turned pleasant surprise m...
10       cant believe people looking plot film laural h...
11       seen gong show tv series like movie much all k...
12       always huge fan homicide life street heard reu...
13       greg davis bryan daly take crazed statements t...
14       halfhearted attempt bring elvis presley modern...
15       want fun romp loads subtle humor enjoy flicki ...
16       really wanted able give film ive long thought .

### 5. Lemmatization

In [13]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df1["text"] = df1["text"].apply(lambda text: lemmatize_words(text))
df1["text"]

0        grow b watch loving thunderbird mat school wat...
1        put movie dvd player sit coke chip expectation...
2        people know particular time past like feel nee...
3        even though great interest biblical movie bore...
4        im die hard dad army fan nothing ever change t...
5        terrible movie everyone say make laugh cameo a...
6        finally watch shock movie last night disturb m...
7        caught film azn cable sound like would good fi...
8        may remake autumns tale eleven year director m...
9        super ex girlfriend turn pleasant surprise me ...
10       cant believe people look plot film laural hard...
11       see gong show tv series like movie much all kn...
12       always huge fan homicide life street heard reu...
13       greg davis bryan daly take crazed statement te...
14       halfhearted attempt bring elvis presley modern...
15       want fun romp load subtle humor enjoy flicki u...
16       really want able give film ive long think favo.

# Preprocessed text review train data

In [14]:
df1.head()

Unnamed: 0,text
0,grow b watch loving thunderbird mat school wat...
1,put movie dvd player sit coke chip expectation...
2,people know particular time past like feel nee...
3,even though great interest biblical movie bore...
4,im die hard dad army fan nothing ever change t...


In [15]:
df1['text']

0        grow b watch loving thunderbird mat school wat...
1        put movie dvd player sit coke chip expectation...
2        people know particular time past like feel nee...
3        even though great interest biblical movie bore...
4        im die hard dad army fan nothing ever change t...
5        terrible movie everyone say make laugh cameo a...
6        finally watch shock movie last night disturb m...
7        caught film azn cable sound like would good fi...
8        may remake autumns tale eleven year director m...
9        super ex girlfriend turn pleasant surprise me ...
10       cant believe people look plot film laural hard...
11       see gong show tv series like movie much all kn...
12       always huge fan homicide life street heard reu...
13       greg davis bryan daly take crazed statement te...
14       halfhearted attempt bring elvis presley modern...
15       want fun romp load subtle humor enjoy flicki u...
16       really want able give film ive long think favo.

# Feature Extraction

### Tfidf = Term freqeuncy - Inverse document freqeuncy

### min-df removes terms that are less frequent in 0.15% of whole dataset

### ngram_range takes 1 to 3 word terms as features

In [16]:
vectorizer1 = TfidfVectorizer(ngram_range=(1,3), min_df=0.0015)

In [18]:
fitted_vectorizer1 = vectorizer1.fit(df1["text"])

### Save extracted features for mapping

In [19]:
joblib.dump(fitted_vectorizer1, 'tfidf.joblib') 

['tfidf.joblib']

In [20]:
tf = joblib.load('tfidf.joblib')

In [21]:
print(len(tf.vocabulary_), tf.vocabulary_)



In [22]:
train_transform = tf.transform(df1['text'])
y1 = train_df['label']
print(train_transform)
y1

  (0, 10136)	0.14492936911205515
  (0, 10116)	0.050681302832840294
  (0, 10008)	0.09090907384025766
  (0, 9791)	0.09727953853658737
  (0, 9782)	0.1723366696007372
  (0, 9781)	0.0860486949807661
  (0, 9738)	0.12405859640011765
  (0, 9639)	0.09092944602752799
  (0, 9601)	0.13216692585732512
  (0, 9404)	0.12875022046074963
  (0, 9007)	0.10005711102126379
  (0, 8979)	0.1405541320523677
  (0, 8932)	0.10829820140620439
  (0, 8862)	0.0564046874204292
  (0, 8568)	0.06545813888284828
  (0, 8324)	0.17468501364470862
  (0, 8056)	0.0854916646741642
  (0, 7958)	0.09571357427633882
  (0, 7921)	0.040716571124056904
  (0, 7872)	0.12616965164675467
  (0, 7870)	0.09897276181919287
  (0, 7855)	0.28324460150297526
  (0, 7766)	0.13886830219332008
  (0, 7687)	0.133734911896795
  (0, 7500)	0.1341754630052528
  :	:
  (39999, 3877)	0.11611977202466821
  (39999, 3773)	0.04726456861693279
  (39999, 3381)	0.13497436327782442
  (39999, 3105)	0.13808091528038888
  (39999, 3022)	0.038378941897958305
  (39999, 2815)	

0        0
1        0
2        0
3        0
4        1
5        0
6        1
7        0
8        1
9        1
10       1
11       0
12       1
13       0
14       0
15       1
16       1
17       0
18       0
19       0
20       1
21       0
22       1
23       0
24       1
25       0
26       1
27       1
28       0
29       1
        ..
39970    1
39971    1
39972    0
39973    0
39974    0
39975    0
39976    1
39977    0
39978    0
39979    0
39980    0
39981    1
39982    1
39983    1
39984    0
39985    0
39986    1
39987    1
39988    1
39989    1
39990    1
39991    1
39992    1
39993    0
39994    0
39995    1
39996    1
39997    0
39998    1
39999    1
Name: label, Length: 40000, dtype: int64

In [23]:
print(type(train_transform),
  train_transform.get_shape(),
  train_transform.ndim,
  train_transform.size)

<class 'scipy.sparse.csr.csr_matrix'> (40000, 10267) 2 3868291


In [24]:
train_array = train_transform.toarray()
train_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
print(type(train_array),
  train_array.shape,
  train_array.ndim,
  train_array.size)

<class 'numpy.ndarray'> (40000, 10267) 2 410680000


In [26]:
print(type(y1),
    y1.shape,
    y1.ndim,
  y1.size)

<class 'pandas.core.series.Series'> (40000,) 1 40000


# Logistic Regression with Scikit Library 

In [27]:
X_train1 = train_transform
y_train1 = y1

In [29]:
scikit_log_reg_1 = LogisticRegression(solver='lbfgs', max_iter=1000)
model_1 = scikit_log_reg_1 .fit(X_train1, y_train1)

In [30]:
model_1

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
X_train2 = train_array
y_train2 = y1

In [32]:
scikit_log_reg_2 = LogisticRegression(solver='lbfgs', max_iter=1000)
model_2 = scikit_log_reg_2 .fit(X_train2, y_train2)

In [32]:
model_2

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

# Logistic Regression without Library 

In [2]:
class Logistic_Regression:
    
    # Defining hyper parameters
    def __init__(self, learning_rate, no_of_iterations):
        
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations
    
    # Sigmoid activation function
    def sigmoid(self, z):
        
          return 1/ (1 + np.exp(-z))
      
    # Fit function for model
    def fit(self, X, Y):
        
        # m -> number of rows in dataset
        # n -> number of columns in dataset
        self.m, self.n = X.shape
        
        # initializing weights and bias model parameters
        self.weights = np.zeros(self.n)
        self.bias = 0
        
        self.X = X
        self.Y = Y

        for i in range(self.no_of_iterations):
              self.update_model_parameters()
    
    # Gradient descent for optimizing model parameters
    def update_model_parameters(self):

        Y_hat = self.sigmoid( self.X.dot(self.weights) + self.bias )  
        
        # using gradient descent optimization algorithm
        dw = (1/self.m)*np.dot(self.X.T, (Y_hat - self.Y))
        db = (1/self.m)*np.sum(Y_hat - self.Y)
        
        # updating weights and bias using learning rate and gradient descent
        self.weights = self.weights - dw * self.learning_rate
        self.bias = self.bias - self.learning_rate * db           

    # Predict function for model
    def predict(self, X):

        Y_pred = 1 / (1 + np.exp( - (X.dot(self.weights) + self.bias ) ))  
        
        # if >0.5 --> positive else negative
        Y_pred = np.where( Y_pred > 0.5, 1, 0)
        return Y_pred

In [34]:
X_train3 = train_array
y_train3 = y1

In [41]:
log_reg_1 = Logistic_Regression(learning_rate=0.9, no_of_iterations=10000)
model_3 = log_reg_1 .fit(X_train3, y_train3)

### Save model using Joblib 

In [48]:
joblib.dump(log_reg_1, 'final-log-model-09.joblib')

['final-log-model-09.joblib']

In [49]:
saved_model = joblib.load('final-log-model-09.joblib')

# Train Accuracy

In [50]:
train_predictions_1 = model_1 .predict(X_train1)
print(accuracy_score(y_train1, train_predictions_1))

0.920625


In [51]:
train_predictions_2 = model_1 .predict(X_train2)
print(accuracy_score(y_train1, train_predictions_2))

0.920625


In [52]:
train_predictions_3 = saved_model .predict(X_train3)
print(accuracy_score(y_train3, train_predictions_3))

0.8931


### Confusion Matrix [ [TP, FN] [FP, TN]]

In [53]:
train_df['label'].value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [54]:
confusion_matrix(y_train1, train_predictions_1)

array([[18248,  1771],
       [ 1404, 18577]], dtype=int64)

In [55]:
confusion_matrix(y_train2, train_predictions_2)

array([[18248,  1771],
       [ 1404, 18577]], dtype=int64)

In [56]:
confusion_matrix(y_train3, train_predictions_3)

array([[17650,  2369],
       [ 1907, 18074]], dtype=int64)

### F1 = 2 * (precision * recall) / (precision + recall)

In [57]:
f1_score(y_train1, train_predictions_1)

0.9212725334126807

In [58]:
f1_score(y_train2, train_predictions_2)

0.9212725334126807

In [59]:
f1_score(y_train3, train_predictions_3)

0.8942212547001781

### Classification report

In [60]:
matrix_1 = classification_report(y_train1, train_predictions_1, labels=[1,0])
print('Classification report for train data 1 : \n', matrix_1)

Classification report for train data 1 : 
               precision    recall  f1-score   support

           1       0.91      0.93      0.92     19981
           0       0.93      0.91      0.92     20019

   micro avg       0.92      0.92      0.92     40000
   macro avg       0.92      0.92      0.92     40000
weighted avg       0.92      0.92      0.92     40000



In [61]:
matrix_2 = classification_report(y_train2, train_predictions_2,labels=[1,0])
print('Classification report for train data 2  : \n',matrix_2)

Classification report for train data 2  : 
               precision    recall  f1-score   support

           1       0.91      0.93      0.92     19981
           0       0.93      0.91      0.92     20019

   micro avg       0.92      0.92      0.92     40000
   macro avg       0.92      0.92      0.92     40000
weighted avg       0.92      0.92      0.92     40000



In [62]:
matrix_3 = classification_report(y_train3, train_predictions_3, labels=[1,0])
print('Classification report for train data 3 w/o library : \n', matrix_3)

Classification report for train data 3 w/o library : 
               precision    recall  f1-score   support

           1       0.88      0.90      0.89     19981
           0       0.90      0.88      0.89     20019

   micro avg       0.89      0.89      0.89     40000
   macro avg       0.89      0.89      0.89     40000
weighted avg       0.89      0.89      0.89     40000



# Load test dataset

In [63]:
test_df = pd. read_csv('test.csv', header=None, delimiter=',', skiprows=1, names=['text','label'])
test_df.shape

(5000, 2)

In [64]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
text     5000 non-null object
label    5000 non-null int64
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [65]:
df2 = test_df[["text"]]
df2["text"] = df2["text"].astype(str)
test_df.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


# Raw review text test data

In [66]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 1 columns):
text    5000 non-null object
dtypes: object(1)
memory usage: 39.1+ KB


In [67]:
df2["text"]

0       I always wrote this series off as being a comp...
1       1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...
2       This movie was so poorly written and directed ...
3       The most interesting thing about Miryang (Secr...
4       when i first read about "berlin am meer" i did...
5       I saw this film on September 1st, 2005 in Indi...
6       I saw a screening of this movie last night. I ...
7       William Hurt may not be an American matinee id...
8       IT IS A PIECE OF CRAP! not funny at all. durin...
9       I'M BOUT IT(1997)<br /><br />Developed & publi...
10      I had a recent spectator experience with The P...
11      I really enjoyed the detail that went into the...
12      Didn't the writer for this movie see the other...
13      This movie was really bad. First they didn't e...
14      I think I watched a highly edited version beca...
15      Uwe Boll has done the impossible: create a gam...
16      I felt asleep, watching it!!! (and I had ticke...
17      Brass 

# Preprocessing text review from test dataset

### 1. Lowercasing

In [68]:
df2["text"] = df2["text"].str.lower()
df2["text"]

0       i always wrote this series off as being a comp...
1       1st watched 12/7/2002 - 3 out of 10(dir-steve ...
2       this movie was so poorly written and directed ...
3       the most interesting thing about miryang (secr...
4       when i first read about "berlin am meer" i did...
5       i saw this film on september 1st, 2005 in indi...
6       i saw a screening of this movie last night. i ...
7       william hurt may not be an american matinee id...
8       it is a piece of crap! not funny at all. durin...
9       i'm bout it(1997)<br /><br />developed & publi...
10      i had a recent spectator experience with the p...
11      i really enjoyed the detail that went into the...
12      didn't the writer for this movie see the other...
13      this movie was really bad. first they didn't e...
14      i think i watched a highly edited version beca...
15      uwe boll has done the impossible: create a gam...
16      i felt asleep, watching it!!! (and i had ticke...
17      brass 

### 2. Remove html tags

In [69]:
def remove_html_tags(text):
    return BeautifulSoup(text, "lxml").text

df2["text"] = df2["text"].apply(lambda text: remove_html_tags(text))
print(df2["text"])

0       i always wrote this series off as being a comp...
1       1st watched 12/7/2002 - 3 out of 10(dir-steve ...
2       this movie was so poorly written and directed ...
3       the most interesting thing about miryang (secr...
4       when i first read about "berlin am meer" i did...
5       i saw this film on september 1st, 2005 in indi...
6       i saw a screening of this movie last night. i ...
7       william hurt may not be an american matinee id...
8       it is a piece of crap! not funny at all. durin...
9       i'm bout it(1997)developed & published by no l...
10      i had a recent spectator experience with the p...
11      i really enjoyed the detail that went into the...
12      didn't the writer for this movie see the other...
13      this movie was really bad. first they didn't e...
14      i think i watched a highly edited version beca...
15      uwe boll has done the impossible: create a gam...
16      i felt asleep, watching it!!! (and i had ticke...
17      brass 

### 3. Remove stopwords

In [70]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df2["text"] = df2["text"].apply(lambda text: remove_stopwords(text))
df2["text"]

0       always wrote series complete stink-fest jim be...
1       1st watched 12/7/2002 - 3 10(dir-steve purcell...
2       movie poorly written directed fell asleep 30 m...
3       interesting thing miryang (secret sunshine) ac...
4       first read "berlin meer" expect much. thought ...
5       saw film september 1st, 2005 indianapolis. one...
6       saw screening movie last night. high expectati...
7       william hurt may american matinee idol anymore...
8       piece crap! funny all. whole movie nothing eve...
9       i'm bout it(1997)developed & published limit f...
10      recent spectator experience perfect witness (2...
11      really enjoyed detail went script.jonathan rhy...
12      writer movie see three? loved original, though...
13      movie really bad. first even follow facts it. ...
14      think watched highly edited version nearly gra...
15      uwe boll done impossible: create game adaptati...
16      felt asleep, watching it!!! (and tickets midni...
17      brass 

### 4. Remove punctuations

In [71]:
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    return text.translate(str.maketrans(' ', ' ', PUNCT_TO_REMOVE))

df2["text"] = df2["text"].apply(lambda text: remove_punctuation(text))
df2["text"]

0       always wrote series complete stinkfest jim bel...
1       1st watched 1272002  3 10dirsteve purcell typi...
2       movie poorly written directed fell asleep 30 m...
3       interesting thing miryang secret sunshine acto...
4       first read berlin meer expect much thought rig...
5       saw film september 1st 2005 indianapolis one j...
6       saw screening movie last night high expectatio...
7       william hurt may american matinee idol anymore...
8       piece crap funny all whole movie nothing ever ...
9       im bout it1997developed  published limit films...
10      recent spectator experience perfect witness 20...
11      really enjoyed detail went scriptjonathan rhys...
12      writer movie see three loved original thought ...
13      movie really bad first even follow facts it ha...
14      think watched highly edited version nearly gra...
15      uwe boll done impossible create game adaptatio...
16      felt asleep watching it and tickets midnight p...
17      brass 

### 5. Remove numbers

In [72]:
def remove_numbers(text):
    
    return " ".join([line for line in str(text).split() if not line.isdigit()])

df2["text"] = df2["text"].apply(lambda text: remove_numbers(text))
df2["text"]

0       always wrote series complete stinkfest jim bel...
1       1st watched 10dirsteve purcell typical mary ka...
2       movie poorly written directed fell asleep minu...
3       interesting thing miryang secret sunshine acto...
4       first read berlin meer expect much thought rig...
5       saw film september 1st indianapolis one judges...
6       saw screening movie last night high expectatio...
7       william hurt may american matinee idol anymore...
8       piece crap funny all whole movie nothing ever ...
9       im bout it1997developed published limit filmsp...
10      recent spectator experience perfect witness ne...
11      really enjoyed detail went scriptjonathan rhys...
12      writer movie see three loved original thought ...
13      movie really bad first even follow facts it ha...
14      think watched highly edited version nearly gra...
15      uwe boll done impossible create game adaptatio...
16      felt asleep watching it and tickets midnight p...
17      brass 

### 6. Lemmatization

In [73]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df2["text"] = df2["text"].apply(lambda text: lemmatize_words(text))
df2["text"]

0       always write series complete stinkfest jim bel...
1       1st watch 10dirsteve purcell typical mary kate...
2       movie poorly write direct fell asleep minute m...
3       interesting thing miryang secret sunshine acto...
4       first read berlin meer expect much think right...
5       saw film september 1st indianapolis one judge ...
6       saw screen movie last night high expectation g...
7       william hurt may american matinee idol anymore...
8       piece crap funny all whole movie nothing ever ...
9       im bout it1997developed publish limit filmspro...
10      recent spectator experience perfect witness ne...
11      really enjoy detail go scriptjonathan rhys mye...
12      writer movie see three love original thought b...
13      movie really bad first even follow fact it hal...
14      think watch highly edit version nearly graphic...
15      uwe boll do impossible create game adaptation ...
16      felt asleep watch it and ticket midnight premi...
17      brass 

# Preprocessed text review test data

In [74]:
df2.head()

Unnamed: 0,text
0,always write series complete stinkfest jim bel...
1,1st watch 10dirsteve purcell typical mary kate...
2,movie poorly write direct fell asleep minute m...
3,interesting thing miryang secret sunshine acto...
4,first read berlin meer expect much think right...


In [75]:
df2["text"]

0       always write series complete stinkfest jim bel...
1       1st watch 10dirsteve purcell typical mary kate...
2       movie poorly write direct fell asleep minute m...
3       interesting thing miryang secret sunshine acto...
4       first read berlin meer expect much think right...
5       saw film september 1st indianapolis one judge ...
6       saw screen movie last night high expectation g...
7       william hurt may american matinee idol anymore...
8       piece crap funny all whole movie nothing ever ...
9       im bout it1997developed publish limit filmspro...
10      recent spectator experience perfect witness ne...
11      really enjoy detail go scriptjonathan rhys mye...
12      writer movie see three love original thought b...
13      movie really bad first even follow fact it hal...
14      think watch highly edit version nearly graphic...
15      uwe boll do impossible create game adaptation ...
16      felt asleep watch it and ticket midnight premi...
17      brass 

# Feature extraction

In [76]:
# max_df=0.8, min_df=0.2, ngram_range=(1,3)
vectorizer2 = TfidfVectorizer(min_df=0.0015, ngram_range=(1,3), vocabulary = tf.vocabulary_)
fitted_vectorizer2 = vectorizer2.fit(df2["text"])

In [77]:
test_transform = fitted_vectorizer2.transform(df2['text'])
y2 = test_df['label']
print(test_transform)
y2

  (0, 10262)	0.06619329709732875
  (0, 10215)	0.0336659545298106
  (0, 10190)	0.039410456532187874
  (0, 10116)	0.02735064402969764
  (0, 10060)	0.04833680863614588
  (0, 10054)	0.04061020079874862
  (0, 10044)	0.04005077389317389
  (0, 10040)	0.05350601232571301
  (0, 10022)	0.05578309276149685
  (0, 9979)	0.05207385506586152
  (0, 9960)	0.06843977511580504
  (0, 9955)	0.06070652719974463
  (0, 9886)	0.026211693977864083
  (0, 9835)	0.030260602481541048
  (0, 9822)	0.0892910368723011
  (0, 9791)	0.02608717730326806
  (0, 9723)	0.05455921158946931
  (0, 9667)	0.047681337331546
  (0, 9418)	0.13146224561027678
  (0, 9410)	0.038662767732076005
  (0, 9395)	0.0665411801408254
  (0, 9381)	0.03420690691918195
  (0, 9316)	0.06602286585658222
  (0, 9289)	0.06093125761146253
  (0, 9171)	0.07724062105656004
  :	:
  (4999, 4939)	0.15775846388334405
  (4999, 4882)	0.15935485765635637
  (4999, 4581)	0.056276026275451335
  (4999, 4459)	0.17930656539241163
  (4999, 4101)	0.09311092794279184
  (4999, 3

0       0
1       0
2       0
3       1
4       0
5       1
6       0
7       1
8       0
9       0
10      0
11      1
12      0
13      0
14      0
15      0
16      0
17      0
18      1
19      1
20      1
21      0
22      1
23      1
24      0
25      1
26      1
27      0
28      0
29      0
       ..
4970    0
4971    0
4972    0
4973    1
4974    1
4975    1
4976    1
4977    1
4978    0
4979    0
4980    1
4981    0
4982    1
4983    1
4984    1
4985    0
4986    0
4987    1
4988    0
4989    1
4990    1
4991    1
4992    0
4993    1
4994    1
4995    1
4996    1
4997    0
4998    0
4999    0
Name: label, Length: 5000, dtype: int64

In [78]:
print(type(test_transform),
  test_transform.get_shape(),
  test_transform.ndim,
  test_transform.size)

<class 'scipy.sparse.csr.csr_matrix'> (5000, 10267) 2 483209


In [79]:
test_array = test_transform.toarray()
test_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [80]:
print(type(test_array),
  test_array.shape,
  test_array.ndim,
  test_array.size)

<class 'numpy.ndarray'> (5000, 10267) 2 51335000


In [81]:
print(type(y2),
    y2.shape,
    y2.ndim,
  y2.size)

<class 'pandas.core.series.Series'> (5000,) 1 5000


# Logistic Rregression with Scikit Library

In [82]:
X_test1 = test_transform
y_test1 = y2

In [83]:
# scikit_log_reg_3 = LogisticRegression(solver='lbfgs')
# model_4 = scikit_log_reg_3 .fit(X_test1, y_test1)

In [84]:
# model_4

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [85]:
X_test2 = test_array
y_test2 = y2

In [86]:
# scikit_log_reg_4 = LogisticRegression(solver='lbfgs')
# model_5 = scikit_log_reg_4 .fit(X_test2, y_test2)

In [88]:
# model_5

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

# Logistic Rregression without Library

In [89]:
X_test3 = test_array
y_test3 = y2

In [90]:
# log_reg_2 = Logistic_Regression(learning_rate=0.9, no_of_iterations=10000)
# model_6 = log_reg_2 .fit(X_test3, y_test3)

# Test accuracy

In [91]:
test_predictions_1 = model_1.predict(X_test1)
print(accuracy_score(y_test1, test_predictions_1))

0.8902


In [92]:
test_predictions_2 = model_2.predict(X_test2)
print(accuracy_score(y_test2, test_predictions_2))

0.8902


In [93]:
test_predictions_3 = saved_model .predict(X_test3)
print(accuracy_score(y_test3, test_predictions_3))

0.8852


### Confusion Matrix [ [TP, FN] [FP, TN]]

In [94]:
test_df['label'].value_counts()

1    2505
0    2495
Name: label, dtype: int64

In [95]:
confusion_matrix(y_test1, test_predictions_1)

array([[2193,  302],
       [ 247, 2258]], dtype=int64)

In [96]:
confusion_matrix(y_test2, test_predictions_2)

array([[2193,  302],
       [ 247, 2258]], dtype=int64)

In [97]:
confusion_matrix(y_test3, test_predictions_3)

array([[2183,  312],
       [ 262, 2243]], dtype=int64)

### F1 = 2 * (precision * recall) / (precision + recall)

In [98]:
f1_score(y_train1, train_predictions_1)

0.9212725334126807

In [99]:
f1_score(y_train2, train_predictions_2)

0.9212725334126807

In [100]:
f1_score(y_train3, train_predictions_3)

0.8942212547001781

### Classification report

In [101]:
matrix_4 = classification_report(y_test1, test_predictions_1, labels=[1,0])
print('Classification report for train data 1 : \n', matrix_4)

Classification report for train data 1 : 
               precision    recall  f1-score   support

           1       0.88      0.90      0.89      2505
           0       0.90      0.88      0.89      2495

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



In [102]:
matrix_5 = classification_report(y_test2, test_predictions_2, labels=[1,0])
print('Classification report for train data 1 : \n', matrix_5)

Classification report for train data 1 : 
               precision    recall  f1-score   support

           1       0.88      0.90      0.89      2505
           0       0.90      0.88      0.89      2495

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



In [103]:
matrix_6 = classification_report(y_test3, test_predictions_3, labels=[1,0])
print('Classification report for test data 3 w/o library : \n',matrix_6)

Classification report for test data 3 w/o library : 
               precision    recall  f1-score   support

           1       0.88      0.90      0.89      2505
           0       0.89      0.87      0.88      2495

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



# Testing manually from user input

In [115]:
text = input("Review: ")
text = [text]

Review: This film starts out with all the moody promise of a great contemporary noir Western - after the ill-conceived opening flashback sequence anyway. The scenery is beautifully desolate, the characters achingly isolated. While some of the acting is less than believable, the plot ultimately delivers enough tension and twists to make this movie worth a look.


In [116]:
vec = TfidfVectorizer(min_df=0.0015, ngram_range=(1,3), vocabulary = tf.vocabulary_)
text_transform = vec .fit_transform(text)

In [117]:
text_array = text_transform .toarray()
text_array

array([[0., 0., 0., ..., 0., 0., 0.]])

In [118]:
sentiment_predict = saved_model .predict(text_array)

In [119]:
if sentiment_predict == 0:
    print("Negative sentiment")
else:
    print("Positive sentiment")

Positive sentiment
