## Machine learning models built on 3000 reviews pulled from 4000 reviews in such a way that all the negative reviews are present in the 3000 reviews along with the remaining to be positive.

## Machine learning models built are:
 ### 1. logistic regression using count vectorizer on raw data(69% f1-score on test data)
 ### 2. logistic regression using tfidf vectorizer on raw data (73% f1-score on test data)
 ### 3. logistic regression using count vectorier on preprocessed review
 ### 4. logistic regression using tfidf vectorizer on preprocessed review
 ### 5. Random forest on pre processed review
 ### 6. svm_grid search using count vectorizer on raw data
 ### 7. svm_grid search using tfidf vectorizer on raw data(78% f1 score)

## Initial Imports 

In [1]:
import requests
import time

## Creating header for review

In [2]:
headers = {'Referer':'https://www.rottentomatoes.com/m/the_lion_king_2019/reviews?type=user','User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36','X-Requested-With':'XMLHttpRequest',}

In [3]:
headers

{'Referer': 'https://www.rottentomatoes.com/m/the_lion_king_2019/reviews?type=user',
 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
 'X-Requested-With': 'XMLHttpRequest'}

In [4]:
url ='https://www.rottentomatoes.com/napi/movie/9057c2cf-7cab-317f-876f-e50b245ca76e/reviews/user' 

## Initial payload parameters  

In [5]:
payload = {'direction': 'next','endCursor': '','startCursor': ''}

## Creating session Object

In [6]:
s = requests.Session() 

## Code to fetch 1page review

In [7]:
r = s.get(url, headers=headers, params=payload)    # GET Call 
data = r.json()


In [8]:
data

{'pageInfo': {'hasNextPage': True,
  'hasPreviousPage': False,
  'endCursor': 'eyJyZWFsbV91c2VySWQiOiJSVF84ODM5MDExNDAiLCJlbXNJZCI6IjkwNTdjMmNmLTdjYWItMzE3Zi04NzZmLWU1MGIyNDVjYTc2ZSIsImVtc0lkX2hhc1Jldmlld0lzVmlzaWJsZSI6IjkwNTdjMmNmLTdjYWItMzE3Zi04NzZmLWU1MGIyNDVjYTc2ZV9UIiwiY3JlYXRlRGF0ZSI6IjIwMTktMDgtMjdUMTg6Mzg6NDQuOTIzWiJ9',
  'startCursor': None},
 'reviews': [{'rating': 'STAR_2_5',
   'review': "almost shot for shot. scar is a very weak character in this and trying to be too realistic you lose the magic of the original. it's safe. tilmon and pumba get the most laughs as they also have the most change.",
   'displayName': None,
   'displayImageUrl': None,
   'isVerified': False,
   'isSuperReviewer': False,
   'hasSpoilers': False,
   'hasProfanity': False,
   'createDate': '2019-08-27T21:39:31.395Z',
   'updateDate': '2019-08-27T21:39:31.395Z',
   'user': {'userId': '260316344',
    'realm': 'RT',
    'displayName': None,
    'accountLink': '/user/id/260316344'},
   'score': 2.5,


## Loop function for pulling out 3000 reviews from 300 pages

In [9]:
# using 3000 records
theLionKing_reviews=[]
endCursor= ''
startCursor= ''
for i in range(400):
    #payload initialization
    payload = {'direction':'next','endCursor': endCursor ,'startCursor': startCursor ,}
    #time interval of 8secs
    time.sleep(5) 
    #getting the review data in an order.
    r= s.get(url, params=payload, headers=headers)
    data = r.json()
    theLionKing_reviews = theLionKing_reviews + data['reviews']
    endCursor = data['pageInfo']['endCursor']
    startCursor = data['pageInfo']['startCursor']

theLionKing_reviews

[{'rating': 'STAR_2_5',
  'review': "almost shot for shot. scar is a very weak character in this and trying to be too realistic you lose the magic of the original. it's safe. tilmon and pumba get the most laughs as they also have the most change.",
  'displayName': None,
  'displayImageUrl': None,
  'isVerified': False,
  'isSuperReviewer': False,
  'hasSpoilers': False,
  'hasProfanity': False,
  'createDate': '2019-08-27T21:39:31.395Z',
  'updateDate': '2019-08-27T21:39:31.395Z',
  'user': {'userId': '260316344',
   'realm': 'RT',
   'displayName': None,
   'accountLink': '/user/id/260316344'},
  'score': 2.5,
  'timeFromCreation': '24m ago'},
 {'rating': 'STAR_5',
  'review': 'Loved the movie! Great CGI, acting, music, felt like I was watching the original all over again! I could go on and on but it’s a great family film! HIGHLY RECOMMEND',
  'displayName': 'Logan',
  'displayImageUrl': None,
  'isVerified': True,
  'isSuperReviewer': False,
  'hasSpoilers': False,
  'hasProfanity':

Type data in which reviews are extracted

In [10]:
type(theLionKing_reviews)

list

Saving the given data in "json" format as backup

In [11]:
import json
with open('theLionKing_reviews.json', 'w', encoding='utf-8') as f:
    json.dump(theLionKing_reviews, f, ensure_ascii=False, indent=4)

### Imports for other text preprocessing and classification

In [1]:
import urllib.request as url
from bs4 import BeautifulSoup as bs
import re
import requests
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random
random.seed(123)

checking the no of reviews in the data

In [13]:
len(theLionKing_reviews)

4000

### converting JSON data to pandas data

In [16]:
theLionKing_reviews

[{'rating': 'STAR_2_5',
  'review': "almost shot for shot. scar is a very weak character in this and trying to be too realistic you lose the magic of the original. it's safe. tilmon and pumba get the most laughs as they also have the most change.",
  'displayName': None,
  'displayImageUrl': None,
  'isVerified': False,
  'isSuperReviewer': False,
  'hasSpoilers': False,
  'hasProfanity': False,
  'createDate': '2019-08-27T21:39:31.395Z',
  'updateDate': '2019-08-27T21:39:31.395Z',
  'user': {'userId': '260316344',
   'realm': 'RT',
   'displayName': None,
   'accountLink': '/user/id/260316344'},
  'score': 2.5,
  'timeFromCreation': '24m ago'},
 {'rating': 'STAR_5',
  'review': 'Loved the movie! Great CGI, acting, music, felt like I was watching the original all over again! I could go on and on but it’s a great family film! HIGHLY RECOMMEND',
  'displayName': 'Logan',
  'displayImageUrl': None,
  'isVerified': True,
  'isSuperReviewer': False,
  'hasSpoilers': False,
  'hasProfanity':

In [17]:
from pandas.io.json import json_normalize
data =theLionKing_reviews

lionking_dt = pd.DataFrame.from_dict(json_normalize(data), orient='columns')

lionking_dt    

Unnamed: 0,rating,review,displayName,displayImageUrl,isVerified,isSuperReviewer,hasSpoilers,hasProfanity,createDate,updateDate,score,timeFromCreation,user.userId,user.realm,user.displayName,user.accountLink
0,STAR_2_5,almost shot for shot. scar is a very weak char...,,,False,False,False,False,2019-08-27T21:39:31.395Z,2019-08-27T21:39:31.395Z,2.5,24m ago,260316344,RT,,/user/id/260316344
1,STAR_5,"Loved the movie! Great CGI, acting, music, fel...",Logan,,True,False,False,False,2019-08-27T21:13:12.178Z,2019-08-27T21:13:12.178Z,5.0,50m ago,02750CCE-D2AB-4E3B-A998-6576C1A240E0,Fandango,Logan,
2,STAR_3,If you grew up with (and love) the original an...,Rachel C,https://graph.facebook.com/v3.3/10221590382855...,False,False,False,False,2019-08-27T20:58:51.447Z,2019-08-27T20:58:51.447Z,3.0,1h ago,978214951,RT,Rachel C,/user/id/978214951
3,STAR_2_5,While it can take pride in its visual achievem...,Peter J,,False,False,False,False,2019-08-27T20:27:50.707Z,2019-08-27T20:27:54.169Z,2.5,2h ago,977694797,RT,Peter J,/user/id/977694797
4,STAR_5,It was seriously a stunning movie. Great graph...,Devon,,True,False,False,False,2019-08-27T20:15:05.755Z,2019-08-27T20:15:05.755Z,5.0,2h ago,AC172773-33D5-466B-8E82-C5178AB51CAF,Fandango,Devon,
5,STAR_5,I loved this remake. The cartoon is one of my ...,Samantha T,https://graph.facebook.com/v3.3/10000178407612...,False,False,False,False,2019-08-27T20:10:24.720Z,2019-08-27T20:10:24.720Z,5.0,2h ago,923215533,RT,Samantha T,/user/id/923215533
6,STAR_2,the only thing i thought was a great scene was...,saiyanelite,,True,False,False,False,2019-08-27T19:41:11.714Z,2019-08-27T19:42:52.104Z,2.0,2h ago,d452e337-0bf3-4df6-9a73-a37d5c2cfb94,Fandango,saiyanelite,
7,STAR_5,"Good movie, we enjoyed it a lot",Lisa,,True,False,False,False,2019-08-27T19:39:19.611Z,2019-08-27T19:39:19.611Z,5.0,2h ago,7aec84e7-5aec-418c-ac92-31e3fbdbe81e,Fandango,Lisa,
8,STAR_3,"I liked it, my kids loved it. I prefer the ori...",Levy M,https://graph.facebook.com/v3.3/1112244759/pic...,False,False,False,False,2019-08-27T19:03:20.510Z,2019-08-27T19:03:20.510Z,3.0,3h ago,823571697,RT,Levy M,/user/id/823571697
9,STAR_2,😁I just saw the Lion King and I have to say......,Xavier P,https://graph.facebook.com/v3.3/1320956777/pic...,False,False,False,True,2019-08-27T18:38:44.923Z,2019-08-27T18:38:44.923Z,2.0,3h ago,883901140,RT,Xavier P,/user/id/883901140


In [18]:
lionking_dt.to_csv("4000reviews.csv",header=True,index=False)

In [2]:
lionking_dt=pd.read_csv("4000reviews.csv")

In [3]:
#extracting our target feature from rating
lionking_dt['sentiment'] = np.where(lionking_dt['score']>3, 0, 1)

In [134]:
lionking_dt.isna().sum()

review                0
sentiment             0
review_processed_1    0
dtype: int64

In [4]:
lionking_dt.sentiment.value_counts()

0    2923
1    1077
Name: sentiment, dtype: int64

In [5]:
#collecting and increasing the no of negative records in the 3000 reviews by-
#sampling postive and negative records seperatly from 4000 records
positive_dt = lionking_dt[lionking_dt.sentiment == 0].sample(n=1923)
negative_dt = lionking_dt[lionking_dt.sentiment == 1].sample(n=1077)

In [6]:
lionking_dt= pd.concat([positive_dt,negative_dt],axis=0,ignore_index=True)

In [7]:
lionking_dt.sentiment.value_counts(normalize=True)

0    0.641
1    0.359
Name: sentiment, dtype: float64

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [9]:
final_test_dt = pd.read_csv("test.csv")

In [10]:
type(final_test_dt)

pandas.core.frame.DataFrame

In [11]:
#type of data
type(lionking_dt["review"])

pandas.core.series.Series

In [12]:
#seeing the first two reviews
lionking_dt["review"][:2]

0    It was just as good as the original. The anima...
1    They stayed true to the cartoon except for the...
Name: review, dtype: object

In [13]:
CONTRACTION_MAP = {"ain't": "is not","aren't": "are not","can't": "cannot","can't've": "cannot have","'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have","hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will","he'll've": "he he will have","he's": "he is","how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is","I'd": "I would","I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","i'd": "i would","i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have","isn't": "is not","it'd": "it would","it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have","needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not","oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is","should've": "should have","shouldn't": "should not","shouldn't've": "should not have","so've": "so have","so's": "so as","that'd": "that would","that'd've": "that would have","that's": "that is","there'd": "there would","there'd've": "there would have","there's": "there is","they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would","we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have","weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have","when's": "when is","when've": "when have","where'd": "where did","where's": "where is","where've": "where have","who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have","will've": "will have","won't": "will not","won't've": "will not have","would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would","you'd've": "you would have","you'll": "you will","you'll've": "you will have","you're": "you are","you've": "you have"}

In [14]:
### Contractions
import re


def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
   
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
            if contraction_mapping.get(match)\
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)

    return expanded_text


In [15]:
## for extracting polarity, review length, word count features from the data after pre processing
from textblob import TextBlob

def preprocess(Text):
    Text = Text.str.replace("(<br/>)", "")
    Text = Text.str.replace('(<a).*(>).*(</a>)', '')
    Text = Text.str.replace('(&amp)', '')
    Text = Text.str.replace('(&gt)', '')
    Text = Text.str.replace('(&lt)', '')
    Text = Text.str.replace('(\xa0)', ' ')  
    return Text
lionking_dt['review_processed_1'] = preprocess(lionking_dt['review'])

lionking_dt['polarity'] = lionking_dt['review_processed_1'].map(lambda text: TextBlob(text).sentiment.polarity)
lionking_dt['review_len'] = lionking_dt['review_processed_1'].astype(str).apply(len)
lionking_dt['word_count'] = lionking_dt['review_processed_1'].apply(lambda x: len(str(x).split()))

In [16]:
# pre processing of the review data an d assigning it to new feature column review_processed_2
lionking_dt["review_processed_1"] = [expand_contractions(re.sub('’',"'",i)) for i in lionking_dt['review']]
# pre processing of the review data an d assigning it to new feature column review_processed_2
final_test_dt["review_processed_1"] = [expand_contractions(re.sub('’',"'",i)) for i in final_test_dt.Review]

In [199]:
lionking_dt.review_processed_1[2323]

'The voice acting was not great, the songs were awkward and forced, and this movie left me feeling disappointed. '

In [17]:
#removing special characters from the review data and assigning it to a new column feature review_processed_1
lionking_dt['review_processed_1']=[re.sub(r"[^a-zA-Z#\s]",' ',i)for i in lionking_dt["review_processed_1"]]
final_test_dt['review_processed_1']=[re.sub(r"[^a-zA-Z#\s]",' ',i)for i in final_test_dt["review_processed_1"]]

In [18]:
lionking_dt.review_processed_1[2323]

'Without having seen the original  I would probably think even less of this movie  I got through bad parts scenes that did not make much sense by remembering the old version  However  it looks incredible  and the  D is fantastic '

In [19]:
lionking_dt['review_processed_1'][1121]

'Much better than I expected   the start of the movie was amazing '

In [20]:
#converting all the upper case letters to lower case after stripping
lionking_dt['review_processed_1']=[each_review.strip().lower() for each_review in lionking_dt['review_processed_1']]
final_test_dt['review_processed_1']=[each_review.strip().lower() for each_review in final_test_dt['review_processed_1']]


In [21]:
lionking_dt.review_processed_1[2323]

'without having seen the original  i would probably think even less of this movie  i got through bad parts scenes that did not make much sense by remembering the old version  however  it looks incredible  and the  d is fantastic'

In [22]:
#removing stopwords in the review text
stop = stopwords.words('english')
lionking_dt['review_processed_1']=lionking_dt['review_processed_1'].apply(lambda x: ' '.join([i for i in x.split() if (len(i)>3)&(i.lower() not in stop)]))
final_test_dt['review_processed_1']=final_test_dt['review_processed_1'].apply(lambda x: ' '.join([i for i in x.split() if (len(i)>3)&(i.lower() not in stop)]))

In [23]:
lionking_dt.review_processed_1[2323]

'without seen original would probably think even less movie parts scenes make much sense remembering version however looks incredible fantastic'

In [24]:
lionking_dt['review_processed_1'][1121]

'much better expected start movie amazing'

In [25]:
#splitting the words into root form
lionking_dt['review_processed_1']= lionking_dt['review_processed_1'].apply(lambda x: x.split())
final_test_dt['review_processed_1']= final_test_dt['review_processed_1'].apply(lambda x: x.split())

In [26]:
lionking_dt['review_processed_1'][2323]

['without',
 'seen',
 'original',
 'would',
 'probably',
 'think',
 'even',
 'less',
 'movie',
 'parts',
 'scenes',
 'make',
 'much',
 'sense',
 'remembering',
 'version',
 'however',
 'looks',
 'incredible',
 'fantastic']

In [27]:
#converting words into rootform by lemmatizing
import nltk
lmtzr = nltk.stem.WordNetLemmatizer()
lionking_dt['review_processed_1']=lionking_dt['review_processed_1'].apply(lambda x: [lmtzr.lemmatize(i) for i in x])
final_test_dt['review_processed_1']=final_test_dt['review_processed_1'].apply(lambda x: [lmtzr.lemmatize(i) for i in x])

In [28]:
lionking_dt['review_processed_1'][2323]

['without',
 'seen',
 'original',
 'would',
 'probably',
 'think',
 'even',
 'le',
 'movie',
 'part',
 'scene',
 'make',
 'much',
 'sense',
 'remembering',
 'version',
 'however',
 'look',
 'incredible',
 'fantastic']

In [29]:
lionking_dt['review_processed_1'] = lionking_dt['review_processed_1'].apply(lambda x:' '.join(x))
final_test_dt['review_processed_1'] = final_test_dt['review_processed_1'].apply(lambda x:' '.join(x))

In [135]:
lionking_dt.to_pickle("4000reviews_preprocessed.csv")
final_test_dt.to_pickle("final_preprocessed.csv")



In [30]:
from sklearn.model_selection import train_test_split

X = lionking_dt.review
y = lionking_dt.sentiment

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X,y,test_size=0.2,random_state=123)

In [31]:
X = lionking_dt.review_processed_1
y = lionking_dt.sentiment

X_train_r1, X_test_r1, y_train_r1, y_test_r1 = train_test_split(X,y,test_size=0.2,random_state=123)

In [32]:
## Import Libraries 
import numpy as np      # for array operations
import pandas as pd     # for reading data operations

from keras.preprocessing.text import Tokenizer          # for tokenizing text
from keras.preprocessing.sequence import pad_sequences  # for padding sentences with zeros. To make the sentence length same
from keras.utils import to_categorical                  # for one-hot encoding of the labels
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import SimpleRNN, LSTM, GRU, Input, Concatenate
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Embedding, GlobalAvgPool1D
from keras.models import Sequential, Model
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.collocations import BigramCollocationFinder 
from wordcloud import WordCloud

import spacy
nlp = spacy.load('en_core_web_sm')

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_colwidth',-1)

import urllib.request as url
from bs4 import BeautifulSoup as bs
import re
import requests
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random
random.seed(123)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import os
import random
import string
import datetime as dt

# import warnings
# warnings.filterwarnings('ignore','RuntimeWarning')

import nltk
import re
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from wordcloud import WordCloud

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
import nltk

from sklearn.metrics import f1_score


Using TensorFlow backend.


In [33]:
lionking_dt.columns

Index(['rating', 'review', 'displayName', 'displayImageUrl', 'isVerified',
       'isSuperReviewer', 'hasSpoilers', 'hasProfanity', 'createDate',
       'updateDate', 'score', 'timeFromCreation', 'user.userId', 'user.realm',
       'user.displayName', 'user.accountLink', 'sentiment',
       'review_processed_1', 'polarity', 'review_len', 'word_count'],
      dtype='object')

In [34]:
lionking_dt = lionking_dt.drop(labels=['rating', 'displayName', 'displayImageUrl', 'isVerified',
       'isSuperReviewer', 'hasSpoilers', 'hasProfanity', 'createDate',
       'updateDate', 'score', 'timeFromCreation', 'user.userId', 'user.realm',
       'user.displayName', 'user.accountLink', 'polarity', 'review_len', 'word_count'], axis=1)

In [35]:
lionking_dt.sample()

Unnamed: 0,review,sentiment,review_processed_1
210,Just like the childhood movie I love.,0,like childhood movie love


# For  Raw Review (cv & Tfidf)

In [104]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r = cv.fit_transform(X_train_r)
tdm_test_r = cv.transform(X_test_r)
tdm_final_test_dt=cv.transform(final_test_dt.Review)
##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r = tfidf_vectorizer.fit_transform(X_train_r)
tfidf_test_r = tfidf_vectorizer.transform(X_test_r)
tfidf_final_test_dt = tfidf_vectorizer.transform(final_test_dt.Review)


In [105]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
# tdm_train_r1 = cv.fit_transform(X_train_r1)
# tdm_test_r1 = cv.transform(X_test_r1)

# ##########################################################################################
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
# tfidf_train_r1 = tfidf_vectorizer.fit_transform(X_train_r1)
# tfidf_test_r1 = tfidf_vectorizer.transform(X_test_r1)


## Logistic regression
    1.count vectorizer

In [106]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tdm_train_r,y_train_r) #or mat which is in dense format can also be used

#prediction on train data
train_pred_log_tdm_r = lr_clf.predict(tdm_train_r)

#predicting on test data
test_pred_log_tdm_r = lr_clf.predict(tdm_test_r)

In [107]:
f1_train_r_log=f1_score( y_train_r, train_pred_log_tdm_r, labels=None, pos_label=1, average='binary')
f1_test_r_log=f1_score( y_test_r, test_pred_log_tdm_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_log)
print(f1_test_r_log)

0.9235400361228175
0.7425742574257426


## logistic regression tfidf vectorizer 

In [108]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tfidf_train_r,y_train_r) #or mat which is in dense format can also be used

#prediction on train data
train_pred_log_tf_r = lr_clf.predict(tfidf_train_r)

#predicting on test data
test_pred_log_tf_r = lr_clf.predict(tfidf_test_r)

In [109]:
f1_train_r_log=f1_score( y_train_r, train_pred_log_tf_r, labels=None, pos_label=1, average='binary')
f1_test_r_log=f1_score( y_test_r, test_pred_log_tf_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_log)
print(f1_test_r_log)

0.9241126070991433
0.7575757575757576


In [110]:
# >>> param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
# >>> clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid)
# GridSearchCV(cv=None,
#              estimator=LogisticRegression(C=1.0, intercept_scaling=1,   
#                dual=False, fit_intercept=True, penalty='l2', tol=0.0001),
#              param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})

## Grid search on logistic regression 

In [119]:
## Use Grid Search for parameter tuning

from sklearn.model_selection import GridSearchCV

 

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],}

 
log_grid_r= GridSearchCV(estimator = LogisticRegression(penalty='l2'), param_grid = param_grid, cv =None)

In [112]:
#training the model
lr_clf = log_grid_r.fit(tdm_train_r,y_train_r) #or mat which is in dense format can also be used

#prediction on train data
train_pred_log_tdm__grid_r = lr_clf.predict(tdm_train_r)

#predicting on test data
test_pred_log_tdm_grid_r = lr_clf.predict(tdm_test_r)

In [113]:
f1_train_r_log=f1_score( y_train_r, train_pred_log_tdm__grid_r, labels=None, pos_label=1, average='binary')
f1_test_r_log=f1_score( y_test_r, test_pred_log_tdm_grid_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_log)
print(f1_test_r_log)

0.9235400361228175
0.7425742574257426


In [120]:
lr_clf = log_grid_r.fit(tfidf_train_r,y_train_r) #or mat which is in dense format can also be used

#prediction on train data
train_pred_log_tf_grid_r = lr_clf.predict(tfidf_train_r)

#predicting on test data
test_pred_log_tf_grid_r = lr_clf.predict(tfidf_test_r)

In [121]:
f1_train_r_log=f1_score( y_train_r, train_pred_log_tf_grid_r, labels=None, pos_label=1, average='binary')
f1_test_r_log=f1_score( y_test_r, test_pred_log_tf_grid_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_log)
print(f1_test_r_log)

1.0
0.781038374717833


In [128]:
final_predict_m4=log_grid_r.predict(tfidf_final_test_dt)

In [129]:
final_predict_m4

array([0, 0, 0, ..., 1, 0, 1])

In [130]:
pd.Series(final_predict_m2).value_counts()

0    878
1    322
dtype: int64

In [131]:
final_test_dt.sample

<bound method NDFrame.sample of       ReviewID  \
0     92876      
1     92877      
2     92878      
3     92879      
4     92880      
...     ...      
1195  94071      
1196  94072      
1197  94073      
1198  94074      
1199  94075      

                                                                                                                                                                                                                                                                                                                     Review  \
0     Was good. Nothing like the original but I believe that was the point.                                                                                                                                                                                                                                                   
1     I absolutely loved it! A wonderful rendition of the original. Just dont compare to the cartoon version. Let

In [132]:
final_test_dt['sentiment']=final_predict_m

f11=final_test_dt.drop(labels=['Review','review_processed_1'], axis=1)

f11.sample(20)

f11.to_csv("model29.csv",header=True,index=False)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r1 = cv.fit_transform(X_train_r1)
tdm_test_r1 = cv.transform(X_test_r1)

##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r1 = tfidf_vectorizer.fit_transform(X_train_r1)
tfidf_test_r1 = tfidf_vectorizer.transform(X_test_r1)


In [59]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tdm_train_r1,y_train_r1) #or mat which is in dense format can also be used

#prediction on train data
train_pred_r1 = lr_clf.predict(tdm_train_r1)

#predicting on test data
test_pred_r1 = lr_clf.predict(tdm_test_r1)

In [60]:
f1_train_r1_log=f1_score( y_train_r1, train_pred_r1, labels=None, pos_label=1, average='binary')
f1_test_r1_log=f1_score( y_test_r1, test_pred_r1, labels=None, pos_label=1, average='binary')
print(f1_train_r1_log)
print(f1_test_r1_log)

0.9073288915808602
0.7461928934010152


In [61]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tfidf_train_r1,y_train_r1) #or mat which is in dense format can also be used

#prediction on train data
train_pred_r1 = lr_clf.predict(tfidf_train_r1)

#predicting on test data
test_pred_r1 = lr_clf.predict(tfidf_test_r1)

In [62]:
f1_train_r1_log=f1_score( y_train_r1, train_pred_r1, labels=None, pos_label=1, average='binary')
f1_test_r1_log=f1_score( y_test_r1, test_pred_r1, labels=None, pos_label=1, average='binary')
print(f1_train_r1_log)
print(f1_test_r1_log)

0.9093167701863354
0.7239583333333333


#### random forest

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r = cv.fit_transform(X_train_r)
tdm_test_r = cv.transform(X_test_r)
tdm_final_test_dt=cv.transform(final_test_dt.Review)
##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r = tfidf_vectorizer.fit_transform(X_train_r)
tfidf_test_r = tfidf_vectorizer.transform(X_test_r)
tfidf_final_test_dt = tfidf_vectorizer.transform(final_test_dt.Review)


In [64]:
#build a random forest classifiers
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()


In [65]:
rf_clf = rf.fit(tdm_train_r,y_train_r)
#prediction on train data
train_pred_r = rf_clf.predict(tdm_train_r)

#predicting on test data
test_pred_r = rf_clf.predict(tdm_test_r)

In [66]:
f1_train_r_rf=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_rf=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_rf)
print(f1_test_r_rf)

0.9769094138543517
0.5754189944134079


In [67]:
rf_clf = rf.fit(tfidf_train_r,y_train_r)
#prediction on train data
train_pred_r = rf_clf.predict(tfidf_train_r)

#predicting on test data
test_pred_r = rf_clf.predict(tfidf_test_r)

In [68]:
f1_train_r_rf=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_rf=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_rf)
print(f1_test_r_rf)

0.9707112970711297
0.513595166163142


#### svm/cv

In [71]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r = cv.fit_transform(X_train_r)
tdm_test_r = cv.transform(X_test_r)
tdm_final_test_dt=cv.transform(final_test_dt.Review)
##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r = tfidf_vectorizer.fit_transform(X_train_r)
tfidf_test_r = tfidf_vectorizer.transform(X_test_r)
tfidf_final_test_dt = tfidf_vectorizer.transform(final_test_dt.Review)


In [49]:
## Build a SVM Classifier
from sklearn.svm import SVC

## Create an SVC object and print it to see the default arguments
svc = SVC()
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [50]:
svc_c10_rbf = SVC(C=10,kernel='rbf')
svc_c10_rbf

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [51]:
## Fit the model svc_c10_rbf on the train data (X_train,y_train)
svc_c10_rbf.fit(X = tdm_train_r,y = y_train_r)

#prediction on train data
train_pred_r = svc_c10_rbf.predict(tdm_train_r)

#predicting on test data
test_pred_r = svc_c10_rbf.predict(tdm_test_r)

In [52]:
f1_train_r_svc=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_svc=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_svc)
print(f1_test_r_svc)

0.30531400966183575
0.2421875


#### Grid search on svc

In [144]:
## Use Grid Search for parameter tuning

from sklearn.model_selection import GridSearchCV

svc_grid = SVC()
 

param_grid = {

'C': [0.001, 0.01, 0.1, 1, 10],
'gamma': [0.001, 0.01, 0.1, 1], 
'kernel':['linear', 'poly', 'rbf', 'sigmoid']}

 
svc_cv_grid = GridSearchCV(estimator = svc_grid, param_grid = param_grid, cv = 10)

In [145]:
## Fit the grid search model
svc_cv_grid.fit(X = tfidf_train_r1, y = y_train_r1)

train_pred_r1 = svc_cv_grid.predict(tfidf_train_r1)

test_pred_r1 = svc_cv_grid.predict(tfidf_test_r1)

In [146]:
## Print best score and parameters
print(svc_cv_grid.best_score_,svc_cv_grid.best_params_)

0.82875 {'C': 1, 'gamma': 0.001, 'kernel': 'linear'}


In [150]:
svc_clf_grid = SVC(C=10, gamma=0.001, kernel= 'linear')

In [151]:
## Fit the grid search model
svc_clf_grid.fit(X = tfidf_train_r1, y = y_train_r)

train_pred_r = svc_clf_grid.predict(tfidf_train_r1)

test_pred_r = svc_clf_grid.predict(tfidf_test_r1)

In [152]:
f1_train_r_svc=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_svc=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_svc)
print(f1_test_r_svc)

0.9982507288629737
0.724220623501199


In [218]:
## Fit the grid search model
svc_cv_grid.fit(X = tfidf_train_r1, y = y_train_r1)

train_pred_r1_tf = svc_cv_grid.predict(tfidf_train_r1)

test_pred_r1_tf = svc_cv_grid.predict(tfidf_test_r1)

In [219]:
f1_train_r1_svc=f1_score( y_train_r1, train_pred_r1, labels=None, pos_label=1, average='binary')
f1_test_r1_svc=f1_score( y_test_r1, test_pred_r1, labels=None, pos_label=1, average='binary')
print(f1_train_r1_svc)
print(f1_test_r1_svc)

0.8653500897666068
0.6257668711656441


In [220]:
## Print best score and parameters
print(svc_cv_grid.best_score_,svc_cv_grid.best_params_)

0.8257777777777778 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [72]:
svc_clf_grid = SVC(C=7.1, gamma=0.1, kernel= 'rbf')

In [73]:
## Fit the grid search model
svc_clf_grid.fit(X = tfidf_train_r, y = y_train_r)

train_pred_r = svc_clf_grid.predict(tfidf_train_r)

test_pred_r = svc_clf_grid.predict(tfidf_test_r)

In [74]:
f1_train_r_svc=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_svc=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_svc)
print(f1_test_r_svc)

0.9935785172212492
0.7835990888382689


In [226]:
final_test_ids_dt=pd.read_csv("test.csv")


In [227]:
tfidf_final_test_2= tfidf_vectorizer.transform(final_test_ids_dt)

In [75]:
final_predict_m=svc_clf_grid.predict(tfidf_final_test_dt)

In [76]:
final_predict_m

array([0, 0, 0, ..., 1, 0, 1])

In [77]:
pd.Series(final_predict_m).value_counts()

0    878
1    322
dtype: int64

In [78]:
final_test_dt.sample

<bound method NDFrame.sample of       ReviewID  \
0     92876      
1     92877      
2     92878      
3     92879      
4     92880      
...     ...      
1195  94071      
1196  94072      
1197  94073      
1198  94074      
1199  94075      

                                                                                                                                                                                                                                                                                                                     Review  \
0     Was good. Nothing like the original but I believe that was the point.                                                                                                                                                                                                                                                   
1     I absolutely loved it! A wonderful rendition of the original. Just dont compare to the cartoon version. Let

In [80]:
final_test_dt['sentiment']=final_predict_m

f11=final_test_dt.drop(labels=['Review','review_processed_1'], axis=1)

f11.sample(20)

f11.to_csv("model27.csv",header=True,index=False)

## review processed 1

In [60]:
from sklearn.model_selection import train_test_split
X_train_r2, X_test_r2, y_train_r2, y_test_r2 = train_test_split(lionking_dt['review_processed_1'],lionking_dt['sentiment'],test_size=0.20,random_state=123)

In [61]:
print(len(X_train_r2))
print(len(X_test_r2))

2400
600


In [62]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r2 = cv.fit_transform(X_train_r2)
tdm_test_r2 = cv.transform(X_test_r2)

##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r2 = tfidf_vectorizer.fit_transform(X_train_r2)
tfidf_test_r2 = tfidf_vectorizer.transform(X_test_r2)


In [63]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tdm_train_r2,y_train_r2) #or mat which is in dense format can also be used

#prediction on train data
train_pred_r2 = lr_clf.predict(tdm_train_r2)

#predicting on test data
test_pred_r2 = lr_clf.predict(tdm_test_r2)

In [64]:
print("train cfm :","\n", confusion_matrix(y_train_r2,train_pred_r2))
print("train cfm :","\n", confusion_matrix(y_test_r2,test_pred_r2))

train cfm : 
 [[1499   42]
 [ 111  748]]
train cfm : 
 [[334  48]
 [ 73 145]]


In [65]:
f1_train_r2_log=f1_score( y_train_r2, train_pred_r2, labels=None, pos_label=1, average='binary')
f1_test_r2_log=f1_score( y_test_r2, test_pred_r2, labels=None, pos_label=1, average='binary')
print(f1_train_r2_log)
print(f1_test_r2_log)

0.9072164948453609
0.7055961070559612


#### tfidf

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r2 = cv.fit_transform(X_train_r2)
tdm_test_r2 = cv.transform(X_test_r2)

##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r2 = tfidf_vectorizer.fit_transform(X_train_r2)
tfidf_test_r2 = tfidf_vectorizer.transform(X_test_r2)


In [70]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tfidf_train_r2,y_train_r2) #or mat which is in dense format can also be used

#prediction on train data
train_pred_r2 = lr_clf.predict(tfidf_train_r2)

#predicting on test data
test_pred_r2 = lr_clf.predict(tfidf_test_r2)

In [71]:
print("train cfm :","\n", confusion_matrix(y_train_r2,train_pred_r2))
print("train cfm :","\n", confusion_matrix(y_test_r2,test_pred_r2))

train cfm : 
 [[1521   20]
 [ 123  736]]
train cfm : 
 [[346  36]
 [ 87 131]]


In [72]:
f1_train_r2_log=f1_score( y_train_r2, train_pred_r2, labels=None, pos_label=1, average='binary')
f1_test_r2_log=f1_score( y_test_r2, test_pred_r2, labels=None, pos_label=1, average='binary')
print(f1_train_r2_log)
print(f1_test_r2_log)

0.9114551083591331
0.6805194805194804


#### random forest

In [73]:
#build a random forest classifiers
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()


In [74]:
rf_clf = rf.fit(tdm_train_r2,y_train_r2)
#prediction on train data
train_pred_r2 = rf_clf.predict(tdm_train_r2)

#predicting on test data
test_predr2 = rf_clf.predict(tdm_test_r2)

In [75]:
f1_train_r2_rf=f1_score( y_train_r2, train_pred_r2, labels=None, pos_label=1, average='binary')
f1_test_r2_rf=f1_score( y_test_r2, test_pred_r2, labels=None, pos_label=1, average='binary')
print(f1_train_r2_rf)
print(f1_test_r2_rf)

0.9775147928994083
0.6805194805194804


#### svm

In [76]:
## Build a SVM Classifier
from sklearn.svm import SVC

## Create an SVC object and print it to see the default arguments
svc = SVC()
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [77]:
svc_c10_rbf = SVC(C=10,kernel='rbf')
svc_c10_rbf

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [78]:
## Fit the model svc_c10_rbf on the train data (X_train,y_train)
svc_c10_rbf.fit(X = tdm_train_r2,y = y_train_r2)

#prediction on train data
train_pred_r2 = svc_c10_rbf.predict(tdm_train_r2)

#predicting on test data
test_pred_r2 = svc_c10_rbf.predict(tdm_test_r2)

In [79]:
f1_train_r2_svc=f1_score( y_train_r2, train_pred_r2, labels=None, pos_label=1, average='binary')
f1_test_r2_svc=f1_score( y_test_r2, test_pred_r2, labels=None, pos_label=1, average='binary')
print(f1_train_r2_svc)
print(f1_test_r2_svc)

0.32887189292543023
0.234375


#### Grid search on svc

In [80]:
## Use Grid Search for parameter tuning

from sklearn.model_selection import GridSearchCV

svc_grid = SVC()
 

param_grid = {

'C': [0.001, 0.01, 0.1, 1, 10],
'gamma': [0.001, 0.01, 0.1, 1], 
'kernel':['linear', 'poly', 'rbf', 'sigmoid']}

 
svc_cv_grid = GridSearchCV(estimator = svc_grid, param_grid = param_grid, cv = 10)

In [224]:
## Fit the grid search model
svc_cv_grid.fit(X = tdm_train_r1, y = y_train_r1)

train_pred_r1 = svc_cv_grid.predict(tdm_train_r1)

test_pred_r1 = svc_cv_grid.predict(tdm_test_r1)

In [214]:
## Print best score and parameters
print(svc_cv_grid.best_score_,svc_cv_grid.best_params_)

0.8093333333333333 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}


In [81]:
svc_clf_grid = SVC(C=10, gamma=0.01, kernel= 'rbf')

In [84]:
## Fit the grid search model
svc_clf_grid.fit(X = tdm_train_r1, y = y_train_r1)

train_pred_r1 = svc_clf_grid.predict(tdm_train_r1)

test_pred_r1= svc_clf_grid.predict(tdm_test_r1)

In [83]:
f1_train_r2_svc=f1_score( y_train_r2, train_pred_r2, labels=None, pos_label=1, average='binary')
f1_test_r2_svc=f1_score( y_test_r2, test_pred_r2, labels=None, pos_label=1, average='binary')
print(f1_train_r2_svc)
print(f1_test_r2_svc)

0.8902589395807645
0.7076923076923076


In [218]:
# ## Fit the grid search model
# svc_cv_grid.fit(X = tfidf_train_r1, y = y_train_r1)

# train_pred_r1_tf = svc_cv_grid.predict(tfidf_train_r1)

# test_pred_r1_tf = svc_cv_grid.predict(tfidf_test_r1)

In [85]:
f1_train_r_svc=f1_score( y_train_r1, train_pred_r1, labels=None, pos_label=1, average='binary')
f1_test_r_svc=f1_score( y_test_r1, test_pred_r1, labels=None, pos_label=1, average='binary')
print(f1_train_r_svc)
print(f1_test_r_svc)

0.8899876390605685
0.7253886010362695


In [220]:
# ## Print best score and parameters
# print(svc_cv_grid.best_score_,svc_cv_grid.best_params_)

0.8257777777777778 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [86]:
svc_clf_grid = SVC(C=10, gamma=0.1, kernel= 'rbf')

In [87]:
## Fit the grid search model
svc_clf_grid.fit(X = tfidf_train_r2, y = y_train_r2)

train_pred_r2_svc = svc_clf_grid.predict(tfidf_train_r2)

test_pred_r2_svc = svc_clf_grid.predict(tfidf_test_r2)

In [88]:
f1_train_r2_svc=f1_score( y_train_r2, train_pred_r2_svc, labels=None, pos_label=1, average='binary')
f1_test_r2_svc=f1_score( y_test_r2, test_pred_r2_svc, labels=None, pos_label=1, average='binary')
print(f1_train_r2_svc)
print(f1_test_r2_svc)

0.9941724941724941
0.7327188940092166


### review processed 3

In [136]:
from sklearn.model_selection import train_test_split
X_train_r3, X_test_r3, y_train_r3, y_test_r3 = train_test_split(train_text_dt['review_processed_3'],train_text_dt['sentiment'],test_size=0.25,random_state=123)

In [137]:
print(len(X_train_r3))
print(len(X_test_r3))

2250
750


In [138]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r3 = cv.fit_transform(X_train_r3)
tdm_test_r3 = cv.transform(X_test_r3)

##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r3 = tfidf_vectorizer.fit_transform(X_train_r3)
tfidf_test_r3 = tfidf_vectorizer.transform(X_test_r3)


In [141]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tdm_train_r3,y_train_r3) #or mat which is in dense format can also be used

#prediction on train data
train_pred_r3 = lr_clf.predict(tdm_train_r3)

#predicting on test data
test_pred_r3 = lr_clf.predict(tdm_test_r3)

In [142]:
print("train cfm :","\n", confusion_matrix(y_train_r3,train_pred_r3))
print("train cfm :","\n", confusion_matrix(y_test_r3,test_pred_r3))

train cfm : 
 [[1610   16]
 [ 105  519]]
train cfm : 
 [[511  40]
 [ 87 112]]


In [153]:
f1_train_r3_log=f1_score( y_train_r3, train_pred_r3, labels=None, pos_label=1, average='binary')
f1_test_r3_log=f1_score( y_test_r3, test_pred_r3, labels=None, pos_label=1, average='binary')
print(f1_train_r3_log)
print(f1_test_r3_log)

0.8955996548748921
0.6381766381766382


#### tfidf

In [166]:
from sklearn.model_selection import train_test_split
X_train_r3, X_test_r3, y_train_r3, y_test_r3 = train_test_split(train_text_dt['review_processed_3'],train_text_dt['sentiment'],test_size=0.25,random_state=123)

In [167]:
print(len(X_train_r3))
print(len(X_test_r3))

2250
750


In [168]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r3 = cv.fit_transform(X_train_r3)
tdm_test_r3 = cv.transform(X_test_r3)

##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r3 = tfidf_vectorizer.fit_transform(X_train_r3)
tfidf_test_r3 = tfidf_vectorizer.transform(X_test_r3)


In [169]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tfidf_train_r3,y_train_r3) #or mat which is in dense format can also be used

#prediction on train data
train_pred_r3 = lr_clf.predict(tfidf_train_r3)

#predicting on test data
test_pred_r3 = lr_clf.predict(tfidf_test_r3)

In [170]:
print("train cfm :","\n", confusion_matrix(y_train_r3,train_pred_r3))
print("train cfm :","\n", confusion_matrix(y_test_r3,test_pred_r3))

train cfm : 
 [[1622    4]
 [ 259  365]]
train cfm : 
 [[537  14]
 [132  67]]


In [171]:
f1_train_r3_log=f1_score( y_train_r3, train_pred_r3, labels=None, pos_label=1, average='binary')
f1_test_r3_log=f1_score( y_test_r3, test_pred_r3, labels=None, pos_label=1, average='binary')
print(f1_train_r3_log)
print(f1_test_r3_log)

0.7351460221550856
0.47857142857142865


#### random forest

In [186]:
#build a random forest classifiers
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()


In [187]:
rf_clf = rf.fit(tdm_train_r3,y_train_r3)
#prediction on train data
train_pred_r3 = rf_clf.predict(tdm_train_r3)

#predicting on test data
test_pred_r3 = rf_clf.predict(tdm_test_r3)

In [188]:
f1_train_r3_rf=f1_score( y_train_r3, train_pred_r3, labels=None, pos_label=1, average='binary')
f1_test_r3_rf=f1_score( y_test_r3, test_pred_r3, labels=None, pos_label=1, average='binary')
print(f1_train_r3_rf)
print(f1_test_r3_rf)

0.9713349713349713
0.47741935483870973


#### svm

In [199]:
## Build a SVM Classifier
from sklearn.svm import SVC

## Create an SVC object and print it to see the default arguments
svc = SVC()
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [200]:
svc_c10_rbf = SVC(C=10,kernel='rbf')
svc_c10_rbf

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [201]:
## Fit the model svc_c10_rbf on the train data (X_train,y_train)
svc_c10_rbf.fit(X = tdm_train_r3,y = y_train_r3)

#prediction on train data
train_pred_r3 = svc_c10_rbf.predict(tdm_train_r3)

#predicting on test data
test_pred_r3 = svc_c10_rbf.predict(tdm_test_r3)

In [202]:
f1_train_r3_svc=f1_score( y_train_r3, train_pred_r3, labels=None, pos_label=1, average='binary')
f1_test_r3_svc=f1_score( y_test_r3, test_pred_r3, labels=None, pos_label=1, average='binary')
print(f1_train_r3_svc)
print(f1_test_r3_svc)

0.16593886462882096
0.12962962962962962


#### Grid search on svc

In [238]:
## Use Grid Search for parameter tuning

from sklearn.model_selection import GridSearchCV

svc_grid = SVC()
 

param_grid = {

'C': [0.001, 0.01, 0.1, 1, 10],
'gamma': [0.001, 0.01, 0.1, 1], 
'kernel':['linear', 'poly', 'rbf', 'sigmoid']}

 
svc_cv_grid = GridSearchCV(estimator = svc_grid, param_grid = param_grid, cv = 10)

In [224]:
# ## Fit the grid search model
# svc_cv_grid.fit(X = tdm_train_r3, y = y_train_r3)

# train_pred_r3 = svc_cv_grid.predict(tdm_train_r3)

# test_pred_r3 = svc_cv_grid.predict(tdm_test_r3)

In [214]:
# ## Print best score and parameters
# print(svc_cv_grid.best_score_,svc_cv_grid.best_params_)

0.8093333333333333 {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}


In [239]:
svc_clf_grid = SVC(C=10, gamma=0.01, kernel= 'rbf')

In [240]:
## Fit the grid search model
svc_clf_grid.fit(X = tdm_train_r3, y = y_train_r3)

train_pred_r3 = svc_clf_grid.predict(tdm_train_r3)

test_pred_r3 = svc_clf_grid.predict(tdm_test_r3)

In [241]:
f1_train_r3_svc=f1_score( y_train_r3, train_pred_r3, labels=None, pos_label=1, average='binary')
f1_test_r3_svc=f1_score( y_test_r3, test_pred_r3, labels=None, pos_label=1, average='binary')
print(f1_train_r3_svc)
print(f1_test_r3_svc)

0.875886524822695
0.6312684365781711


In [218]:
# ## Fit the grid search model
# svc_cv_grid.fit(X = tfidf_train_r1, y = y_train_r1)

# train_pred_r1_tf = svc_cv_grid.predict(tfidf_train_r1)

# test_pred_r1_tf = svc_cv_grid.predict(tfidf_test_r1)

In [219]:
# f1_train_r1_svc=f1_score( y_train_r1, train_pred_r1, labels=None, pos_label=1, average='binary')
# f1_test_r1_svc=f1_score( y_test_r1, test_pred_r1, labels=None, pos_label=1, average='binary')
# print(f1_train_r1_svc)
# print(f1_test_r1_svc)

0.8653500897666068
0.6257668711656441


In [220]:
# ## Print best score and parameters
# print(svc_cv_grid.best_score_,svc_cv_grid.best_params_)

0.8257777777777778 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [263]:
svc_clf_grid = SVC(C=8, gamma=0.1, kernel= 'rbf')

In [264]:
## Fit the grid search model
svc_clf_grid.fit(X = tfidf_train_r3, y = y_train_r3)

train_pred_r3_svc = svc_clf_grid.predict(tfidf_train_r3)

test_pred_r3_svc = svc_clf_grid.predict(tfidf_test_r3)

In [265]:
f1_train_r3_svc=f1_score( y_train_r3, train_pred_r3_svc, labels=None, pos_label=1, average='binary')
f1_test_r3_svc=f1_score( y_test_r3, test_pred_r3_svc, labels=None, pos_label=1, average='binary')
print(f1_train_r3_svc)
print(f1_test_r3_svc)

0.9952
0.7379679144385026


### review

In [145]:
from sklearn.model_selection import train_test_split
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(train_text_dt['review'],train_text_dt['sentiment'],test_size=0.25,random_state=123)

In [146]:
print(len(X_train_r))
print(len(X_test_r))

2250
750


In [147]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r = cv.fit_transform(X_train_r)
tdm_test_r = cv.transform(X_test_r)

##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r = tfidf_vectorizer.fit_transform(X_train_r)
tfidf_test_r = tfidf_vectorizer.transform(X_test_r)


In [149]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tdm_train_r,y_train_r) #or mat which is in dense format can also be used

#prediction on train data
train_pred_r = lr_clf.predict(tdm_train_r)

#predicting on test data
test_pred_r = lr_clf.predict(tdm_test_r)

In [150]:
print("train cfm :","\n", confusion_matrix(y_train_r,train_pred_r))
print("train cfm :","\n", confusion_matrix(y_test_r,test_pred_r))

train cfm : 
 [[1610   16]
 [ 105  519]]
train cfm : 
 [[511  40]
 [ 87 112]]


In [152]:
f1_train_r_log=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_log=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_log)
print(f1_test_r_log)

0.8955996548748921
0.6381766381766382


#### tfidf

In [172]:
from sklearn.model_selection import train_test_split
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(train_text_dt['review'],train_text_dt['sentiment'],test_size=0.25,random_state=123)

In [173]:
print(len(X_train_r))
print(len(X_test_r))

2250
750


In [174]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english",strip_accents="unicode",decode_error="ignore")
tdm_train_r = cv.fit_transform(X_train_r)
tdm_test_r = cv.transform(X_test_r)

##########################################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_train_r = tfidf_vectorizer.fit_transform(X_train_r)
tfidf_test_r = tfidf_vectorizer.transform(X_test_r)


In [175]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

#training the model
logreg = LogisticRegression()
lr_clf = logreg.fit(tfidf_train_r,y_train_r) #or mat which is in dense format can also be used

#prediction on train data
train_pred_r = lr_clf.predict(tfidf_train_r)

#predicting on test data
test_pred_r = lr_clf.predict(tfidf_test_r)

In [176]:
print("train cfm :","\n", confusion_matrix(y_train_r,train_pred_r))
print("train cfm :","\n", confusion_matrix(y_test_r,test_pred_r))

train cfm : 
 [[1622    4]
 [ 259  365]]
train cfm : 
 [[537  14]
 [132  67]]


In [177]:
f1_train_r_log=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_log=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_log)
print(f1_test_r_log)

0.7351460221550856
0.47857142857142865


#### random forest

In [189]:
#build a random forest classifiers
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()


In [190]:
rf_clf = rf.fit(tdm_train_r,y_train_r)
#prediction on train data
train_pred_r = rf_clf.predict(tdm_train_r)

#predicting on test data
test_pred_r = rf_clf.predict(tdm_test_r)

In [191]:
f1_train_r_rf=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_rf=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_rf)
print(f1_test_r_rf)

0.9617940199335548
0.47297297297297297


#### svm

In [195]:
## Build a SVM Classifier
from sklearn.svm import SVC

## Create an SVC object and print it to see the default arguments
svc = SVC()
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [196]:
svc_c10_rbf = SVC(C=10,kernel='rbf')
svc_c10_rbf

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [197]:
## Fit the model svc_c10_rbf on the train data (X_train,y_train)
svc_c10_rbf.fit(X = tdm_train_r,y = y_train_r)

#prediction on train data
train_pred_r = svc_c10_rbf.predict(tdm_train_r)

#predicting on test data
test_pred_r = svc_c10_rbf.predict(tdm_test_r)

In [198]:
f1_train_r_svc=f1_score( y_train_r, train_pred_r, labels=None, pos_label=1, average='binary')
f1_test_r_svc=f1_score( y_test_r, test_pred_r, labels=None, pos_label=1, average='binary')
print(f1_train_r_svc)
print(f1_test_r_svc)

0.16593886462882096
0.12962962962962962
