In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from wordcloud import WordCloud, STOPWORDS
import re, string, unicodedata
from string import punctuation

# import warnings
# warnings.filterwarnings('ignore')

## Data Loading

In [2]:
df = pd.read_csv(r'G:\ML projects\IMDB-Dataset.csv', encoding = 'latin-1')

## Data Cleaning And Preprocessing

In [3]:
stop_words = stopwords.words('english')
new_stopwords = ["would", "shall", "could", "might", "film", "movie", "director", "scene", "character", "actor", "actress"]
stop_words.extend(new_stopwords)

negations_and_sentiment_words = ["not", "no", "never", "n't", "none", "good", "bad", "love", "hate"]

for word in negations_and_sentiment_words:
    if word in stop_words:
        stop_words.remove(word)

stop_words = set(stop_words)
print(stop_words)

{'his', 'again', "it's", 'yours', 'through', 'over', 's', "haven't", 'you', 'most', 'then', 'so', 'while', 'how', 'hasn', 'your', "isn't", 'at', 'from', 'my', "wasn't", 'me', 'in', 'don', 'been', 'to', 'isn', 'director', 'between', 'a', 'him', 'up', 'her', 'film', 'during', 'haven', 'its', 'myself', "don't", 'other', 'yourselves', 'movie', 'mustn', 'doesn', 'herself', 'was', 'same', 'very', 'just', "won't", "you'll", 've', 'and', 'why', 'own', 'is', 'our', 'about', "she's", 'hadn', 'into', 'weren', 'or', 'their', 'm', 'whom', "hadn't", 'down', 'could', "couldn't", 'before', "needn't", 'couldn', 'wasn', 'all', 'ain', 'against', 'shouldn', 'the', 'out', 'as', "that'll", 'until', 'might', 'being', "weren't", 'if', 'only', 'than', 'having', 'were', 'are', 'which', 'has', 'he', 'had', 'aren', 'actress', 'each', 'them', 'of', 'shan', 'what', 'under', 'it', 't', 'below', 'ma', 'theirs', 'for', 'i', 'needn', 'would', "aren't", 'character', 'by', 'have', 'shall', 'this', 'd', 'who', "wouldn't",

In [4]:
'''-----------------------------Data Cleaning and Preprocessing pipeline----------------------------------'''

#Removing special character
def remove_special_character(content):
    # return re.sub(r'\W+',' ', content )
    return re.sub(r'\[[^&@#!]]*\]', '', content)

# Removing URL's
def remove_url(content):
    return re.sub(r'http\S+', '', content)

#Removing the stopwords from text
def remove_stopwords(content):
    clean_data = []
    for i in content.split():
        if i.strip().lower() not in stop_words and i.strip().lower().isalpha():
            clean_data.append(i.strip().lower())
    return " ".join(clean_data)

# Expansion of english contractions
def contraction_expansion(content):
    content = re.sub(r"won\'t", "would not", content)
    content = re.sub(r"can\'t", "can not", content)
    content = re.sub(r"don\'t", "do not", content)
    content = re.sub(r"shouldn\'t", "should not", content)
    content = re.sub(r"needn\'t", "need not", content)
    content = re.sub(r"hasn\'t", "has not", content)
    content = re.sub(r"haven\'t", "have not", content)
    content = re.sub(r"weren\'t", "were not", content)
    content = re.sub(r"mightn\'t", "might not", content)
    content = re.sub(r"didn\'t", "did not", content)
    content = re.sub(r"n\'t", " not", content)
    '''content = re.sub(r"\'re", " are", content)
    content = re.sub(r"\'s", " is", content)
    content = re.sub(r"\'d", " would", content)
    content = re.sub(r"\'ll", " will", content)
    content = re.sub(r"\'t", " not", content)
    content = re.sub(r"\'ve", " have", content)
    content = re.sub(r"\'m", " am", content)'''
    return content

#Data preprocessing
def data_cleaning(content):
    content = contraction_expansion(content)
    content = remove_special_character(content)
    content = remove_url(content)
    
    content = remove_stopwords(content)    
    return content

In [5]:
pd.options.display.max_colwidth = 1000

#Data cleaning
df['Reviews_clean']=df['Reviews'].apply(data_cleaning)
df.head(5)

Unnamed: 0,Ratings,Reviews,Movies,Resenhas,Reviews_clean
0,1.0,"*Disclaimer: I only watched this movie as a conditional agreement. And I see films for free. I wouldn't be caught dead giving my hard earned money to these idiots.Well, to explain the depth of this 'film', I could write my shortest review, ever. Don't see this movie. It is by far the stupidest, lamest, most lazy, and unbelievably UNFUNNY movie I have ever seen. It is a total disaster. But since my hatred for this movie, and the others like it, extends far beyond one viewing, I think I'll go on for a bit.I don't know any of the people in the movie besides Carmen Electra, Vanessa Minnillo, and Kim Kardashian, but it doesn't matter. They're all horrible, though I think that was the point. The editing is flat out horrible, and possibly blatant continuity errors make this crapfast even crappier than I thought it would be. Now I know that these films are not supposed to be serious at all, but come on, it's film-making 101 that if someone gets a minor facial cut, it should be there in the...",Disaster Movie,"* IsenÃ§Ã£o de responsabilidade: eu sÃ³ assisti esse filme como um acordo condicional. E eu vejo filmes de graÃ§a. Eu nÃ£o seria pego morto dando meu dinheiro suado a esses idiotas. Bem, para explicar a profundidade desse 'filme', eu poderia escrever minha crÃ­tica mais curta de todos os tempos. NÃ£o vÃª este filme. Ã de longe o filme mais estÃºpido, lamenta, preguiÃ§oso e inacreditavelmente UNFUNNY que eu jÃ¡ vi. Ã um desastre total. Mas como o meu Ã³dio por este filme e por outros, se estende muito alÃ©m de uma exibiÃ§Ã£o, acho que vou continuar um pouco. NÃ£o conheÃ§o nenhuma das pessoas do filme alÃ©m de Carmen Electra, Vanessa Minnillo, e Kim Kardashian, mas isso nÃ£o importa. Eles sÃ£o todos horrÃ­veis, embora eu ache que esse seja o ponto. A ediÃ§Ã£o Ã© horrÃ­vel e, possivelmente, erros de continuidade flagrantes tornam essa porcaria ainda mais horrÃ­vel do que eu pensava. Agora eu sei que esses filmes nÃ£o devem ser sÃ©rios, mas vamos lÃ¡, Ã© o cinema 101 que se alguÃ©m f...",watched conditional see films not caught dead giving hard earned money explain depth write shortest not see far unbelievably unfunny ever total since hatred others like extends far beyond one think go not know people besides carmen vanessa kim not though think editing flat possibly blatant continuity errors make crapfast even crappier thought know films not supposed serious come someone gets minor facial next someone gets cut blood least cut since narnia films away give disaster pass thoughtless mindless physical gags obviously take popular movies last year late including best picture know saddest thing stupid movies not care much money many cameos sorry ass excuses films taking away jobs directors truly deserve thought better taste ashamed making kind jason friedberg aaron burn guys contributing decline western cause downfall western
1,1.0,"I am writing this in hopes that this gets put over the previous review of this ""film"". How anyone can find this slop entertaining is completely beyond me. First of all a spoof film entitled ""Disaster Movie"", should indeed be a spoof on disaster films. Now I have seen 1 (yes count them, 1) disaster film being spoofed, that being ""Twister"". How does Juno, Iron Man, Batman, The Hulk, Alvin and the Chipmunks, Amy Winehouse, or Hancock register as Disaster films? Selzterwater and Failburg once again have shown that they lack any sort of writing skill and humor. Having unfortunately been tortured with Date Movie and Epic Movie I know exactly what to expect from these two...no plot, no jokes just bad references and cheaply remade scenes from other films. Someone should have informed them that satire is more than just copy and paste from one film to another, though I shouldn't say that because some of these actually just seem to be taken from trailers.There is nothing clever or witty or re...",Disaster Movie,"Estou escrevendo isso na esperanÃ§a de que isso seja colocado sobre a revisÃ£o anterior deste ""filme"". Como alguÃ©m pode achar divertido esse desleixo estÃ¡ completamente alÃ©m de mim. Antes de mais nada, um filme de parÃ³dia intitulado ""Filme de desastre"" deveria ser, de fato, uma parÃ³dia de filmes de desastre. Agora eu jÃ¡ vi 1 (sim, conte-os, 1) filme de desastre sendo falsificado, sendo ""Twister"". Como Juno, Homem de Ferro, Batman, O Hulk, Alvin e os Esquilos, Amy Winehouse ou Hancock se registram como filmes de Desastre? Selzterwater e Failburg mostraram mais uma vez que nÃ£o possuem nenhum tipo de habilidade e humor de escrita. Infelizmente, tendo sido torturado com Date Movie e Epic Movie, sei exatamente o que esperar desses dois ... nenhum enredo, nenhuma piada, apenas mÃ¡s referÃªncias e cenas refeitas de outros filmes. AlguÃ©m deveria ter informado a eles que a sÃ¡tira Ã© mais do que apenas copiar e colar de um filme para outro, embora eu nÃ£o deva dizer isso porque algu...",writing hopes gets put previous review anyone find slop entertaining completely beyond first spoof entitled indeed spoof disaster seen count disaster iron alvin amy hancock register disaster selzterwater failburg shown lack sort writing skill unfortunately tortured date epic know exactly expect no jokes bad references cheaply remade scenes someone informed satire copy paste one though not say actually seem taken nothing clever witty remotely smart way two not believe people still pay see insult though enjoy films doubt smart enough realize unfortunately not number low enough includes rate deserves top worst films right date epic mean meet rather forced hour hands marathon watch
2,1.0,"Really, I could write a scathing review of this turd sandwich, but instead, I'm just going to be making a few observations and points I've deduced.There's just no point in watching these movies anymore. Does any reader out there remember Scary Movie? Remember how it was original with a few comedic elements to it? There was slapstick, some funny lines, it was a pretty forgettable comedy, but it was worth the price of admission. Well, That was the last time this premise was funny. STOP MAKING THESE MOVIES. PLEASE.I could call for a boycott of these pieces of monkey sh*t, but we all know there's going to be a line up of pre pubescent annoying little buggers, spouting crappy one liners like, ""THIS IS SPARTA!"" and, ""IM RICK JAMES BITCH"" so these movies will continue to make some form of monetary gain, considering the production value of this movie looks like it cost about 10 cents to make.Don't see this movie. Don't spend any money on it. Go home, rent Airplane, laugh your ass off, and ...",Disaster Movie,"Realmente, eu poderia escrever uma crÃ­tica contundente sobre esse sanduÃ­che de cocÃ´, mas, em vez disso, vou fazer algumas observaÃ§Ãµes e pontos que deduzi. NÃ£o hÃ¡ mais sentido assistir a esses filmes. Algum leitor por aÃ­ se lembra do filme de terror? Lembra como era original, com alguns elementos cÃ´micos? Havia palhaÃ§ada, algumas frases engraÃ§adas, era uma comÃ©dia bastante esquecÃ­vel, mas valia o preÃ§o da entrada. Bem, essa foi a Ãºltima vez que essa premissa foi engraÃ§ada. PARE DE FAZER ESTES FILMES. POR FAVOR, eu poderia pedir um boicote a esses pedaÃ§os de macaco, mas todos sabemos que haverÃ¡ uma fila de buggers irritantes e prÃ©-pubescentes, jorrando uns forros ruins como: ""ISTO Ã SPARTA!"" e ""IM RICK JAMES BITCH"", para que esses filmes continuem gerando algum ganho monetÃ¡rio, considerando que o valor de produÃ§Ã£o deste filme parece custar cerca de 10 centavos de dÃ³lar. NÃ£o gaste dinheiro com isso. VÃ¡ para casa, alugue a Airplane, ria e julgue silenciosament...",write scathing review turd going making observations points no point watching movies reader remember scary remember original comedic elements funny pretty forgettable worth price last time premise stop making call boycott pieces monkey know going line pre pubescent annoying little spouting crappy one liners rick james movies continue make form monetary considering production value looks like cost cents not see not spend money go rent laugh ass silently judge people talking
3,1.0,"If you saw the other previous spoof movies by these two horrible gentlemen, then you should know that this already will be bad. I'll tell you the truth, if you want to watch it as a brainless person (ironically meant for the stereotypical teenagers, which I am not) then you will laugh at it a bit. But if you judge it, even a little, the movie automatically fails. Why? Never ask that when it comes to these two men.Remember the good old Hollywood days whenever making a movie was about showing people a type of art, and also a story that kept you on the edge of your seat? Well whenever word hit that making films earned you loads of cash, then all these greedy people came in the picture and its quite pathetic. These two are no exception. We still have movie artists (most notably the genius that is Christopher Nolan). But these two guys just...well I've been writing so big words, let me put it in simple terms for these guys...These guys suck, they are not artists, but instead money cravi...",Disaster Movie,"Se vocÃª viu os outros filmes falsificados anteriores por esses dois senhores horrÃ­veis, deve saber que isso jÃ¡ serÃ¡ ruim. Vou lhe dizer a verdade, se vocÃª quiser vÃª-lo como uma pessoa sem cÃ©rebro (ironicamente para os adolescentes estereotipados, o que eu nÃ£o sou), entÃ£o vocÃª rirÃ¡ um pouco. Mas se vocÃª julgar, mesmo que um pouco, o filme falha automaticamente. Por quÃª? Nunca pergunte isso quando se trata desses dois homens. Lembre-se dos bons e velhos tempos de Hollywood, sempre que fazer um filme era mostrar Ã s pessoas um tipo de arte e tambÃ©m uma histÃ³ria que o mantinha na ponta do seu assento? Bem, sempre que a notÃ­cia de que fazer filmes ganhava muito dinheiro, entÃ£o todas essas pessoas gananciosas apareciam na imagem e Ã© bastante patÃ©tico. Esses dois nÃ£o sÃ£o exceÃ§Ã£o. Ainda temos artistas de filmes (principalmente o gÃªnio Christopher Nolan). Mas esses dois caras simplesmente ... bem, eu tenho escrito palavras tÃ£o grandes, deixe-me colocar em termos sim...",saw previous spoof movies two horrible know already tell want watch brainless person meant stereotypical laugh judge even automatically never ask comes two good old hollywood days whenever making showing people type also story kept edge well whenever word hit making films earned loads greedy people came picture quite two no still artists notably genius christopher two guys writing big let put simple terms guys not instead money craving latest proves even fails easily mind mean nothing funny people usually put best stuff idiots knew going made bet not good idea write reviews paper tell everyone whats good whats flipped review well warning not even called nothing artistic references made throughout pretty much like hannah montana juno gig actually close spoofing failed referencing instead joking twisting random wrestling not know high respect high respect know not something not add story nudity not really needed closest still gotten idea saw bare hate girl says guys perverts brainles...
4,1.0,"This movie I saw a day early for free and I still feel like I got ripped off. It is totally brain dead. Burping, kicking in the groin and boobs all over the place. Lame. What is wrong with society, that films like this even get made? The parodies were all horrendous, and un-funny. The plot was lackluster at best and the acting was shallow, transparent and really quite unnecessary.Anyone see ""Idiocracy""? Remember the movie that won all the academy awards in the future? Well this is that movie. I have not seen a more rancid crappy film. ""Date Movie"" was okay, The Scary movies at least had decent plots, but this, this makes ""spoofs"" (if I can be so nice to call it that) for this year 0 for 3, with ""Meet the Spartans"" and ""Superhero Movie"" all falling flat.Well I've wasted even more of my life typing about this sack of cow dung. So all in all, don't see this movie, unless of course your IQ is below 80.Thanks, R",Disaster Movie,"Este filme eu vi um dia cedo de graÃ§a e ainda sinto que fui enganado. Ã totalmente morte cerebral. Arrotando, chutando a virilha e os peitos por todo o lugar. Coxo. O que hÃ¡ de errado com a sociedade, que filmes como esse sÃ£o feitos? As parÃ³dias eram todas horrendas e pouco engraÃ§adas. O enredo foi sem brilho, na melhor das hipÃ³teses, e a atuaÃ§Ã£o foi superficial, transparente e realmente bastante desnecessÃ¡ria. AlguÃ©m vÃª ""Idiocracia""? Lembra do filme que ganhou todos os prÃªmios da academia no futuro? Bem, este Ã© esse filme. Eu nÃ£o vi um filme de baixa qualidade mais ranÃ§oso. ""Date Movie"" foi bom, The Scary Movies pelo menos teve enredos decentes, mas isso faz ""spoofs"" (se Ã© que posso dizer assim) para este ano 0 para 3, com ""Meet the Spartans"" e ""Filme de super-herÃ³is"" todos caindo. Bem, eu perdi ainda mais da minha vida digitando sobre esse saco de esterco de vaca. EntÃ£o, apesar de tudo, nÃ£o assista a este filme, a menos que o seu QI seja inferior a 80.",saw day early free still feel like got ripped totally brain kicking groin boobs wrong films like even get parodies plot lackluster best acting transparent really quite see remember academy awards well not seen rancid crappy scary movies least decent makes nice call year falling wasted even life typing sack cow not see unless course iq r


## Feature Engineering

In [6]:
#Mapping rating data to Binary label 1 (+ve) if rating >=7 and 0 (-ve) if rating <=4 and 2 (neutral) if rating = 5 or 6
df['Label'] = df['Ratings'].apply(lambda x: '1' if x >= 7 else ('0' if x<=4 else '2'))
#Removing 
df=df[df.Label<'2']
data=df[['Reviews_clean','Label']]
print(data['Label'].value_counts())

Label
0    60000
1    60000
Name: count, dtype: int64


In [7]:
#Importing dependencies for feature engineering 
import sys
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from prettytable import PrettyTable
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Lemmatization

In [8]:
# lemmatization of word 
class LemmaTokenizer(object):
    def __init__(self):
        self.wordnetlemma = WordNetLemmatizer()
    def __call__(self, reviews):
        return [self.wordnetlemma.lemmatize(word) for word in word_tokenize(reviews)]
    
# Create an instance of the LemmaTokenizer
lemmatizer = LemmaTokenizer()

# Example text to be lemmatized
text = "The cats are sitting on the mat, and they are purring."

# Lemmatize the text
lemmatized_words = lemmatizer(text)

print(lemmatized_words)

['The', 'cat', 'are', 'sitting', 'on', 'the', 'mat', ',', 'and', 'they', 'are', 'purring', '.']


## Vectoization with TFIDF Vectorizer with Unigram, Bigram and Trigram

In [9]:
train, test = train_test_split(data, test_size=.3, random_state=42, shuffle=True)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1, 3), min_df=10, max_features=500)

x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature Selection with Chi squared

In [10]:
from sklearn.feature_selection import chi2
import numpy as np

N = 5000
featureselection = PrettyTable(["Unigram", "Bigram","Trigram"])

features_chi2 = chi2(x_train_tfidf, train['Label'])
chi2score = features_chi2[0]

scores = list(zip(tfidfvect.get_feature_names_out(), chi2score))
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
top_feature_names = [ngram for ngram, score in sorted_scores]

unigrams = [x for x in top_feature_names if len(x.split(' ')) == 1]
bigrams = [x for x in top_feature_names if len(x.split(' ')) == 2]
trigrams = [x for x in top_feature_names if len(x.split(' ')) == 3]

print("\t# Unigrams :\n\t. %s" %('\n\t. '.join(unigrams[:N])))
print("\t# Bigrams :\n\t. %s" %('\n\t. '.join(bigrams[:N])))
print("\t# Trigrams :\n\t. %s" %('\n\t. '.join(trigrams[:N])))

	# Unigrams :
	. worst
	. great
	. waste
	. bad
	. loved
	. terrible
	. enjoyed
	. awful
	. poor
	. nothing
	. excellent
	. worse
	. best
	. boring
	. perfect
	. no
	. well
	. love
	. amazing
	. minute
	. definitely
	. money
	. even
	. horrible
	. fun
	. liked
	. wonderful
	. action
	. stupid
	. script
	. enjoy
	. favorite
	. enjoyable
	. cheap
	. highly
	. save
	. attempt
	. performance
	. job
	. beautiful
	. least
	. supposed
	. different
	. family
	. see
	. entertaining
	. also
	. surprised
	. acting
	. true
	. lack
	. recommend
	. role
	. heart
	. complete
	. always
	. hour
	. good
	. strong
	. bunch
	. lot
	. world
	. nice
	. none
	. classic
	. especially
	. ever
	. guess
	. instead
	. anything
	. fan
	. relationship
	. plenty
	. look
	. episode
	. comedy
	. dark
	. try
	. life
	. dialogue
	. humor
	. idea
	. comic
	. bit
	. trying
	. half
	. new
	. keep
	. twist
	. except
	. someone
	. reason
	. u
	. not
	. simple
	. title
	. decent
	. still
	. played
	. else
	. many
	. season
	.

## Model Selection

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

In [12]:
# Dictionary for mapping model and f1 score
model_to_f1_score = {}

## Training and Evaluation of Logistic Regression Model

In [13]:
model_lgr = Pipeline(steps = [("classifier", LogisticRegression())])
model_lgr.fit(x_train_tfidf, y_train)

train_prediction = model_lgr.predict(x_train_tfidf)

print("Precision Score on training dateset for Logistic Regression: %s" % precision_score(y_train, train_prediction, average='micro'))
print("AUC Score on training dateset for Logistic Regression: %s" % roc_auc_score(y_train, model_lgr.predict_proba(x_train_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_train_1 = f1_score(y_train, train_prediction, average="weighted")
print("F1 Score on training dateset for Logistic Regression: %s" % f1_score_train_1)
print("\n")

test_prediction = model_lgr.predict(x_test_tfidf)

print("Precision Score on test for Logistic Regression: %s" % precision_score(y_test, test_prediction, average='micro'))
print("AUC Score on test for Logistic Regression: %s" % roc_auc_score(y_test, model_lgr.predict_proba(x_test_tfidf)[:,1], multi_class='ovo',average='macro'))

f1_score_1 = f1_score(y_test, test_prediction, average="weighted")
print("F1 Score on test dataset for Logistic Regression: %s" % f1_score_1)

model_to_f1_score[model_lgr] = f1_score_1

Precision Score on training dateset for Logistic Regression: 0.8229285714285715
AUC Score on training dateset for Logistic Regression: 0.9041354610928594
F1 Score on training dateset for Logistic Regression: 0.8229197523419759


Precision Score on test for Logistic Regression: 0.8163611111111111
AUC Score on test for Logistic Regression: 0.8988959652367594
F1 Score on test dataset for Logistic Regression: 0.8163628646101859


## Training and Evaluation of Decision Tree Classifier Model

In [14]:
model_dtc = Pipeline(
    steps=[
        #("classifier", DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)),
        ("classifier", DecisionTreeClassifier())
    ]
)

model_dtc.fit(x_train_tfidf, y_train)

train_prediction = model_dtc.predict(x_train_tfidf)

print("Precision Score on training dateset for Decision Tree Classifier: %s" % precision_score(y_train, train_prediction, average='micro'))
print("AUC Score on training dateset for Decision Tree Classifier: %s" % roc_auc_score(y_train, model_dtc.predict_proba(x_train_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_train_2 = f1_score(y_train, train_prediction, average="weighted")
print("F1 Score training dateset for Decision Tree Classifier: %s" % f1_score_train_2)
print("\n")

test_prediction = model_dtc.predict(x_test_tfidf)

print("Precision Score on test for Decision Tree Classifier: %s" % precision_score(y_test, test_prediction, average='micro'))
print("AUC Score on test for Decision Tree Classifier: %s" % roc_auc_score(y_test, model_dtc.predict_proba(x_test_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_2 = f1_score(y_test, test_prediction, average="weighted")
print("F1 Score for Decision Tree Classifier: %s" % f1_score_2)

model_to_f1_score[model_dtc] = f1_score_2

Precision Score on training dateset for Decision Tree Classifier: 0.9994642857142857
AUC Score on training dateset for Decision Tree Classifier: 0.9999991357584328
F1 Score training dateset for Decision Tree Classifier: 0.9994642865827811


Precision Score on test for Decision Tree Classifier: 0.68475
AUC Score on test for Decision Tree Classifier: 0.6850061146806661
F1 Score for Decision Tree Classifier: 0.6847560227545438


## Decision Tree Classifier with max depth 11 to fix overfit

In [15]:
model_dtc2 = Pipeline(
    steps = [
        ("classifier", DecisionTreeClassifier( criterion='gini', max_depth=11, min_samples_split=2, min_samples_leaf=1)),
    ]
)

model_dtc2.fit(x_train_tfidf, y_train)

train_prediction = model_dtc2.predict(x_train_tfidf)

print("Precision Score on training dateset for Decision Tree Classifier: %s" % precision_score(y_train, train_prediction, average='micro'))
print("AUC Score on training dateset for Decision Tree Classifier: %s" % roc_auc_score(y_train, model_dtc2.predict_proba(x_train_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_train_3 = f1_score(y_train, train_prediction, average="weighted")
print("F1 Score training dateset for Decision Tree Classifier: %s" % f1_score_train_3)
print("\n")

test_prediction = model_dtc2.predict(x_test_tfidf)

print("Precision Score on test for Decision Tree Classifier: %s" % precision_score(y_test, test_prediction, average='micro'))
print("AUC Score on test for Decision Tree Classifier: %s" % roc_auc_score(y_test, model_dtc2.predict_proba(x_test_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_3 = f1_score(y_test, test_prediction, average="weighted")
print("F1 Score for Decision Tree Classifier: %s" % f1_score_3)

model_to_f1_score[model_dtc2] = f1_score_3

Precision Score on training dateset for Decision Tree Classifier: 0.7196309523809524
AUC Score on training dateset for Decision Tree Classifier: 0.8067984314284834
F1 Score training dateset for Decision Tree Classifier: 0.7148304951556239


Precision Score on test for Decision Tree Classifier: 0.6889722222222222
AUC Score on test for Decision Tree Classifier: 0.7611843173411232
F1 Score for Decision Tree Classifier: 0.6837141805715267


## Training and Evaluation of Random Forest Classifier Model

In [16]:
model_rfc = Pipeline(
    steps=[
        #("classifier", RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=15, min_samples_split=3, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None))
        ("classifier", RandomForestClassifier())
    ]
)

model_rfc.fit(x_train_tfidf, y_train)

train_prediction = model_rfc.predict(x_train_tfidf)

print("Precision Score on training dateset for Random Forest Classifier: %s" % precision_score(y_train, train_prediction, average='micro'))
print("AUC Score on training dateset for Random Forest Classifier: %s" % roc_auc_score(y_train, model_rfc.predict_proba(x_train_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_train_4 = f1_score(y_train, train_prediction, average="weighted")
print("F1 Score training dateset for Random Forest Classifier: %s" % f1_score_train_4)
print("\n")

test_prediction = model_rfc.predict(x_test_tfidf)

print("Precision Score on test for Random Forest Classifier: %s" % precision_score(y_test, model_rfc.predict(x_test_tfidf), average='micro'))
print("AUC Score on test for Random Forest Classifier: %s" % roc_auc_score(y_test, model_rfc.predict_proba(x_test_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_4 = f1_score(y_test,model_rfc.predict(x_test_tfidf), average="weighted")
print("F1 Score for Random Forest Classifier: %s" % f1_score_4)

model_to_f1_score[model_rfc] = f1_score_4

Precision Score on training dateset for Random Forest Classifier: 0.9994642857142857
AUC Score on training dateset for Random Forest Classifier: 0.9999766155903397
F1 Score training dateset for Random Forest Classifier: 0.99946428639449


Precision Score on test for Random Forest Classifier: 0.7876388888888889
AUC Score on test for Random Forest Classifier: 0.8717113528105923
F1 Score for Random Forest Classifier: 0.7876409166457224


## Training and Evaluation of Ada Boost (Adaptive Boost) Classifier Model

In [17]:
model_abc = Pipeline(
    steps = [
        ("classifier", AdaBoostClassifier(estimator = DecisionTreeClassifier(max_depth = 4),
        n_estimators = 100,
        learning_rate = .8)),
    ]
)

model_abc.fit(x_train_tfidf, y_train)

train_prediction = model_abc.predict(x_train_tfidf)

print("Precision Score on training dateset for Ada Boost Classifier: %s" % precision_score(y_train, train_prediction, average='micro'))
print("AUC Score on training dateset for Ada Boost Classifier: %s" % roc_auc_score(y_train, model_abc.predict_proba(x_train_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_train_5 = f1_score(y_train,train_prediction, average="weighted")
print("F1 Score training dateset for Ada Boost Classifier: %s" % f1_score_train_5)
print("\n")

test_prediction = model_abc.predict(x_test_tfidf)

print("Precision Score on test for Ada Boost Classifier: %s" % precision_score(y_test, test_prediction, average='micro'))
print("AUC Score on test for Ada Boost Classifier: %s" % roc_auc_score(y_test, model_abc.predict_proba(x_test_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_5 = f1_score(y_test, test_prediction, average="weighted")
print("F1 Score for Random Forest Classifier: %s" % f1_score_5)

model_to_f1_score[model_abc] = f1_score_5



Precision Score on training dateset for Ada Boost Classifier: 0.8490357142857143
AUC Score on training dateset for Ada Boost Classifier: 0.9337366712918284
F1 Score training dateset for Ada Boost Classifier: 0.8490315556668288


Precision Score on test for Ada Boost Classifier: 0.79175
AUC Score on test for Ada Boost Classifier: 0.8709099546136978
F1 Score for Random Forest Classifier: 0.7917540672059571


## Hyperparameter Tunning with Grid Search

In [18]:
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection

def hyperparamtune(classifier, param_grid, metric, verbose_value, cv):
    model = model_selection.GridSearchCV(
            estimator = classifier,
            param_grid = param_grid,
            scoring = metric,
            verbose = verbose_value,            
            cv = cv)

    model.fit(x_train_tfidf, y_train)
    print("Best Score %s" % {model.best_score_})

    print("Best hyperparameter set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print(f"\t{param_name}: {best_parameters[param_name]}")

    return model, best_parameters

## Hyperparameter tunning of Logistic Regression

In [19]:
param_gd = {"penalty" : ["l2", "l1"],
         "C" : [0.01, 0.1, 1.0, 10],
         "tol" : [0.0001, 0.001, 0.01],
         "max_iter" : [100, 200]}

model_lgr_optimized, best_param = hyperparamtune(LogisticRegression(), param_gd, "accuracy", 10, 5)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 1/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.805 total time=   0.6s
[CV 2/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 2/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.807 total time=   0.7s
[CV 3/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 3/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.811 total time=   0.7s
[CV 4/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 4/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.807 total time=   0.6s
[CV 5/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 5/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.805 total time=   0.6s
[CV 1/5; 2/48] START C=0.01, max_iter=100, penal

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "g:\ML projects\sentiment_analysis\env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "g:\ML projects\sentiment_analysis\env\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "g:\ML projects\sentiment_analysis\env\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             

Best Score {np.float64(0.8193095238095237)}
Best hyperparameter set:
	C: 1.0
	max_iter: 100
	penalty: l2
	tol: 0.001


## Evaluation of FineTuned Logsitic Regression Classifier

In [20]:
train_prediction = model_lgr_optimized.predict(x_train_tfidf)

print("Precision Score on training dateset for Finetuned Logsitic Regression Classifier: %s" % precision_score(y_train, train_prediction, average='micro'))
print("AUC Score on training dateset for Finetuned Logsitic Regression Classifier: %s" % roc_auc_score(y_train, model_lgr_optimized.predict_proba(x_train_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_train_6 = f1_score(y_train, train_prediction, average="weighted")
print("F1 Score training dateset for Finetuned Logsitic Regression Classifier: %s" % f1_score_train_6)
print("\n")

test_prediction = model_lgr_optimized.predict(x_test_tfidf)

print("Precision Score on test for Finetuned Logsitic Regression Classifier: %s" % precision_score(y_test, test_prediction, average='micro'))
print("AUC Score on test for Finetuned Logsitic Regression Classifier: %s" % roc_auc_score(y_test, model_lgr_optimized.predict_proba(x_test_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_6 = f1_score(y_test, test_prediction, average="weighted")
print("F1 Score for Finetuned Logsitic Regression Classifier: %s" % f1_score_6)

model_to_f1_score[model_lgr_optimized] = f1_score_6

Precision Score on training dateset for Finetuned Logsitic Regression Classifier: 0.8225595238095238
AUC Score on training dateset for Finetuned Logsitic Regression Classifier: 0.9034188696275864
F1 Score training dateset for Finetuned Logsitic Regression Classifier: 0.8225495091323806


Precision Score on test for Finetuned Logsitic Regression Classifier: 0.8158055555555556
AUC Score on test for Finetuned Logsitic Regression Classifier: 0.8984561249283894
F1 Score for Finetuned Logsitic Regression Classifier: 0.8158079112907944


## Selecting The Best Model

In [21]:
best_model = max(model_to_f1_score, key=model_to_f1_score.get)
best_f1_score = model_to_f1_score[best_model]

print(f"The best model is {best_model} with an F1 score of {best_f1_score}.")

The best model is Pipeline(steps=[('classifier', LogisticRegression())]) with an F1 score of 0.8163628646101859.


## Hyperparameter Tunning for Random Forest Classifier

In [None]:
param_gd = {"n_estimators" : [100, 200, 300],
         "max_depth" : [11, 13, 17, 19, 23],
         "criterion" : ["gini", "entropy"],
         "min_samples_split" : [3, 7, 11],
         "min_samples_leaf" : [3, 5],
         "max_features" : ["sqrt", "log2"]}

model_rfc_optimized, best_param = hyperparamtune(RandomForestClassifier(), param_gd, "accuracy", 10, 5)

## Taking too much time

## Evaluation of Finetuned Random Forest Classifier

In [None]:
train_prediction = model_rfc_optimized.predict(x_train_tfidf)

print("Precision Score on training dateset for Finetuned Random Forest Classifier: %s" % precision_score(y_train, train_prediction, average='micro'))
print("AUC Score on training dateset for Finetuned Random Forest Classifier: %s" % roc_auc_score(y_train, model_rfc_optimized.predict_proba(x_train_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_train_8 = f1_score(y_train, train_prediction, average="weighted")
print("F1 Score training dateset for Finetuned Random Forest Classifier: %s" % f1_score_train_8)
print("\n")

test_prediction = model_rfc_optimized.predict(x_test_tfidf)

print("Precision Score on test for Finetuned Random Forest Classifier: %s" % precision_score(y_test, test_prediction, average='micro'))
print("AUC Score on test for Finetuned Random Forest Classifier: %s" % roc_auc_score(y_test, model_rfc_optimized.predict_proba(x_test_tfidf)[:,1], multi_class='ovo', average='macro'))

f1_score_8 = f1_score(y_test, test_prediction, average="weighted")
print("F1 Score for Finetuned Random Forest Classifier: %s" % f1_score_8)

## Hyperparameter Tunning for Ada Boost Classifier

In [None]:
param_gd = {"base_estimator" : [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=4), DecisionTreeClassifier(max_depth=8), DecisionTreeClassifier(max_depth=10)],
          "learning_rate" : [0.001, 0.01, 0.1, 0.5, 0.8, 1, 2],
          "n_estimators":[50, 100, 200, 300, 500, 800]}

model_10, best_param_10 = hyperparamtune(AdaBoostClassifier(), param_gd, "accuracy", 10, 5)