In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from wordcloud import WordCloud, STOPWORDS
import re, string, unicodedata
from string import punctuation

# import warnings
# warnings.filterwarnings('ignore')

## Data Loading

In [2]:
df = pd.read_csv(r'G:\ML projects\IMDB-Dataset.csv', encoding = 'latin-1')

## Data Cleaning And Preprocessing

In [3]:
stop_words = stopwords.words('english')
new_stopwords = ["would", "shall", "could", "might", "film", "movie", "director", "scene", "character", "actor", "actress"]
stop_words.extend(new_stopwords)

negations_and_sentiment_words = ["not", "no", "never", "n't", "none", "good", "bad", "love", "hate"]

for word in negations_and_sentiment_words:
    if word in stop_words:
        stop_words.remove(word)

stop_words = set(stop_words)
print(stop_words)

{'own', 'themselves', 'by', 'actress', 'what', 'to', 'hadn', 'aren', 'hasn', "doesn't", 'off', 'with', 'o', 'would', 'was', 'didn', 'who', 'is', 'other', 'before', 'shan', 'wouldn', 'haven', 'yourselves', 'at', 'be', "you've", 'our', 'he', "haven't", 'until', 'such', "you'll", 'has', 'so', 'isn', 'having', 'couldn', 'for', 'as', 'between', 'needn', 'through', 'had', 'over', 'if', "wouldn't", 'each', 'could', 'because', 'below', 'their', 'this', 'being', 'again', 'most', 'wasn', 'character', 'above', 'her', 's', 'all', "mightn't", 'herself', 'further', "don't", 'those', 'during', 'should', "hasn't", 'mustn', 'himself', 're', 'theirs', 'then', 'just', 'too', 'whom', 'your', "it's", 'into', 'y', 'doing', 'might', 'its', "she's", 'that', 'and', 'yourself', 'itself', 'some', 'doesn', 'film', 'few', 'both', "aren't", 'do', 'did', 'more', 'his', "you're", 'when', 'shouldn', "wasn't", 'll', "needn't", 'which', 've', 'against', 'a', 'about', 'have', 'while', 'once', 'd', "you'd", 'weren', 'they

In [4]:
'''-----------------------------Data Cleaning and Preprocessing pipeline----------------------------------'''

#Removing special character
def remove_special_character(content):
    # return re.sub(r'\W+',' ', content )
    return re.sub(r'\[[^&@#!]]*\]', '', content)

# Removing URL's
def remove_url(content):
    return re.sub(r'http\S+', '', content)

#Removing the stopwords from text
def remove_stopwords(content):
    clean_data = []
    for i in content.split():
        if i.strip().lower() not in stop_words and i.strip().lower().isalpha():
            clean_data.append(i.strip().lower())
    return " ".join(clean_data)

# Expansion of english contractions
def contraction_expansion(content):
    content = re.sub(r"won\'t", "would not", content)
    content = re.sub(r"can\'t", "can not", content)
    content = re.sub(r"don\'t", "do not", content)
    content = re.sub(r"shouldn\'t", "should not", content)
    content = re.sub(r"needn\'t", "need not", content)
    content = re.sub(r"hasn\'t", "has not", content)
    content = re.sub(r"haven\'t", "have not", content)
    content = re.sub(r"weren\'t", "were not", content)
    content = re.sub(r"mightn\'t", "might not", content)
    content = re.sub(r"didn\'t", "did not", content)
    content = re.sub(r"n\'t", " not", content)
    '''content = re.sub(r"\'re", " are", content)
    content = re.sub(r"\'s", " is", content)
    content = re.sub(r"\'d", " would", content)
    content = re.sub(r"\'ll", " will", content)
    content = re.sub(r"\'t", " not", content)
    content = re.sub(r"\'ve", " have", content)
    content = re.sub(r"\'m", " am", content)'''
    return content

#Data preprocessing
def data_cleaning(content):
    content = contraction_expansion(content)
    content = remove_special_character(content)
    content = remove_url(content)
    
    content = remove_stopwords(content)    
    return content

In [5]:
pd.options.display.max_colwidth = 1000

#Data cleaning
df['Reviews_clean']=df['Reviews'].apply(data_cleaning)
df.head(5)

Unnamed: 0,Ratings,Reviews,Movies,Resenhas,Reviews_clean
0,1.0,"*Disclaimer: I only watched this movie as a conditional agreement. And I see films for free. I wouldn't be caught dead giving my hard earned money to these idiots.Well, to explain the depth of this 'film', I could write my shortest review, ever. Don't see this movie. It is by far the stupidest, lamest, most lazy, and unbelievably UNFUNNY movie I have ever seen. It is a total disaster. But since my hatred for this movie, and the others like it, extends far beyond one viewing, I think I'll go on for a bit.I don't know any of the people in the movie besides Carmen Electra, Vanessa Minnillo, and Kim Kardashian, but it doesn't matter. They're all horrible, though I think that was the point. The editing is flat out horrible, and possibly blatant continuity errors make this crapfast even crappier than I thought it would be. Now I know that these films are not supposed to be serious at all, but come on, it's film-making 101 that if someone gets a minor facial cut, it should be there in the...",Disaster Movie,"* IsenÃ§Ã£o de responsabilidade: eu sÃ³ assisti esse filme como um acordo condicional. E eu vejo filmes de graÃ§a. Eu nÃ£o seria pego morto dando meu dinheiro suado a esses idiotas. Bem, para explicar a profundidade desse 'filme', eu poderia escrever minha crÃ­tica mais curta de todos os tempos. NÃ£o vÃª este filme. Ã de longe o filme mais estÃºpido, lamenta, preguiÃ§oso e inacreditavelmente UNFUNNY que eu jÃ¡ vi. Ã um desastre total. Mas como o meu Ã³dio por este filme e por outros, se estende muito alÃ©m de uma exibiÃ§Ã£o, acho que vou continuar um pouco. NÃ£o conheÃ§o nenhuma das pessoas do filme alÃ©m de Carmen Electra, Vanessa Minnillo, e Kim Kardashian, mas isso nÃ£o importa. Eles sÃ£o todos horrÃ­veis, embora eu ache que esse seja o ponto. A ediÃ§Ã£o Ã© horrÃ­vel e, possivelmente, erros de continuidade flagrantes tornam essa porcaria ainda mais horrÃ­vel do que eu pensava. Agora eu sei que esses filmes nÃ£o devem ser sÃ©rios, mas vamos lÃ¡, Ã© o cinema 101 que se alguÃ©m f...",watched conditional see films not caught dead giving hard earned money explain depth write shortest not see far unbelievably unfunny ever total since hatred others like extends far beyond one think go not know people besides carmen vanessa kim not though think editing flat possibly blatant continuity errors make crapfast even crappier thought know films not supposed serious come someone gets minor facial next someone gets cut blood least cut since narnia films away give disaster pass thoughtless mindless physical gags obviously take popular movies last year late including best picture know saddest thing stupid movies not care much money many cameos sorry ass excuses films taking away jobs directors truly deserve thought better taste ashamed making kind jason friedberg aaron burn guys contributing decline western cause downfall western
1,1.0,"I am writing this in hopes that this gets put over the previous review of this ""film"". How anyone can find this slop entertaining is completely beyond me. First of all a spoof film entitled ""Disaster Movie"", should indeed be a spoof on disaster films. Now I have seen 1 (yes count them, 1) disaster film being spoofed, that being ""Twister"". How does Juno, Iron Man, Batman, The Hulk, Alvin and the Chipmunks, Amy Winehouse, or Hancock register as Disaster films? Selzterwater and Failburg once again have shown that they lack any sort of writing skill and humor. Having unfortunately been tortured with Date Movie and Epic Movie I know exactly what to expect from these two...no plot, no jokes just bad references and cheaply remade scenes from other films. Someone should have informed them that satire is more than just copy and paste from one film to another, though I shouldn't say that because some of these actually just seem to be taken from trailers.There is nothing clever or witty or re...",Disaster Movie,"Estou escrevendo isso na esperanÃ§a de que isso seja colocado sobre a revisÃ£o anterior deste ""filme"". Como alguÃ©m pode achar divertido esse desleixo estÃ¡ completamente alÃ©m de mim. Antes de mais nada, um filme de parÃ³dia intitulado ""Filme de desastre"" deveria ser, de fato, uma parÃ³dia de filmes de desastre. Agora eu jÃ¡ vi 1 (sim, conte-os, 1) filme de desastre sendo falsificado, sendo ""Twister"". Como Juno, Homem de Ferro, Batman, O Hulk, Alvin e os Esquilos, Amy Winehouse ou Hancock se registram como filmes de Desastre? Selzterwater e Failburg mostraram mais uma vez que nÃ£o possuem nenhum tipo de habilidade e humor de escrita. Infelizmente, tendo sido torturado com Date Movie e Epic Movie, sei exatamente o que esperar desses dois ... nenhum enredo, nenhuma piada, apenas mÃ¡s referÃªncias e cenas refeitas de outros filmes. AlguÃ©m deveria ter informado a eles que a sÃ¡tira Ã© mais do que apenas copiar e colar de um filme para outro, embora eu nÃ£o deva dizer isso porque algu...",writing hopes gets put previous review anyone find slop entertaining completely beyond first spoof entitled indeed spoof disaster seen count disaster iron alvin amy hancock register disaster selzterwater failburg shown lack sort writing skill unfortunately tortured date epic know exactly expect no jokes bad references cheaply remade scenes someone informed satire copy paste one though not say actually seem taken nothing clever witty remotely smart way two not believe people still pay see insult though enjoy films doubt smart enough realize unfortunately not number low enough includes rate deserves top worst films right date epic mean meet rather forced hour hands marathon watch
2,1.0,"Really, I could write a scathing review of this turd sandwich, but instead, I'm just going to be making a few observations and points I've deduced.There's just no point in watching these movies anymore. Does any reader out there remember Scary Movie? Remember how it was original with a few comedic elements to it? There was slapstick, some funny lines, it was a pretty forgettable comedy, but it was worth the price of admission. Well, That was the last time this premise was funny. STOP MAKING THESE MOVIES. PLEASE.I could call for a boycott of these pieces of monkey sh*t, but we all know there's going to be a line up of pre pubescent annoying little buggers, spouting crappy one liners like, ""THIS IS SPARTA!"" and, ""IM RICK JAMES BITCH"" so these movies will continue to make some form of monetary gain, considering the production value of this movie looks like it cost about 10 cents to make.Don't see this movie. Don't spend any money on it. Go home, rent Airplane, laugh your ass off, and ...",Disaster Movie,"Realmente, eu poderia escrever uma crÃ­tica contundente sobre esse sanduÃ­che de cocÃ´, mas, em vez disso, vou fazer algumas observaÃ§Ãµes e pontos que deduzi. NÃ£o hÃ¡ mais sentido assistir a esses filmes. Algum leitor por aÃ­ se lembra do filme de terror? Lembra como era original, com alguns elementos cÃ´micos? Havia palhaÃ§ada, algumas frases engraÃ§adas, era uma comÃ©dia bastante esquecÃ­vel, mas valia o preÃ§o da entrada. Bem, essa foi a Ãºltima vez que essa premissa foi engraÃ§ada. PARE DE FAZER ESTES FILMES. POR FAVOR, eu poderia pedir um boicote a esses pedaÃ§os de macaco, mas todos sabemos que haverÃ¡ uma fila de buggers irritantes e prÃ©-pubescentes, jorrando uns forros ruins como: ""ISTO Ã SPARTA!"" e ""IM RICK JAMES BITCH"", para que esses filmes continuem gerando algum ganho monetÃ¡rio, considerando que o valor de produÃ§Ã£o deste filme parece custar cerca de 10 centavos de dÃ³lar. NÃ£o gaste dinheiro com isso. VÃ¡ para casa, alugue a Airplane, ria e julgue silenciosament...",write scathing review turd going making observations points no point watching movies reader remember scary remember original comedic elements funny pretty forgettable worth price last time premise stop making call boycott pieces monkey know going line pre pubescent annoying little spouting crappy one liners rick james movies continue make form monetary considering production value looks like cost cents not see not spend money go rent laugh ass silently judge people talking
3,1.0,"If you saw the other previous spoof movies by these two horrible gentlemen, then you should know that this already will be bad. I'll tell you the truth, if you want to watch it as a brainless person (ironically meant for the stereotypical teenagers, which I am not) then you will laugh at it a bit. But if you judge it, even a little, the movie automatically fails. Why? Never ask that when it comes to these two men.Remember the good old Hollywood days whenever making a movie was about showing people a type of art, and also a story that kept you on the edge of your seat? Well whenever word hit that making films earned you loads of cash, then all these greedy people came in the picture and its quite pathetic. These two are no exception. We still have movie artists (most notably the genius that is Christopher Nolan). But these two guys just...well I've been writing so big words, let me put it in simple terms for these guys...These guys suck, they are not artists, but instead money cravi...",Disaster Movie,"Se vocÃª viu os outros filmes falsificados anteriores por esses dois senhores horrÃ­veis, deve saber que isso jÃ¡ serÃ¡ ruim. Vou lhe dizer a verdade, se vocÃª quiser vÃª-lo como uma pessoa sem cÃ©rebro (ironicamente para os adolescentes estereotipados, o que eu nÃ£o sou), entÃ£o vocÃª rirÃ¡ um pouco. Mas se vocÃª julgar, mesmo que um pouco, o filme falha automaticamente. Por quÃª? Nunca pergunte isso quando se trata desses dois homens. Lembre-se dos bons e velhos tempos de Hollywood, sempre que fazer um filme era mostrar Ã s pessoas um tipo de arte e tambÃ©m uma histÃ³ria que o mantinha na ponta do seu assento? Bem, sempre que a notÃ­cia de que fazer filmes ganhava muito dinheiro, entÃ£o todas essas pessoas gananciosas apareciam na imagem e Ã© bastante patÃ©tico. Esses dois nÃ£o sÃ£o exceÃ§Ã£o. Ainda temos artistas de filmes (principalmente o gÃªnio Christopher Nolan). Mas esses dois caras simplesmente ... bem, eu tenho escrito palavras tÃ£o grandes, deixe-me colocar em termos sim...",saw previous spoof movies two horrible know already tell want watch brainless person meant stereotypical laugh judge even automatically never ask comes two good old hollywood days whenever making showing people type also story kept edge well whenever word hit making films earned loads greedy people came picture quite two no still artists notably genius christopher two guys writing big let put simple terms guys not instead money craving latest proves even fails easily mind mean nothing funny people usually put best stuff idiots knew going made bet not good idea write reviews paper tell everyone whats good whats flipped review well warning not even called nothing artistic references made throughout pretty much like hannah montana juno gig actually close spoofing failed referencing instead joking twisting random wrestling not know high respect high respect know not something not add story nudity not really needed closest still gotten idea saw bare hate girl says guys perverts brainles...
4,1.0,"This movie I saw a day early for free and I still feel like I got ripped off. It is totally brain dead. Burping, kicking in the groin and boobs all over the place. Lame. What is wrong with society, that films like this even get made? The parodies were all horrendous, and un-funny. The plot was lackluster at best and the acting was shallow, transparent and really quite unnecessary.Anyone see ""Idiocracy""? Remember the movie that won all the academy awards in the future? Well this is that movie. I have not seen a more rancid crappy film. ""Date Movie"" was okay, The Scary movies at least had decent plots, but this, this makes ""spoofs"" (if I can be so nice to call it that) for this year 0 for 3, with ""Meet the Spartans"" and ""Superhero Movie"" all falling flat.Well I've wasted even more of my life typing about this sack of cow dung. So all in all, don't see this movie, unless of course your IQ is below 80.Thanks, R",Disaster Movie,"Este filme eu vi um dia cedo de graÃ§a e ainda sinto que fui enganado. Ã totalmente morte cerebral. Arrotando, chutando a virilha e os peitos por todo o lugar. Coxo. O que hÃ¡ de errado com a sociedade, que filmes como esse sÃ£o feitos? As parÃ³dias eram todas horrendas e pouco engraÃ§adas. O enredo foi sem brilho, na melhor das hipÃ³teses, e a atuaÃ§Ã£o foi superficial, transparente e realmente bastante desnecessÃ¡ria. AlguÃ©m vÃª ""Idiocracia""? Lembra do filme que ganhou todos os prÃªmios da academia no futuro? Bem, este Ã© esse filme. Eu nÃ£o vi um filme de baixa qualidade mais ranÃ§oso. ""Date Movie"" foi bom, The Scary Movies pelo menos teve enredos decentes, mas isso faz ""spoofs"" (se Ã© que posso dizer assim) para este ano 0 para 3, com ""Meet the Spartans"" e ""Filme de super-herÃ³is"" todos caindo. Bem, eu perdi ainda mais da minha vida digitando sobre esse saco de esterco de vaca. EntÃ£o, apesar de tudo, nÃ£o assista a este filme, a menos que o seu QI seja inferior a 80.",saw day early free still feel like got ripped totally brain kicking groin boobs wrong films like even get parodies plot lackluster best acting transparent really quite see remember academy awards well not seen rancid crappy scary movies least decent makes nice call year falling wasted even life typing sack cow not see unless course iq r


## Feature Engineering

In [6]:
#Mapping rating data to Binary label 1 (+ve) if rating >=7 and 0 (-ve) if rating <=4 and 2 (neutral) if rating = 5 or 6
df['Label'] = df['Ratings'].apply(lambda x: '1' if x >= 7 else ('0' if x<=4 else '2'))
#Removing 
df=df[df.Label<'2']
data=df[['Reviews_clean','Label']]
print(data['Label'].value_counts())

Label
0    60000
1    60000
Name: count, dtype: int64


In [7]:
#Importing dependencies for feature engineering 
import sys
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from prettytable import PrettyTable
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Lemmatization

In [8]:
# lemmatization of word 
class LemmaTokenizer(object):
    def __init__(self):
        self.wordnetlemma = WordNetLemmatizer()
    def __call__(self, reviews):
        return [self.wordnetlemma.lemmatize(word) for word in word_tokenize(reviews)]
    
# Create an instance of the LemmaTokenizer
lemmatizer = LemmaTokenizer()

# Example text to be lemmatized
text = "The cats are sitting on the mat, and they are purring."

# Lemmatize the text
lemmatized_words = lemmatizer(text)

print(lemmatized_words)

['The', 'cat', 'are', 'sitting', 'on', 'the', 'mat', ',', 'and', 'they', 'are', 'purring', '.']


## Vectoization with Count Vectorizer and TFIDF Vectorizer with Unigram

In [9]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1,1), min_df=10,max_features=500)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1,1),min_df=10,max_features=500)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature Importance with Logistic Regression and Count Vectorizer with unigram

In [10]:
lgr = LogisticRegression()
lgr.fit(x_train_count,y_train)
lgr.score(x_test_count,y_test)
# lgr.coef_[0]

i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i=i+1
print(importantfeature)

+----------------+-------------------------+
|    Feature     |          Score          |
+----------------+-------------------------+
|      able      |   0.07709023350570186   |
|   absolutely   |   0.06678505080003146   |
|     across     |   -0.04506440306190827  |
|      act       |   -0.1482229481310405   |
|     acting     |   -0.1887392758100005   |
|     action     |    0.2768725718084724   |
|     actor      |   -0.11808250774131035  |
|     actual     |   -0.15493812524192066  |
|    actually    |   -0.03730191708972426  |
|      add       |   0.023182587995966515  |
|      age       |   0.028687312542960343  |
|     alien      |   -0.0806332293964949   |
|     almost     |  -0.003008910769537514  |
|     along      |   0.14057569793791497   |
|    already     |   -0.1892109608736185   |
|      also      |   0.09860928785830886   |
|    although    |    0.2156415771995825   |
|     always     |    0.2008322129077429   |
|    amazing     |    0.894058201097291    |
|    ameri

## Feature Importance with TFIDF vectorizer and Logistic Regression with Unigram 

In [11]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf,y_train)
lgr.score(x_test_tfidf,y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(tfidfvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=100:
        importantfeature.add_row([feature, importance])
        i=i+1
print(importantfeature)

+----------------+----------------------+
|    Feature     |        Score         |
+----------------+----------------------+
|      able      | 0.35842063620631287  |
|   absolutely   |  0.5087225899253346  |
|     across     | -0.1631248955545532  |
|      act       | -0.8925299331037131  |
|     acting     | -1.4311756298938583  |
|     action     |  2.319074213117472   |
|     actor      | -0.9195333017801625  |
|     actual     | -0.8363242126098487  |
|    actually    | -0.2583581910095917  |
|      add       | 0.21574622175361768  |
|      age       |  0.5153412953996935  |
|     alien      | -0.3244354193001671  |
|     almost     | -0.14087562753122349 |
|     along      |  0.8375102872868834  |
|    already     |  -1.032708658415962  |
|      also      |  1.183776073315821   |
|    although    |  1.2547298334042796  |
|     always     |  1.5446330697936599  |
|    amazing     |  4.261445320469917   |
|    american    | 0.44700678933227855  |
|     amount     | 0.3932323986161

## Vectorization with Count Vectorizer and TDIDF Vectorizer with Bigram

In [12]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(2,2), min_df=10,max_features=500)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(2,2),min_df=10,max_features=500)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature Importance with Logistic Regression and Count Vectorizer with Bigram

In [13]:
lgr = LogisticRegression()
lgr.fit(x_train_count,y_train)
lgr.score(x_test_count,y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i=i+1
print(importantfeature)

+--------------------+-----------------------+
|      Feature       |         Score         |
+--------------------+-----------------------+
|   absolutely no    |  -1.1979834419894124  |
| absolutely nothing |  -1.4167266989551655  |
|   academy award    |   0.6762657014631528  |
|      act like      |  -0.5282657457961595  |
|    acting good     |   0.6829103622219064  |
|     acting not     |  -0.4185330009790828  |
|    action movie    |   0.6067030460399282  |
|     action not     |  0.38869316652441727  |
|    action scene    |  0.30352022624468455  |
|  action sequence   |  0.25161120502280004  |
|     actor not      |  -0.5392430444828252  |
|    actually not    |  -0.08186672632742602 |
|  actually pretty   |  0.14213088006502644  |
|   actually quite   |  0.10610326089306549  |
|       ai not       |   0.1001190011252298  |
|    almost every    |  -0.06793994770085895 |
|     also good      |   1.0791114423896886  |
|     also great     |   1.5451479863641149  |
|      also n

## Feature Importance with Logistic Regression and TFIDF Vectorizer with Bigram

In [14]:
lgr.fit(x_train_tfidf,y_train)
lgr.score(x_test_tfidf,y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(tfidfvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=50:
        importantfeature.add_row([feature, importance])
        i=i+1
print(importantfeature)

+--------------------+----------------------+
|      Feature       |        Score         |
+--------------------+----------------------+
|   absolutely no    | -2.6025645912592736  |
| absolutely nothing |  -2.752526581671555  |
|   academy award    |  1.1048564051662757  |
|      act like      |  -1.256027792993397  |
|    acting good     |  1.2871045423043956  |
|     acting not     | -1.0142661778025868  |
|    action movie    |  1.2458303915628486  |
|     action not     |  0.6623296515287169  |
|    action scene    |  0.6078315552886908  |
|  action sequence   | 0.49558845837502274  |
|     actor not      |  -1.087730938157034  |
|    actually not    | -0.38129023496828995 |
|  actually pretty   |  0.2661441609416421  |
|   actually quite   |  0.2542357667065672  |
|       ai not       |  0.1452214517844817  |
|    almost every    | -0.28125378696653874 |
|     also good      |  2.041079305092779   |
|     also great     |  2.6300832283638456  |
|      also not      | -0.26090902

In [15]:
pd.options.display.max_colwidth = 2000
df[["Reviews","Ratings","Movies"]][(df['Ratings']>=9)&(df['Reviews_clean'].str.contains("bad review | bad thing"))].head(50)

Unnamed: 0,Reviews,Ratings,Movies
120150,"How could it be the comic when much of Steel's early comic life was tied to Superman. Steel was one of the four possible Supermen who appeared right after Superman was killed by Doomsday in the comic book. There is no Superman, no Doomsday, almost nothing from the DC universe in the film, except the character itself.This is not a bad thing especially since the Steel comic could have been created with out having come from Superman. The plot line of where and how Steel comes to be is a close approximation of what a Superman-less origin might have been.I don't think its a movie that wants to be any more than it is which is a fun little romp in a super hero sort of way. Shaq is fine as Steel which he plays to the best of his ability. Allowing that the character never had much depth the central performance is everything it should be. The rest of the cast is a great deal of fun as they chew scenery and try not to take anything too seriously.So sue me I like this movie. Its a fun rainy day popcorn movie of the highest order. It had no calories and nothing in it will stick anywhere in your head.Try it. Its mindless fun.",9.0,Steel
120298,"Be warned, the start of this film is good and thats a sign bad things to come. The acting is sooooooooooooooooo terrible, the direction rubbish, the story just mental and the overall product so bad, its great. Its one of those films you watch while shaking your head saying "" did they just say that, do that ""? The Cloth is the type of film you put on when home with drunken mates and sit and laugh at, from start to finish.Its clear that whole scenes have gone missing by the editing, which is terrible. The film was partly funded by a born again Christian church ( check the credits) and it's clearly an exercise in trying to brainwash the dumb public into becoming Christians. Which it fails at on every level.The young lead I think may be a plank, thats been dressed in clothing and the only openly evil character is a young black male with lesbian friends who all do drugs....we getting the feel yet ?The best line in the film, which for me sums it up, is spoken by what i can only explain as the ""Q""of the exorcism world. After deciding he wants nothing to do with ""The Cloth"", the young lead storms out, throwing his outfit and cross onto the table. ""Q"" says shocked, ""Who throws Jesus onto a table like that "" ???Classic Turkey.",9.0,The Cloth
120485,"I really like this movie. I see some stupid people give bad review for this movie.\nI am sure they don't know bengali language. \nmy question for them, if you cannot realize whole matter then why you need to talk about it???\nIf they can understand movie language then they don't put this bad ratting",9.0,Ludo
120562,"I gotta admit I was as annoyed with the ending as most of the low star reviewers at first. Then I slept on it and woke up still thinking about this movie. I realized that the ending just fit the overall atmosphere of the story. There is SO much that is left unsaid and it dawned on me that it is by design. That was the whole point. We simply don't know for sure and probably never will, unless there is a sequel. I've decided I'm okay with this and it's made me apreciate the movie in a whole new way.I have my own theory as to what happened. If you pay attention, there are quite a few clues throughout the movie that point to a very likely conclusion. I hate that I will never be certain, but there are a million other movies I can watch if I crave that kind of closure. This is clearly not one of those movies. It made me think outside the cookie cutter thriller/horror box. It made me question my expectations. It made me realize that the reason we get upset by these frustrating, anti-climatic endings is because of our innate need to control and correct imbalance and injustice. Sometimes life is just unfair, bad things happen and we'll never know why. Give it a chance. The movie is decent. Not a masterpiece by any stretch, but well worth a watch.",9.0,The Open House
120638,"Let me be the first to say that my knowledge of movie making is very limited, but I do know what I like. I thought the movie was good. There were both good things and bad things about the movie, but overall I liked it, and would pay to see it again. I think the plot was solid, and easy to follow, and never lost my interest. There were a couple of scenes that were somewhat predictable, but allowed the viewer to just enjoy the movie rather than have to analyze every little detail. The cinematography was excellent. The multiple camera angles, and the split screens with the animations were awesome. The shots from the helicopters were very cool. The editing was fabulous. The locations and scenery were better than most films. I would hate to see how much all of the permits cost to shoot on the public streets, especially the Vegas scenes! The car chase scenes actually had a reason for being there, they weren't there just for added action. I personally think the stunt driving, car chasing/racing scenes were better than a lot of blockbuster movies such as the remake of ""Gone in Sixty Seconds"" or either ""Fast and the Furious"" movies. For the most part the acting was excellent. I thought the lead male role (the soldier) could have been better cast. While his fight scenes were very well choreographed and believable to watch, he didn't look comfortable in the speaking parts. All the other casting and acting was great. Eddie Griffin supplied the perfect amount of humor to keep the audience entertained. The women, OH MY GOD, all of them, in every shot! Speechless! If I had seen this movie with my girlfriend, I would have been in trouble for having my tongue hang out of my mouth for an hour and a half! I have never seen so many beautiful women in one movie! Or so many beautiful cars! The balance between awesome cars and awesome women was unbelievable. For an independent film I would give this an A+. This was by far the most entertaining, edge of your seat, car racing film I hav...",9.0,Redline
120664,"I am 38 and laughed quite hysterically at this movie. We just returned from Sundance and I saw it with a crew of girlfriends whom I've know for 25 years. I would say the chicks will like it more than the guys since just about every bad thing that could happen to a girl happened to Rebecca (McCarthy) in this movie. I would love to know what others think.Jenny McCarthy did some great acting. I could feel her pain. I loved the music and plot of this movieWe stayed for the Q&A after the premiere and Jenny McCarthy and John Asher seemed genuinely thrilled and over the moon about the audience reaction. They were stoked and I could tell they were proud of their work.The performances by Eddie Kaye Thomas, Carmen Electra and Kam were fantastic.",9.0,Dirty Love
120790,"wow buddy that was crazy man. I never seen something so sick. this movie I don't know is a comedy is it action mystery whatever it works. It has it all women,guns, scorpians drugs. it wasn't to long because i get pretty bored some times and i was really bored but damn i really liked the oldest actor loggia man he hasn't had a role this good since frank in scarface he is crazy man. actually i forgot about independence day he was pretty good in that, but this is by far his best stuff, I would compare this movie with like say pulpfiction or very bad things or lost boys with a dash of stand by me. i want to see his other movie now.",9.0,Wild Seven
120997,"AfterDeath- is an intelligent existential twist on a Horror film. Relying less on ""Jump Scares"" it packs a more psychological punch.5 people wake up in the middle of nowhere realizing that they are unable leave - and that this is all some sort of punishment for the lives they have lead. At first it seems like not much of a punishment- but when 5 ""bad"" people are trapped together BAD things start to happen. The cast is talented, pretty and fun to look at, but it is the ugliness of their actions that keeps you watching.I found myself still thinking about some of the questions brought up in the film, days after.",9.0,AfterDeath
121046,"I agree with madhu m on some of her remarks about this movie but honestly speaking it was a pretty interesting movie.I watched it today and did not have to kick my butt.Why i found it to be good?well because for one thing these are the types of movies u see nowadays and classics like the ten commandments are simply gone.so we are left to judge upon these movies and so we have to pick some as our favourites.i would certainly not want to spoil it for anyone.go watch it becuz it will be u who had to decide whether the movie is good or not.simply agreeing to a bad review isnt going to justify your reasoning that it is bad.yes i agree that the scenes of murder of people in the prison were not at all SCARY but they still managed to give u the sence of isolated fear u would feel when u will be trapped in a prison in the middle of antactic being chased by a hungry snake who loves u just like a customer at mcdonalds loves his BigMac.so watch it and if u want to,enjoy it.9/10",9.0,New Alcatraz
121244,"Skimming over the past few reviews all I have seen is bad review after bad review. I get so sick of people harping on remakes and new additions as if the new movie takes anything away from the old movie. That's absolutely ridiculous. Even if the remake/addition is terrible they are completely separate movies. They may share a title and some common characters but that is it. The original will still be the same regardless of what comes after it.Either way, this movie is fantastic. I went in with high expectations because I was thrilled that the franchise wasn't dead and it didn't disappoint. A brand new spin on things with all the gore you could ask for. Anyone who says this movie is no good is either too dumb to understand why it played out as it did or still suckling on the teet of the theory that the remakes/sequels should follow the exact same path that a completely different director/set of writers/cast/etc. took almost 40 years ago. There is no bigger fan/critic of TCM movies and I loved this movie. The only reason it didn't get 10 out of 10 from me is because of the short length. But maybe I'm just upset that I didn't get to watch Heather for longer...lol GD movie communists.",9.0,Texas Chainsaw


## Vectorization with Count Vectorizer and TFIDF Vectorizer with Trigram

In [16]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(3,3), min_df=10,max_features=500)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(3,3),min_df=10,max_features=500)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature Importance with Logistic Regression and Count Vectorizer with Trigram

In [17]:
lgr = LogisticRegression()
lgr.fit(x_train_count,y_train)
lgr.score(x_test_count,y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i=i+1
print(importantfeature)

+--------------------------+-----------------------+
|         Feature          |         Score         |
+--------------------------+-----------------------+
|   absolutely no sense    |  -0.5398273134314296  |
|      acting not bad      |  -0.3832564325305681  |
|     acting not even      |   -1.688410137034301  |
|     acting not good      |  -1.4516398695357513  |
|    acting pretty good    |   0.6166665786649672  |
|  acting special effect   |  -0.4812949472839416  |
|     action scene not     |  -0.2891222653460352  |
|    action take place     |  0.14178574570383395  |
|      actor good job      |   0.7863858489848093  |
|     actually not bad     |  -0.5421904230996623  |
|   actually pretty good   |   0.3840414642625433  |
|     almost feel like     |   0.5683406461604616  |
|       bad bad bad        |  -1.7520545609887248  |
|       bad guy not        |   0.1599601642414163  |
|       bad not even       |  -1.8213014152533684  |
|    bad special effect    |  -0.7780925646687

## Feature Importance with Logistic Regression and TFIDF Vectorizer with Trigram

In [18]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf,y_train)
lgr.score(x_test_tfidf,y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i=i+1
print(importantfeature)

+--------------------------+------------------------+
|         Feature          |         Score          |
+--------------------------+------------------------+
|   absolutely no sense    |  -0.8983930486732936   |
|      acting not bad      |  -0.4757114186467219   |
|     acting not even      |  -1.8318210187346198   |
|     acting not good      |   -1.482964421150068   |
|    acting pretty good    |   0.719803262443405    |
|  acting special effect   |  -0.5832697649377273   |
|     action scene not     |  -0.2771986692437489   |
|    action take place     |  0.11014367284526874   |
|      actor good job      |   0.7842112998921575   |
|     actually not bad     |  -0.5406986691792346   |
|   actually pretty good   |   0.3764365560996298   |
|     almost feel like     |   0.5601572775882571   |
|       bad bad bad        |  -2.8537872322622344   |
|       bad guy not        |   0.2275448699467503   |
|       bad not even       |  -2.3299646598174784   |
|    bad special effect    |

## Vectorization with Count Vectorizer and TDIDF Vectorizer with 4-gram

In [19]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(4,4), min_df=10,max_features=500)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(4,4),min_df=10,max_features=500)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature Importance with Logistic Regression and Count Vectorizer with 4-gram

In [20]:
lgr = LogisticRegression()
lgr.fit(x_train_count,y_train)
lgr.score(x_test_count,y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i=i+1
print(importantfeature)

+------------------------------------------------+-----------------------+
|                    Feature                     |         Score         |
+------------------------------------------------+-----------------------+
|            actually soul better lot            |   0.2651675699529567  |
|     addictive consistently compelling show     |   0.3574180466279323  |
|        adjective describe good writing         |   0.2651675699529567  |
|                 ai not gon na                  |  -0.6295709994408026  |
|             also great also great              |   0.7948939207575981  |
|      always natural layered demonstrates       |   0.2651675699529567  |
|       annoying illogical decision making       |  -0.3578339391204768  |
|         another seen hence reiteration         |  -0.3948121149298373  |
|   appreciation frequent personal experience    |  -0.3841056387732617  |
|          atmosphere due excessive lot          |  -0.32119784168075644 |
|         atmospheric bea

## Feature Importance with Logistic Regression and TDIDF Vectorizer with 4-gram

In [21]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf,y_train)
lgr.score(x_test_tfidf,y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(tfidfvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i=i+1
print(importantfeature)

+------------------------------------------------+-----------------------+
|                    Feature                     |         Score         |
+------------------------------------------------+-----------------------+
|            actually soul better lot            |   0.1709286387623754  |
|     addictive consistently compelling show     |  0.24945208339817979  |
|        adjective describe good writing         |   0.1709286387623754  |
|                 ai not gon na                  |  -0.6629029474969075  |
|             also great also great              |   0.8772101523155988  |
|      always natural layered demonstrates       |   0.1709286387623754  |
|       annoying illogical decision making       |  -0.22965282508956633 |
|         another seen hence reiteration         |  -0.2627021321659269  |
|   appreciation frequent personal experience    |  -0.31142313581517633 |
|          atmosphere due excessive lot          |  -0.20688435847117187 |
|         atmospheric bea

## Vectorization with Count Vectorizer and TDIDF Vectorizer with unigram, bigram and trigram

In case of 4-grams the coefficient score is decreasing. Hence, removing it from our consideration

In [22]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1,3), min_df=10,max_features=5000)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1,3),min_df=10,max_features=5000)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature Selection with Chi squared

In [30]:
from sklearn.feature_selection import chi2
import numpy as np

N = 5000
Number = 1
featureselection = PrettyTable(["Unigram", "Bigram","Trigram"])

for category in train['Label'].unique():
    features_chi2 = chi2(x_train_tfidf, train['Label'])
    chi2score = features_chi2[0]
    # print(chi2score)

    scores = list(zip(tfidfvect.get_feature_names_out(), chi2score))
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    # print(category)
    # print(sorted_scores[:20])
    # print('\n\n')

    # feature_names = tfidfvect.get_feature_names_out()
    # top_feature_names = [feature_names[i] for i in chi2score.argsort()]
    
    top_feature_names = [ngram for ngram, score in sorted_scores]

    # indices = np.argsort(features_chi2[0])
    # feature_names = np.array(tfidfvect.get_feature_names_out())[indices]
    # print(f"Top Features for Category {category}: {feature_names[:10]}")
    # print(feature_names)

    unigrams = [x for x in top_feature_names if len(x.split(' ')) == 1]
    bigrams = [x for x in top_feature_names if len(x.split(' ')) == 2]
    trigrams = [x for x in top_feature_names if len(x.split(' ')) == 3]
    print("%s. %s :" % (Number,category))
    print("\t# Unigrams :\n\t. %s" %('\n\t. '.join(unigrams[:N])))
    print("\t# Bigrams :\n\t. %s" %('\n\t. '.join(bigrams[:N])))
    print("\t# Trigrams :\n\t. %s" %('\n\t. '.join(trigrams[:N])))
    Number += 1

1. 1 :
	# Unigrams :
	. worst
	. great
	. waste
	. bad
	. loved
	. terrible
	. enjoyed
	. awful
	. poor
	. nothing
	. worse
	. poorly
	. excellent
	. best
	. boring
	. perfect
	. no
	. love
	. well
	. amazing
	. wasted
	. minute
	. money
	. definitely
	. horrible
	. even
	. liked
	. fun
	. wonderful
	. redeeming
	. badly
	. stupid
	. action
	. script
	. superb
	. fantastic
	. awesome
	. lame
	. enjoy
	. critic
	. pleasantly
	. fails
	. avoid
	. save
	. favorite
	. brilliant
	. cheap
	. enjoyable
	. attempt
	. highly
	. annoying
	. job
	. least
	. beautiful
	. edge
	. performance
	. refreshing
	. pathetic
	. unfunny
	. supposed
	. unless
	. hilarious
	. different
	. solid
	. pointless
	. zero
	. family
	. garbage
	. beautifully
	. mess
	. dull
	. see
	. painful
	. acting
	. unique
	. negative
	. laughable
	. entertaining
	. outstanding
	. pile
	. skip
	. excuse
	. also
	. surprised
	. wasting
	. mediocre
	. unfortunately
	. underrated
	. crap
	. remotely
	. ridiculous
	. intense
	. flat

In [31]:
features_chi2 = chi2(x_train_tfidf, train['Label'])
chi2score = features_chi2[0]

scores = list(zip(tfidfvect.get_feature_names_out(), chi2score))
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
top_feature_names = [ngram for ngram, score in sorted_scores]

unigrams = [x for x in top_feature_names if len(x.split(' ')) == 1]
bigrams = [x for x in top_feature_names if len(x.split(' ')) == 2]
trigrams = [x for x in top_feature_names if len(x.split(' ')) == 3]

print("\t# Unigrams :\n\t. %s" %('\n\t. '.join(unigrams[:N])))
print("\t# Bigrams :\n\t. %s" %('\n\t. '.join(bigrams[:N])))
print("\t# Trigrams :\n\t. %s" %('\n\t. '.join(trigrams[:N])))

	# Unigrams :
	. worst
	. great
	. waste
	. bad
	. loved
	. terrible
	. enjoyed
	. awful
	. poor
	. nothing
	. worse
	. poorly
	. excellent
	. best
	. boring
	. perfect
	. no
	. love
	. well
	. amazing
	. wasted
	. minute
	. money
	. definitely
	. horrible
	. even
	. liked
	. fun
	. wonderful
	. redeeming
	. badly
	. stupid
	. action
	. script
	. superb
	. fantastic
	. awesome
	. lame
	. enjoy
	. critic
	. pleasantly
	. fails
	. avoid
	. save
	. favorite
	. brilliant
	. cheap
	. enjoyable
	. attempt
	. highly
	. annoying
	. job
	. least
	. beautiful
	. edge
	. performance
	. refreshing
	. pathetic
	. unfunny
	. supposed
	. unless
	. hilarious
	. different
	. solid
	. pointless
	. zero
	. family
	. garbage
	. beautifully
	. mess
	. dull
	. see
	. painful
	. acting
	. unique
	. negative
	. laughable
	. entertaining
	. outstanding
	. pile
	. skip
	. excuse
	. also
	. surprised
	. wasting
	. mediocre
	. unfortunately
	. underrated
	. crap
	. remotely
	. ridiculous
	. intense
	. flat
	. bar