<a href="https://colab.research.google.com/github/pragyasingh1729/SentimentAnalysis/blob/main/SA_FeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install nltk --q
!pip install kaggle --q

In [3]:
import os
import sys
import json
from zipfile import ZipFile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from wordcloud import WordCloud, STOPWORDS

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords

import re, string, unicodedata
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from string import punctuation
from prettytable import PrettyTable #used to create and display ASCII tables in a readable format

import warnings
warnings.filterwarnings('ignore')

In [4]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
!kaggle datasets download -d crisbam/imdb-dataset-of-65k-movie-reviews-and-translation


Dataset URL: https://www.kaggle.com/datasets/crisbam/imdb-dataset-of-65k-movie-reviews-and-translation
License(s): CC0-1.0
Downloading imdb-dataset-of-65k-movie-reviews-and-translation.zip to /content
 99% 145M/147M [00:04<00:00, 42.1MB/s]
100% 147M/147M [00:04<00:00, 33.6MB/s]


In [6]:
with ZipFile('imdb-dataset-of-65k-movie-reviews-and-translation.zip', 'r') as zipref:
  zipref.extractall()

In [7]:
df = pd.read_csv('IMDB-Dataset.csv')


## Data preprocessing

In [8]:
stop_words = stopwords.words('english')
stop_words.remove("not")

new_stopwords = ["might", "could", "one", "film", "movie", "would", "shall"]
stop_words.extend(new_stopwords)

stop_words = set(stop_words)

In [9]:
contraction_mapping = {
    "won't": "would not",
    "can't": "can not",
    "don't": "do not",
    "shouldn't": "should not",
    "needn't": "need not",
    "hasn't": "has not",
    "haven't": "have not",
    "weren't": "were not",
    "mightn't": "might not",
    "didn't": "did not"
}

In [10]:
def preprocessing_text(text):

  for contraction, expanded_form in contraction_mapping.items():
    text = re.sub(r"\b" + re.escape(contraction) + r"\b", expanded_form, text)

    text = text.lower()
    text = re.sub('<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = re.sub('https\S\s',  ' ', text)

    text = nltk.word_tokenize(text)

    text = [word for word in text if word not in stop_words]

    lmtr = WordNetLemmatizer()

    text = [lmtr.lemmatize(word) for word in text]

    return " ".join(text)

In [11]:
df['Clean_Review'] = df['Reviews'].apply(preprocessing_text)

In [12]:
pd.options.display.max_colwidth = 1000
df.head(3)

Unnamed: 0,Ratings,Reviews,Movies,Resenhas,Clean_Review
0,1.0,"*Disclaimer: I only watched this movie as a conditional agreement. And I see films for free. I wouldn't be caught dead giving my hard earned money to these idiots.Well, to explain the depth of this 'film', I could write my shortest review, ever. Don't see this movie. It is by far the stupidest, lamest, most lazy, and unbelievably UNFUNNY movie I have ever seen. It is a total disaster. But since my hatred for this movie, and the others like it, extends far beyond one viewing, I think I'll go on for a bit.I don't know any of the people in the movie besides Carmen Electra, Vanessa Minnillo, and Kim Kardashian, but it doesn't matter. They're all horrible, though I think that was the point. The editing is flat out horrible, and possibly blatant continuity errors make this crapfast even crappier than I thought it would be. Now I know that these films are not supposed to be serious at all, but come on, it's film-making 101 that if someone gets a minor facial cut, it should be there in the...",Disaster Movie,"* Isenção de responsabilidade: eu só assisti esse filme como um acordo condicional. E eu vejo filmes de graça. Eu não seria pego morto dando meu dinheiro suado a esses idiotas. Bem, para explicar a profundidade desse 'filme', eu poderia escrever minha crítica mais curta de todos os tempos. Não vê este filme. É de longe o filme mais estúpido, lamenta, preguiçoso e inacreditavelmente UNFUNNY que eu já vi. É um desastre total. Mas como o meu ódio por este filme e por outros, se estende muito além de uma exibição, acho que vou continuar um pouco. Não conheço nenhuma das pessoas do filme além de Carmen Electra, Vanessa Minnillo, e Kim Kardashian, mas isso não importa. Eles são todos horríveis, embora eu ache que esse seja o ponto. A edição é horrível e, possivelmente, erros de continuidade flagrantes tornam essa porcaria ainda mais horrível do que eu pensava. Agora eu sei que esses filmes não devem ser sérios, mas vamos lá, é o cinema 101 que se alguém fizer um pequeno corte facial, ele...",disclaimer watched conditional agreement see film free caught dead giving hard earned money idiot well explain depth write shortest review ever see far stupidest lamest lazy unbelievably unfunny ever seen total disaster since hatred others like extends far beyond viewing think go bit know people besides carmen electra vanessa minnillo kim kardashian matter horrible though think point editing flat horrible possibly blatant continuity error make crapfast even crappier thought know film not supposed serious come making someone get minor facial cut next shot someone get cut sword blood least cut though since narnia film get away give disaster pas joke thoughtless mindless physical gag obviously take popular movie last year late well including best picture nominee know saddest thing stupid movie care much money make many cameo sorry as excuse film taking away job actor writer director truly deserve attention lionsgate thought better taste ashamed making kind crap jason friedberg aaron s...
1,1.0,"I am writing this in hopes that this gets put over the previous review of this ""film"". How anyone can find this slop entertaining is completely beyond me. First of all a spoof film entitled ""Disaster Movie"", should indeed be a spoof on disaster films. Now I have seen 1 (yes count them, 1) disaster film being spoofed, that being ""Twister"". How does Juno, Iron Man, Batman, The Hulk, Alvin and the Chipmunks, Amy Winehouse, or Hancock register as Disaster films? Selzterwater and Failburg once again have shown that they lack any sort of writing skill and humor. Having unfortunately been tortured with Date Movie and Epic Movie I know exactly what to expect from these two...no plot, no jokes just bad references and cheaply remade scenes from other films. Someone should have informed them that satire is more than just copy and paste from one film to another, though I shouldn't say that because some of these actually just seem to be taken from trailers.There is nothing clever or witty or re...",Disaster Movie,"Estou escrevendo isso na esperança de que isso seja colocado sobre a revisão anterior deste ""filme"". Como alguém pode achar divertido esse desleixo está completamente além de mim. Antes de mais nada, um filme de paródia intitulado ""Filme de desastre"" deveria ser, de fato, uma paródia de filmes de desastre. Agora eu já vi 1 (sim, conte-os, 1) filme de desastre sendo falsificado, sendo ""Twister"". Como Juno, Homem de Ferro, Batman, O Hulk, Alvin e os Esquilos, Amy Winehouse ou Hancock se registram como filmes de Desastre? Selzterwater e Failburg mostraram mais uma vez que não possuem nenhum tipo de habilidade e humor de escrita. Infelizmente, tendo sido torturado com Date Movie e Epic Movie, sei exatamente o que esperar desses dois ... nenhum enredo, nenhuma piada, apenas más referências e cenas refeitas de outros filmes. Alguém deveria ter informado a eles que a sátira é mais do que apenas copiar e colar de um filme para outro, embora eu não deva dizer isso porque alguns deles realme...",writing hope get put previous review anyone find slop entertaining completely beyond first spoof entitled disaster indeed spoof disaster film seen yes count disaster spoofed twister juno iron man batman hulk alvin chipmunk amy winehouse hancock register disaster film selzterwater failburg shown lack sort writing skill humor unfortunately tortured date epic know exactly expect two plot joke bad reference cheaply remade scene film someone informed satire copy paste another though say actually seem taken trailer nothing clever witty remotely smart way two write believe people still pay see travesty insult audience though enjoy film doubt smart enough realize rating unfortunately not number low enough yes includes negative rate deserves top worst film time right date epic faliure mean meet spartan rather forced hour manos hand fate marathon watch slop
2,1.0,"Really, I could write a scathing review of this turd sandwich, but instead, I'm just going to be making a few observations and points I've deduced.There's just no point in watching these movies anymore. Does any reader out there remember Scary Movie? Remember how it was original with a few comedic elements to it? There was slapstick, some funny lines, it was a pretty forgettable comedy, but it was worth the price of admission. Well, That was the last time this premise was funny. STOP MAKING THESE MOVIES. PLEASE.I could call for a boycott of these pieces of monkey sh*t, but we all know there's going to be a line up of pre pubescent annoying little buggers, spouting crappy one liners like, ""THIS IS SPARTA!"" and, ""IM RICK JAMES BITCH"" so these movies will continue to make some form of monetary gain, considering the production value of this movie looks like it cost about 10 cents to make.Don't see this movie. Don't spend any money on it. Go home, rent Airplane, laugh your ass off, and ...",Disaster Movie,"Realmente, eu poderia escrever uma crítica contundente sobre esse sanduíche de cocô, mas, em vez disso, vou fazer algumas observações e pontos que deduzi. Não há mais sentido assistir a esses filmes. Algum leitor por aí se lembra do filme de terror? Lembra como era original, com alguns elementos cômicos? Havia palhaçada, algumas frases engraçadas, era uma comédia bastante esquecível, mas valia o preço da entrada. Bem, essa foi a última vez que essa premissa foi engraçada. PARE DE FAZER ESTES FILMES. POR FAVOR, eu poderia pedir um boicote a esses pedaços de macaco, mas todos sabemos que haverá uma fila de buggers irritantes e pré-pubescentes, jorrando uns forros ruins como: ""ISTO É SPARTA!"" e ""IM RICK JAMES BITCH"", para que esses filmes continuem gerando algum ganho monetário, considerando que o valor de produção deste filme parece custar cerca de 10 centavos de dólar. Não gaste dinheiro com isso. Vá para casa, alugue a Airplane, ria e julgue silenciosamente as pessoas que estão fal...",really write scathing review turd sandwich instead going making observation point deduced point watching movie anymore reader remember scary remember original comedic element slapstick funny line pretty forgettable comedy worth price admission well last time premise funny stop making movie please call boycott piece monkey sh know going line pre pubescent annoying little bugger spouting crappy liner like sparta im rick james bitch movie continue make form monetary gain considering production value look like cost cent make see spend money go home rent airplane laugh as silently judge people talking monday favor


## Feature Engineering

In [13]:
## Instead of having labels from 1 to 10. Now, we will map positive review to 1 and negative review to 0

df['Label'] = df['Ratings'].apply(lambda x: '1' if x >=7 else ('0' if x <=4 else '2'))



In [14]:
## getting rid of neutral reviews, labelled by '2'

df = df[df['Label'] < '2']
###NOTE: here we can do lexical comparison of strings

In [15]:
df['Label'].value_counts()

Label
0    60000
1    60000
Name: count, dtype: int64

In [16]:
### performing the train test split with only clean review
X_train, X_test, y_train, y_test = train_test_split(df['Clean_Review'], df['Label'], test_size = 0.2, random_state = 42, stratify = df['Label'], shuffle = True)


In [17]:
print(X_train.shape, y_train.shape, X_test.shape, df.shape)

(96000,) (96000,) (24000,) (120000, 6)


In [18]:
countvect = CountVectorizer(analyzer = "word", ngram_range=(1,1), min_df = 10, max_features = 500)
tfidfvect = TfidfVectorizer(analyzer = "word", ngram_range=(1,1), min_df = 10, max_features = 500)

X_train_count = countvect.fit_transform(X_train).toarray()
X_test_count  = countvect.fit_transform(X_test).toarray()
X_train_tfidf = tfidfvect.fit_transform(X_train).toarray()
X_test_tfidf  = tfidfvect.fit_transform(X_test).toarray()



In [19]:
X_train_count.shape

(96000, 500)

###Feature importance using Count Vectorizer and logistic regression
NOTE: each words are taken as feature, as defined by setting analyzer = 'word'



In [25]:
lgr = LogisticRegression()
lgr.fit(X_train_count, y_train)
lgr.score(X_test_count, y_test)

i=0

important_feature = PrettyTable(["Feature", "Score"])

for feature, score in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
  if i < 100:
    important_feature.add_row([feature, score])
    i+=1

print(important_feature)

+----------------+------------------------+
|    Feature     |         Score          |
+----------------+------------------------+
|      able      |  0.09938060758878961   |
|   absolutely   |  0.037048525369790405  |
|      act       |  -0.17220418546468252  |
|     acting     |  -0.1740099984204471   |
|     action     |  0.27129584326022077   |
|     actor      |  -0.13747197501041908  |
|    actress     |  -0.10096493053389838  |
|    actually    | -0.018558319450560405  |
|      add       |  0.050795104588468754  |
|     adult      |  0.03192744788678324   |
|      age       |  0.10773924688115896   |
|     alien      |  -0.03843611250423535  |
|     almost     |  -0.02378733002021912  |
|     alone      |  0.09592874702535234   |
|     along      |  -0.20363293876193875  |
|    already     |  0.11119282125030601   |
|      also      |  0.16744446937227178   |
|    although    |  0.18751656919328502   |
|     always     |   0.8626863111751183   |
|    amazing     |   0.062964442

### Learning from above table
- The table helps you understand which words your model has learned to associate with positive or negative outcomes. This can be useful for verifying that the model's behavior aligns with your expectations.
- You can identify the most relevant features (words) that the model uses for making predictions. This can be valuable for feature selection or understanding model biases.
- If certain words have unexpectedly high or low scores, it might indicate areas where the model's understanding could be improved, potentially guiding further data preprocessing or model tuning.


###Feature importance using tfidf Vectorizer and logistic regression

In [26]:
lgr = LogisticRegression()
lgr.fit(X_train_tfidf, y_train)
lgr.score(X_test_tfidf, y_test)

important_feature = PrettyTable(["Feature", "Score"])

i=0
for feature, score in zip(tfidfvect.get_feature_names_out(), lgr.coef_[0]):
  if i <100:
    important_feature.add_row([feature, score])
    i+=1

print(important_feature)

+----------------+-----------------------+
|    Feature     |         Score         |
+----------------+-----------------------+
|      able      |   0.6363607558155757  |
|   absolutely   |  0.31238500919775253  |
|      act       |  -1.2551677376760397  |
|     acting     |  -1.6263253068123347  |
|     action     |   2.649110473835511   |
|     actor      |  -1.3151738162030147  |
|    actress     |  -0.5701981351472697  |
|    actually    |  -0.18668245928139435 |
|      add       |   0.1542702187777356  |
|     adult      |   0.4040479278820225  |
|      age       |   0.8595928312293508  |
|     alien      |  -0.16147819166904157 |
|     almost     |  -0.3631716014601592  |
|     alone      |   0.6498156541472546  |
|     along      |   -1.185089568921238  |
|    already     |   1.3099748693526854  |
|      also      |   1.0473750835433975  |
|    although    |   1.6657275559844718  |
|     always     |   4.7411221267422485  |
|    amazing     |  0.42456028567299076  |
|    americ

### Comparing tfidf and count vectorizer side by side for bigrams

In [27]:
countvect = CountVectorizer(analyzer = "word", ngram_range=(2,2), min_df = 10, max_features = 500)
tfidfvect = TfidfVectorizer(analyzer = "word", ngram_range=(2,2), min_df = 10, max_features = 500)

X_train_count = countvect.fit_transform(X_train).toarray()
X_test_count  = countvect.fit_transform(X_test).toarray()
X_train_tfidf = tfidfvect.fit_transform(X_train).toarray()
X_test_tfidf  = tfidfvect.fit_transform(X_test).toarray()


In [28]:
lgr_count = LogisticRegression()
lgr_tfidf = LogisticRegression()

lgr_tfidf.fit(X_train_tfidf, y_train)
lgr_tfidf.score(X_test_tfidf, y_test)

lgr_count.fit(X_train_count, y_train)
lgr_count.score(X_test_count, y_test)


important_feature = PrettyTable(["Feature", "Score_tfidf", "Score_count"])

i=0
for feature, score_tfidf, score_count in zip(tfidfvect.get_feature_names_out(), lgr_tfidf.coef_[0], lgr_count.coef_[0]):
   if i < 100:
    important_feature.add_row([feature, score_tfidf, score_count])
    i+=1

print(important_feature)


+-----------------------+-----------------------+-----------------------+
|        Feature        |      Score_tfidf      |      Score_count      |
+-----------------------+-----------------------+-----------------------+
|   absolutely nothing  |   -2.607118367503641  |   -1.308899222581827  |
|     academy award     |   1.8350387460553295  |   0.9315850046291839  |
|        act like       |  -0.7641871095821977  |  -0.37529917309908284 |
|       acting bad      |  -2.3940436181391567  |   -1.213028515491695  |
|      acting good      |   1.1432795269692229  |   0.5750046135649827  |
|      acting great     |   1.9029828141193554  |   1.081159656468725   |
|       acting not      |  -0.6669178040738794  |  -0.2315477320370192  |
|    acting terrible    |  -3.5752713837180146  |  -1.9660095049100577  |
|      action film      |   1.1292989398501467  |  0.34967057484018454  |
|      action flick     |   0.5229125499044008  |  0.15668291607905926  |
|      action movie     |   1.26454124

### Vectorization with unigram, bigram, trigram

 In the context of feature selection for sentiment analysis, the Chi-squared test helps to identify which features (words or n-grams) are most strongly associated with the target classes (e.g., positive or negative reviews).

In [29]:
countvect = CountVectorizer(analyzer = "word", ngram_range=(1,3), min_df = 10, max_features = 500)
tfidfvect = TfidfVectorizer(analyzer = "word", ngram_range=(1,3), min_df = 10, max_features = 500)

X_train_count = countvect.fit_transform(X_train).toarray()
X_test_count  = countvect.fit_transform(X_test).toarray()
X_train_tfidf = tfidfvect.fit_transform(X_train).toarray()
X_test_tfidf  = tfidfvect.fit_transform(X_test).toarray()

In [43]:
from sklearn.feature_selection import chi2
import numpy as np

N = 100
Number = 1

for category in y_train.unique():
    features_chi2 = chi2(X_train_tfidf, y_train == category)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidfvect.get_feature_names_out())[indices]

    unigrams = [x for x in feature_names if len(x.split(' ')) == 1]
    bigrams = [x for x in feature_names if len(x.split(' ')) == 2]
    trigrams = [x for x in feature_names if len(x.split(' ')) == 3]

    print("%s. %s :" % (Number,category))
    print("\t# Unigrams :\n\t. %s" %('\n\t. '.join(unigrams[-N:])))
    print("\t# Bigrams :\n\t. %s" %('\n\t. '.join(bigrams[-N:])))
    print("\t# Trigrams :\n\t. %s" %('\n\t. '.join(trigrams[-N:])))
    Number += 1

1. 0 :
	# Unigrams :
	. someone
	. death
	. called
	. guy
	. trying
	. cut
	. remember
	. element
	. love
	. nice
	. come
	. turn
	. plot
	. pretty
	. title
	. recommend
	. close
	. oh
	. episode
	. original
	. point
	. horrible
	. going
	. human
	. season
	. family
	. huge
	. entire
	. cinematography
	. relationship
	. despite
	. stuff
	. group
	. cinema
	. world
	. light
	. comic
	. none
	. anyone
	. truly
	. although
	. lack
	. already
	. run
	. sure
	. see
	. head
	. dialogue
	. idea
	. least
	. enough
	. around
	. fall
	. take
	. right
	. save
	. hero
	. back
	. job
	. enjoy
	. high
	. perhaps
	. looked
	. american
	. use
	. acting
	. father
	. change
	. ending
	. line
	. couple
	. style
	. well
	. boring
	. etc
	. script
	. action
	. decent
	. wonderful
	. beginning
	. money
	. minute
	. loved
	. full
	. always
	. home
	. nothing
	. performance
	. enjoyable
	. worse
	. low
	. excellent
	. power
	. blood
	. audience
	. thing
	. waste
	. away
	. got
	. worst
	# Bigrams :
	. sci fi
