* NAME: Rishabh Patil
* SAP: 60009200056
* BATCH: D12

# **Lab 2: Implement Sentiment Analysis on Linguistic Data**

In [None]:
import re
import os
import nltk
import math
import heapq
import gensim
import nltk.corpus
import numpy as np
import string as st
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk import PorterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Fetch only the NLTK resources this notebook uses instead of the entire
# NLTK data collection: the English stop-word list and the WordNet data
# (plus the Open Multilingual WordNet index) needed by WordNetLemmatizer.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

**Loading the data**

In [None]:
# Location of the IMDB reviews CSV and how many of its rows to read.
DATA_PATH = "/content/IMDB Dataset.csv"
N_ROWS = 2000

df = pd.read_csv(DATA_PATH, engine='python', nrows=N_ROWS)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(2000, 2)

**Preprocessing the data**

In [None]:
# Shared NLTK normalizers: the Porter stemmer and the WordNet lemmatizer.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess(sentence):
    r"""Turn one raw review into a list of normalized tokens.

    Steps, in order: cast to ``str`` and lowercase; drop the literal
    ``{html}`` marker; strip anything that looks like an HTML tag; remove
    ``http...`` URLs and digit runs; tokenize on ``\w+``; keep only tokens
    longer than two characters that are not English stop words; Porter-stem
    each token and then pass the stem through the WordNet lemmatizer.

    Parameters
    ----------
    sentence : Any
        The raw review. It is passed through ``str()`` first, so non-string
        values are processed as their string form.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    text = str(sentence).lower().replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)      # HTML-like tags
    text = re.sub(r'http\S+', '', text)    # URLs
    text = re.sub('[0-9]+', '', text)      # digit runs
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Build the stop-word set once per call. Calling stopwords.words('english')
    # inside the comprehension re-reads the list for every single token and
    # then scans it linearly; a set gives the same filtering with O(1) lookups.
    stop_words = set(stopwords.words('english'))
    kept = [tok for tok in tokens if len(tok) > 2 and tok not in stop_words]

    stems = [stemmer.stem(tok) for tok in kept]
    return [lemmatizer.lemmatize(tok) for tok in stems]

def preprocess_join(li):
    """Concatenate the tokens in ``li`` into one string separated by single spaces."""
    separator = " "
    return separator.join(li)

# Token list per review, then the same tokens re-joined into a single string.
df['preproc_rev'] = df['review'].map(preprocess)
df['preproc_rev_sentence'] = df['preproc_rev'].map(preprocess_join)

In [None]:
# Preview the frame, now including the preprocessed columns.
df.head()

Unnamed: 0,review,sentiment,preproc_rev,preproc_rev_sentence
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, episod, hook, ri...",one review mention watch episod hook right exa...
1,A wonderful little production. <br /><br />The...,positive,"[wonder, littl, product, film, techniqu, unass...",wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonder, way, spend, time, hot, summe...",thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,"[basic, famili, littl, boy, jake, think, zombi...",basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visual, st...",petter mattei love time money visual stun film...


# **Feature Extraction**

# **1. Bag of Words**

In [None]:
# Number of most frequent corpus tokens kept as bag-of-words features.
BOW_VOCAB_SIZE = 100

# Corpus-wide frequency of every token.
word2count = {}
for tokens in df['preproc_rev']:
    for token in tokens:
        word2count[token] = word2count.get(token, 0) + 1

# The BOW_VOCAB_SIZE most frequent tokens, most frequent first.
freq_words = heapq.nlargest(BOW_VOCAB_SIZE, word2count, key=word2count.get)

# One binary row per review: 1 where the review contains the vocabulary word.
# Each review is converted to a set first so every membership test is a hash
# lookup instead of a scan over the review's token list.
X = []
for tokens in df['preproc_rev']:
    present = set(tokens)
    X.append([1 if word in present else 0 for word in freq_words])
X_bow = np.asarray(X)

In [None]:
# Store each review's binary bag-of-words row (one array per row of X_bow)
# alongside the review, then preview the frame.
df['bag_of_words'] = list(X_bow)
df.head()

Unnamed: 0,review,sentiment,preproc_rev,preproc_rev_sentence,bag_of_words
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, episod, hook, ri...",one review mention watch episod hook right exa...,"[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, ..."
1,A wonderful little production. <br /><br />The...,positive,"[wonder, littl, product, film, techniqu, unass...",wonder littl product film techniqu unassum old...,"[0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, ..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonder, way, spend, time, hot, summe...",thought wonder way spend time hot summer weeke...,"[0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, ..."
3,Basically there's a family where a little boy ...,negative,"[basic, famili, littl, boy, jake, think, zombi...",basic famili littl boy jake think zombi closet...,"[1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visual, st...",petter mattei love time money visual stun film...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."


**Test-Train Split**

In [None]:
# Hold out 20% of the bag-of-words rows for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    list(X_bow),
    list(df['sentiment']),
    test_size=0.2,
    random_state=41,
)

**Logistic Regression**

In [None]:
# Fit a default logistic-regression classifier on the bag-of-words features
# and predict labels for the held-out rows (fit() returns the estimator).
lr = LogisticRegression()
y_pred = lr.fit(X_train, Y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the bag-of-words model.
print(classification_report(y_true=Y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    negative       0.72      0.69      0.70       196
    positive       0.71      0.74      0.72       204

    accuracy                           0.71       400
   macro avg       0.72      0.71      0.71       400
weighted avg       0.72      0.71      0.71       400



In [None]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=Y_test, y_pred=y_pred)

array([[136,  60],
       [ 54, 150]])

In [None]:
# Overall accuracy of the bag-of-words model on the held-out rows.
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(f"Accuracy of Logistic Regression using Bag of Words: {100*accuracy:.2f} %")

Accuracy of Logistic Regression using Bag of Words: 71.50 %


# **2. TF-IDF**

In [None]:
# Class balance of the sentiment labels.
df['sentiment'].value_counts()

positive    1005
negative     995
Name: sentiment, dtype: int64

In [None]:
# Inputs (joined preprocessed text) and labels for the TF-IDF experiment.
X, Y = df['preproc_rev_sentence'], df['sentiment']

In [None]:
# Read the text from the dataframe column rather than from X: this cell
# rebinds X to the sparse TF-IDF matrix, so feeding X back into
# fit_transform() would make the cell fail when it is executed a second time.
# NOTE(review): the vectorizer is fitted on every review before the
# train/test split below, so the vocabulary and IDF weights also reflect the
# held-out reviews; consider splitting first and fitting on the training
# part only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['preproc_rev_sentence'])

In [None]:
# (number of reviews, number of TF-IDF vocabulary terms).
X.shape

(2000, 17039)

In [None]:
# Display the repr of the matrix returned by fit_transform().
X

<2000x17039 sparse matrix of type '<class 'numpy.float64'>'
	with 181216 stored elements in Compressed Sparse Row format>

**Test-Train Split**

In [None]:
# Stratified 80/20 split so both parts keep the label proportions of Y.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=4, stratify=Y
)

In [None]:
# Shapes of the training and test feature matrices.
X_train.shape, X_test.shape

((1600, 17039), (400, 17039))

**Logistic Regression**

In [None]:
# Fresh, unfitted classifier for the TF-IDF features (rebinds `lr`).
lr = LogisticRegression()

In [None]:
# Print the stored entries of the sparse training matrix as
# (row, column) -> weight pairs.
print(X_train)

  (0, 11329)	0.15549766442584273
  (0, 1915)	0.12203338110759833
  (0, 15471)	0.0804278215557911
  (0, 1391)	0.08220586224786493
  (0, 13977)	0.08463013566856516
  (0, 10922)	0.24088943274797786
  (0, 3498)	0.15120033930850418
  (0, 14394)	0.24088943274797786
  (0, 7273)	0.12297070132003193
  (0, 8837)	0.11668558970792138
  (0, 4966)	0.1504091129174525
  (0, 11680)	0.11104605799997921
  (0, 1956)	0.1149301779223111
  (0, 8866)	0.12112404482305597
  (0, 7906)	0.12393778265194805
  (0, 11706)	0.16926535880955157
  (0, 11840)	0.1986622086931271
  (0, 118)	0.08481091069985004
  (0, 4979)	0.10154143358321649
  (0, 10485)	0.4466573109994257
  (0, 1173)	0.11213794921755664
  (0, 7150)	0.1605024180729452
  (0, 11941)	0.17570194016872576
  (0, 342)	0.14743499598653062
  (0, 11207)	0.20272964212779598
  :	:
  (1599, 15160)	0.14922065911275817
  (1599, 6131)	0.050402894514751474
  (1599, 1182)	0.06682559326971982
  (1599, 7576)	0.04917140013388934
  (1599, 16754)	0.05302890741490997
  (1599, 1694

In [None]:
# Train on the TF-IDF training rows and predict the held-out rows
# (fit() returns the estimator, so the calls can be chained).
y_pred = lr.fit(X_train, Y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the TF-IDF model.
print(classification_report(y_true=Y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    negative       0.82      0.81      0.82       199
    positive       0.81      0.83      0.82       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [None]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=Y_test, y_pred=y_pred)

array([[161,  38],
       [ 35, 166]])

In [None]:
# Overall accuracy of the TF-IDF model on the held-out rows.
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(f"Accuracy of Logistic Regression using TF-IDF: {100*accuracy:.2f} %")

Accuracy of Logistic Regression using TF-IDF: 81.75 %


**Predicting on Custom Data**

In [None]:
# The vectorizer was fitted on text that had been through preprocess() and
# preprocess_join(), so new text must take the same route before transform().
# Raw text would leave inflected words unmatched against the stemmed
# vocabulary and keep stop words/digits the model never saw.
custom_review = "AR Rahman Sir deserved an Oscar for 'Roja' back in 1992; not for 'Jai Ho'."
lr.predict(tfidf.transform([preprocess_join(preprocess(custom_review))]))

array(['positive'], dtype=object)

In [None]:
# Apply the same preprocessing used for the training text before vectorising;
# the TF-IDF vocabulary consists of stemmed, stop-word-free tokens, so raw
# text would mostly miss it.
custom_review = "Oppenheimer was HELL of an experience in there (pun intended)! Blew my mind with its cinematography!"
lr.predict(tfidf.transform([preprocess_join(preprocess(custom_review))]))

array(['positive'], dtype=object)

# **3. Word2Vec**

In [None]:
# Train Word2Vec embeddings on the tokenised reviews. A fixed seed together
# with a single worker thread makes training repeatable between runs; per the
# gensim documentation, bit-identical results across interpreter launches
# additionally require PYTHONHASHSEED to be set.
w2v = gensim.models.Word2Vec(
    df['preproc_rev'],
    window=10,
    min_count=2,
    seed=42,
    workers=1,
)

In [None]:
# Show only the first entries of the vocabulary instead of dumping every
# token into the cell output (gensim orders index_to_key by descending
# frequency by default).
print(w2v.wv.index_to_key[:50])

['movi', 'film', 'one', 'like', 'make', 'time', 'get', 'see', 'good', 'watch', 'charact', 'even', 'would', 'stori', 'realli', 'well', 'scene', 'much', 'look', 'show', 'bad', 'way', 'also', 'think', 'play', 'great', 'love', 'peopl', 'first', 'act', 'end', 'thing', 'made', 'know', 'plot', 'want', 'say', 'come', 'work', 'could', 'mani', 'actor', 'never', 'seem', 'littl', 'year', 'seen', 'take', 'two', 'best', 'tri', 'life', 'man', 'ever', 'give', 'better', 'perform', 'use', 'feel', 'actual', 'find', 'someth', 'still', 'part', 'back', 'real', 'director', 'lot', 'funni', 'interest', 'old', 'woman', 'guy', 'enjoy', 'go', 'star', 'new', 'though', 'anoth', 'world', 'day', 'cast', 'origin', 'role', 'set', 'everi', 'quit', 'girl', 'noth', 'turn', 'believ', 'big', 'comedi', 'point', 'live', 'howev', 'around', 'direct', 'effect', 'line', 'fan', 'thought', 'long', 'minut', 'pretti', 'start', 'must', 'got', 'music', 'least', 'script', 'fact', 'laugh', 'wonder', 'right', 'kill', 'done', 'alway', 'com

In [None]:
def document_vector(doc):
    """Average the Word2Vec vectors of the tokens in ``doc``.

    Tokens that are not in the trained model's vocabulary are ignored.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``w2v.wv.vector_size``: the element-wise mean of
        the known tokens' vectors, or all zeros when no token is known.
    """
    # key_to_index is a dict, so membership is a hash lookup; index_to_key is
    # a list and would be scanned once per token.
    vocab = w2v.wv.key_to_index
    known = [word for word in doc if word in vocab]
    if not known:
        # Nothing to average (empty review or only out-of-vocabulary tokens):
        # return a neutral zero vector instead of failing.
        return np.zeros(w2v.wv.vector_size, dtype=np.float32)
    return np.mean(w2v.wv[known], axis=0)
# Attach one averaged embedding per review and preview the result.
df['word2vec'] = df['preproc_rev'].apply(document_vector)
df.head()

Unnamed: 0,review,sentiment,preproc_rev,preproc_rev_sentence,bag_of_words,word2vec
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, episod, hook, ri...",one review mention watch episod hook right exa...,"[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, ...","[-0.23736286, 0.47463688, 0.12823063, -0.00721..."
1,A wonderful little production. <br /><br />The...,positive,"[wonder, littl, product, film, techniqu, unass...",wonder littl product film techniqu unassum old...,"[0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, ...","[-0.27653748, 0.53202003, 0.13394889, -0.02087..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonder, way, spend, time, hot, summe...",thought wonder way spend time hot summer weeke...,"[0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, ...","[-0.24846025, 0.4948961, 0.118055664, -0.00375..."
3,Basically there's a family where a little boy ...,negative,"[basic, famili, littl, boy, jake, think, zombi...",basic famili littl boy jake think zombi closet...,"[1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...","[-0.24569897, 0.5016247, 0.10228299, 0.0172167..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visual, st...",petter mattei love time money visual stun film...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[-0.26387003, 0.5198512, 0.11847945, -0.006244..."


**Test-Train Split**

In [None]:
# Hold out 20% of the averaged-embedding rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    list(df['word2vec']),
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

**Logistic Regression**

In [None]:
# Fit a default logistic-regression classifier on the Word2Vec document
# vectors and predict labels for the held-out rows.
lr = LogisticRegression()
y_pred = lr.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the Word2Vec model.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    negative       0.57      0.56      0.57       195
    positive       0.59      0.59      0.59       205

    accuracy                           0.58       400
   macro avg       0.58      0.58      0.58       400
weighted avg       0.58      0.58      0.58       400



In [None]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[110,  85],
       [ 84, 121]])

In [None]:
# Overall accuracy of the Word2Vec model on the held-out rows.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of Logistic Regression using Word2Vec: {100*accuracy:.2f} %")

Accuracy of Logistic Regression using Word2Vec: 57.75 %


In [None]:
# Export this notebook to HTML with nbconvert.
# NOTE(review): the file name encodes SAP 60009200040 / batch D11, while the
# notebook header lists SAP 60009200056 / batch D12 -- confirm which is right.
!jupyter nbconvert --to html "/content/60009200040_ACL_D11_Lab2.ipynb"

[NbConvertApp] Converting notebook /content/60009200040_ACL_D11_Lab2.ipynb to html
[NbConvertApp] Writing 848980 bytes to /content/60009200040_ACL_D11_Lab2.html
