#Importing libreries

In [1]:
import numpy as np
import pandas as pd

#Reading the dataset

In [2]:
data = pd.read_csv('/content/labeledTrainData.tsv', sep='\t')

In [3]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
data.shape

(25000, 3)

In [5]:
data=data[['sentiment','review']]

In [6]:
data.head(5)

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...


#Data Preprocessing

In [7]:
import string
import nltk

In [8]:
def remove_punctuation(text):
  punctuationfree=' '
  punctuationfree="".join([i for i in text if i not in string.punctuation])
  return punctuationfree

data['clean_review']=data['review'].apply(lambda x:remove_punctuation(x))

In [9]:
data.head(5)

Unnamed: 0,sentiment,review,clean_review
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...


In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#Word tokanization

In [11]:

import nltk
nltk.download('punkt')
def tokenization(text):
  tokens=nltk.word_tokenize(text)
  return tokens

data['tokenized_review']=data['clean_review'].apply(lambda x:tokenization(x))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
data.head(5)

Unnamed: 0,sentiment,review,clean_review,tokenized_review
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,"[With, all, this, stuff, going, down, at, the,..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...,"[The, Classic, War, of, the, Worlds, by, Timot..."
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...,"[The, film, starts, with, a, manager, Nicholas..."
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,"[It, must, be, assumed, that, those, who, prai..."
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,"[Superbly, trashy, and, wondrously, unpretenti..."


#Removing Stopwords

In [13]:
nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
  output= [i for i in text if i not in stopwords]
  return output

data['remove_stopwords']=data['tokenized_review'].apply(lambda x:remove_stopwords(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
data.head()

Unnamed: 0,sentiment,review,clean_review,tokenized_review,remove_stopwords
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,"[With, all, this, stuff, going, down, at, the,...","[With, stuff, going, moment, MJ, ive, started,..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...,"[The, Classic, War, of, the, Worlds, by, Timot...","[The, Classic, War, Worlds, Timothy, Hines, en..."
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...,"[The, film, starts, with, a, manager, Nicholas...","[The, film, starts, manager, Nicholas, Bell, g..."
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,"[It, must, be, assumed, that, those, who, prai...","[It, must, assumed, praised, film, greatest, f..."
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,"[Superbly, trashy, and, wondrously, unpretenti...","[Superbly, trashy, wondrously, unpretentious, ..."


#Lemmatization

In [15]:
from nltk.stem import WordNetLemmatizer

#Capitalize 'W' in WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer=WordNetLemmatizer()
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text

data['lemmatized_review']=data['remove_stopwords'].apply(lambda x:lemmatizer(x))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
data.head()

Unnamed: 0,sentiment,review,clean_review,tokenized_review,remove_stopwords,lemmatized_review
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,"[With, all, this, stuff, going, down, at, the,...","[With, stuff, going, moment, MJ, ive, started,...","[With, stuff, going, moment, MJ, ive, started,..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...,"[The, Classic, War, of, the, Worlds, by, Timot...","[The, Classic, War, Worlds, Timothy, Hines, en...","[The, Classic, War, Worlds, Timothy, Hines, en..."
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...,"[The, film, starts, with, a, manager, Nicholas...","[The, film, starts, manager, Nicholas, Bell, g...","[The, film, start, manager, Nicholas, Bell, gi..."
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,"[It, must, be, assumed, that, those, who, prai...","[It, must, assumed, praised, film, greatest, f...","[It, must, assumed, praised, film, greatest, f..."
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,"[Superbly, trashy, and, wondrously, unpretenti...","[Superbly, trashy, wondrously, unpretentious, ...","[Superbly, trashy, wondrously, unpretentious, ..."


#Text Corpus

In [17]:
corpus = [' '.join(tokens) for tokens in data['lemmatized_review']]

In [18]:
corpus[0:10]

['With stuff going moment MJ ive started listening music watching odd documentary watched The Wiz watched Moonwalker Maybe want get certain insight guy thought really cool eighty maybe make mind whether guilty innocent Moonwalker part biography part feature film remember going see cinema originally released Some subtle message MJs feeling towards press also obvious message drug bad mkaybr br Visually impressive course Michael Jackson unless remotely like MJ anyway going hate find boring Some may call MJ egotist consenting making movie BUT MJ fan would say made fan true really nice himbr br The actual feature film bit finally start 20 minute excluding Smooth Criminal sequence Joe Pesci convincing psychopathic powerful drug lord Why want MJ dead bad beyond Because MJ overheard plan Nah Joe Pescis character ranted wanted people know supplying drug etc dunno maybe hate MJs musicbr br Lots cool thing like MJ turning car robot whole Speed Demon sequence Also director must patience saint came

In [19]:
data.columns

Index(['sentiment', 'review', 'clean_review', 'tokenized_review',
       'remove_stopwords', 'lemmatized_review'],
      dtype='object')

#Vectorization

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,2))
x=cv.fit_transform(corpus)
y=data.sentiment

#Importing RandomForestClasssifier and Train_test_split

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [22]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
clf=RandomForestClassifier()
clf.fit(x_train,y_train)

#Printing Accuracy using Metrics

In [23]:
from sklearn import metrics
y_pred=clf.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_pred,y_test))

Accuracy: 0.8569333333333333


#Finding the sentiment

In [24]:
def find_sentiment(text):
  input_text = cv.transform([text])

  pred = clf.predict(input_text)[0]
  if pred == 1:
    return "Negative"
  elif pred == 0:
    return "Positive"

In [25]:
input= 'A friend mine bought film £1 even grossly overpriced Despite featuring big name Adam Sandler Billy Bob Thornton incredibly talented Burt Young'
find_sentiment(input)

'Negative'

In [26]:
input= 'A friend mine bought film £1 even grossly overpriced Despite featuring big name Adam Sandler Billy Bob Thornton'
find_sentiment(input)

'Negative'

#TFIDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
def vectorizer_tfidf(corpus):
  tfidf_model=TfidfVectorizer()
  tfidf_matrix=tfidf_model.fit_transform(corpus).todense()
  tfidf_df=pd.DataFrame(tfidf_matrix)
  tfidf_df.columns=sorted(tfidf_model.vocabulary_)
  return tfidf_df

In [29]:
df1=vectorizer_tfidf(corpus)

In [30]:
df1

Unnamed: 0,00,000,0000000000001,000001,00000110,0001,00015,001,0010,002,...,étcother,évery,êxtase,ís,ísnt,østbye,über,überannoying,überspy,üvegtigris
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#Modelling

In [31]:
tfidf=TfidfVectorizer(ngram_range=(1,2))
x=tfidf.fit_transform(corpus)
y=data.sentiment

In [None]:
clft=RandomForestClassifier(n_estimators=100)
clft.fit(x_train,y_train)

In [None]:
y_pred=clft.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_pred,y_test))

models were found to have accuracy of 0.85 and 0.8134490238611713 respectively


#LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam

In [None]:
#build the model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=maxlen)) # Reduced output dimension
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.2), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Train the model
history = model.fit(x_train, y_train, batch_size=16, epochs=3, validation_data=(x_val, y_val), verbose=1)