In [1]:
# Importing basic libraries
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
# Reading the dataset
df = pd.read_csv('labeledTrainData.tsv', delimiter = '\t')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [4]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# Split the data into train and test.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df[['review','sentiment']], test_size=0.2, random_state=42, shuffle=True)

In [6]:
train.reset_index(inplace=True)
test.reset_index(inplace=True)

In [7]:
# Basic Preprocessing
import re
def remove_hashtags(text):
    cleaned_text = re.sub(r'#\w+', '', text)
    return cleaned_text

def remove_urls(text):
    url_pattern = re.compile(r'http?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
    
import string
def remove_punc(text):
    punc_free = ''.join([i for i in text if i not in string.punctuation])
    return punc_free

import nltk
def tokenization(text):
    words = nltk.word_tokenize(text)
    return words

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output

from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
def lemm(text):
    lemm_text = [wordnet_lem.lemmatize(word) for word in text]
    return lemm_text

In [8]:
def preprocess(df_col):
    corpus = []
    for item in df_col:
        new_item = remove_hashtags(item)
        new_item = remove_urls(new_item)
        new_item = remove_punc(item)
        new_item = new_item.lower()
        new_item = tokenization(new_item)
        new_item = remove_stopwords(new_item)
        new_item = lemm(new_item)
        corpus.append(' '.join(str(x) for x in new_item))
    return corpus

In [9]:
corpus = preprocess(train['review'])

USING TF-IDF

In [10]:
# Using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2))
traindata = tfidf.fit_transform(corpus)

In [11]:
# Splitting the data into X and y
X = traindata
y = train['sentiment']

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [13]:
tfidf_rf = RandomForestClassifier(max_depth = None, min_samples_leaf =1, min_samples_split = 5, n_estimators =100,n_jobs=-1)

In [14]:
tfidf_rf.fit(X,y)

In [15]:
# Preprocessing the test data.
test_corpus = preprocess(test['review'])

In [16]:
test_data = tfidf.transform(test_corpus)

In [17]:
test_label = test['sentiment']

In [18]:
predictions = tfidf_rf.predict(test_data)

In [19]:
# Prediction with test data.
rf_accuracy = accuracy_score(test_label,predictions)
print("The accuracy is for Random Forest:  ",rf_accuracy*100)

The accuracy is for Random Forest:   85.84


Using Gensim pretrained embeddings.

In [20]:
df1 = pd.read_csv('labeledTrainData.tsv', delimiter = '\t')

In [21]:
# Basic Preprocessing
import re
def remove_hashtags_mentions(text):
    cleaned_text = re.sub(r'#\w+', '', text)
    cleaned_text = re.sub(r'@[\w]*', '', cleaned_text)

    return cleaned_text

def remove_urls(text):
    url_pattern = re.compile(r'http?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
    
import string
def remove_punc(text):
    punc_free = ''.join([i for i in text if i not in string.punctuation])
    return punc_free

import nltk
def tokenization(text):
    words = nltk.word_tokenize(text)
    return words

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output

from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
def lemm(text):
    lemm_text = [wordnet_lem.lemmatize(word) for word in text]
    return lemm_text

In [22]:
def preprocess(df_col):
    corpus = []
    for item in df_col:
        new_item = remove_hashtags_mentions(item)
        new_item = remove_urls(new_item)
        new_item = remove_punc(new_item)
        new_item = new_item.lower()
        new_item = tokenization(new_item)
        new_item = remove_stopwords(new_item)
        new_item = lemm(new_item)
        corpus.append(' '.join(str(x) for x in new_item))
    return corpus

In [23]:
corpus = preprocess(df1['review'])

In [24]:
import gensim.downloader
word_embeddings = gensim.downloader.load('glove-wiki-gigaword-100')

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [26]:
# Convert text to sequences
sequences = tokenizer.texts_to_sequences(corpus)

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
# Pad sequences to a fixed length
max_sequence_length = 20
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

In [29]:
# Create an embedding matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

In [30]:
import numpy as np
embedding_matrix = np.zeros((vocab_size, embedding_dim))

In [31]:
for word, i in tokenizer.word_index.items():
    if word in word_embeddings:
        embedding_matrix[i] = word_embeddings[word]

Model using LSTM

In [32]:
from tensorflow.keras.layers import Embedding
# Build the Embedding layer using the embedding matrix
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False
)

In [33]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df1['sentiment'] , test_size=0.2, random_state=42)

In [34]:
# Converting train and test to tensors
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(128)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(128)

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

# Create a model
model_lstm = Sequential()
model_lstm.add(embedding_layer)
model_lstm.add(Bidirectional(LSTM(128, return_sequences=True)))
model_lstm.add(Dropout(0.5))
model_lstm.add(Bidirectional(LSTM(64)))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(32, activation='relu'))
model_lstm.add(Dense(1, activation='sigmoid'))

# Compile the model
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 100)           11216800  
                                                                 
 bidirectional (Bidirectiona  (None, 20, 256)          234496    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 20, 256)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 32)                4

In [36]:
# Define early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor="val_loss", patience=5)

In [37]:
# Train the model
model_lstm.fit(train_dataset, epochs=10, validation_data=test_dataset,callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


<keras.callbacks.History at 0x7f72cc5b4c40>

In [38]:
# Display the accuracy score
_, lstm_accuracy = model_lstm.evaluate(X_test, y_test)
print("Final accuracy for LSTM: ", lstm_accuracy*100)

Final accuracy for LSTM:  76.16000175476074


Model using GRU

In [39]:
from tensorflow.keras.layers import GRU

In [40]:
# Create a GRU model
model_gru = Sequential()
model_gru.add(embedding_layer)
model_gru.add(Bidirectional(GRU(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)))
model_gru.add(Bidirectional(GRU(64, dropout=0.5, recurrent_dropout=0.5)))
model_gru.add(Dense(32, activation='relu'))
model_gru.add(Dense(1, activation='sigmoid'))

# Compile the model
model_gru.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model_gru.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 100)           11216800  
                                                                 
 bidirectional_2 (Bidirectio  (None, 20, 256)          176640    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              123648    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 11,521,249
Trainable params: 304,449
Non

In [41]:
early_stopping = EarlyStopping(monitor="val_loss", patience=5)

In [42]:
model_gru.fit(train_dataset, epochs=10, validation_data=test_dataset,callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f72d06896c0>

In [43]:
_, gru_accuracy = model_gru.evaluate(X_test, y_test)
print("Final accuracy for GRU: ", gru_accuracy*100)

Final accuracy for GRU:  77.99999713897705


Evaluate the model

In [44]:
def predict_sentiment(sentence,model):
    sentence = preprocess(sentence)
    sentence = tokenizer.texts_to_sequences([sentence])
    sentence = pad_sequences(sentence, maxlen=max_sequence_length, padding='post')
    prob = model.predict(sentence)[0][0]
    threshold = 0.5
    sentiment = 'Good' if prob >= threshold else 'Bad'

    print(f'Probability: {prob:.3f}')
    print(f'Sentiment: {sentiment}')

In [49]:
review = "The movie was great."
model = model_lstm
predict_sentiment(review,model)

Probability: 0.334
Sentiment: Bad


In [46]:
review = "I did not like the movie very much."
model = model_gru
predict_sentiment(review,model)

Probability: 0.477
Sentiment: Bad


Display the accuracy of various models

In [47]:
accuracy_data = {
    'model_name': ['Random Forest', 'LSTM', 'GRU'],
    'accuracy': [rf_accuracy*100, lstm_accuracy*100, gru_accuracy*100]
}
accuracy = pd.DataFrame(accuracy_data)
accuracy

Unnamed: 0,model_name,accuracy
0,Random Forest,85.84
1,LSTM,76.160002
2,GRU,77.999997
