Import libraries

In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [104]:
nltk.download('stopwords')
stop_words =set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [105]:
#load the dataset
data = pd.read_csv('/content/labeledTrainData.tsv',sep='\t')

Data Preprocessing

In [106]:
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [107]:
# Function to clean the review text
def clean_review_alternate(review):
    # Remove HTML tags
    review_text = re.sub(r'<.*?>', ' ', review)  # Simple regex to remove HTML tags

    # Remove non-letter characters and convert to lowercase
    review_text = re.sub("[^a-zA-Z]", " ", review_text).lower()

    # Tokenize the text
    words = word_tokenize(review_text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stop_words]

    # Join the words back into one string
    return " ".join(meaningful_words)

In [108]:
# Clean all the reviews in the dataset using the alternate method
data['cleaned_review'] = data['review'].apply(clean_review_alternate)

In [109]:
# Display the first few cleaned reviews to verify
print(data[['review', 'cleaned_review']].head())

                                              review  \
0  With all this stuff going down at the moment w...   
1  \The Classic War of the Worlds\" by Timothy Hi...   
2  The film starts with a manager (Nicholas Bell)...   
3  It must be assumed that those who praised this...   
4  Superbly trashy and wondrously unpretentious 8...   

                                      cleaned_review  
0  stuff going moment mj started listening music ...  
1  classic war worlds timothy hines entertaining ...  
2  film starts manager nicholas bell giving welco...  
3  must assumed praised film greatest filmed oper...  
4  superbly trashy wondrously unpretentious explo...  


Feature Extraction using TF-IDF

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [111]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words = 'english')
#fit and transform the cleaned review
x = tfidf_vectorizer.fit_transform(data['cleaned_review']).toarray()
y = data['sentiment']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42)

Model Building and Evaluation using Logistic regression

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [113]:
#logistic regression model
model = LogisticRegression()
model.fit(x_train,y_train)

In [114]:
#model prediction
y_pred_lr = model.predict(x_test)


In [115]:
# Evaluate the model
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8808

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88      2481
           1       0.87      0.89      0.88      2519

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



Neural Network

In [116]:
from keras.models import Sequential
from keras.layers import Dense

In [117]:
#define the neural network model
model1 = Sequential()
model1.add(Dense(128,input_dim = x_train.shape[1],activation = 'relu'))
model1.add(Dense(1,activation = 'sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [118]:
#compile the model
model1.compile(optimizer = 'adam',loss='binary_crossentropy',metrics = ['accuracy'])

In [119]:
#train the model
model1.fit(x_train,y_train,epochs = 5,batch_size = 512, validation_data = (x_test,y_test))

Epoch 1/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 59ms/step - accuracy: 0.7286 - loss: 0.6540 - val_accuracy: 0.8524 - val_loss: 0.4951
Epoch 2/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step - accuracy: 0.8705 - loss: 0.4424 - val_accuracy: 0.8684 - val_loss: 0.3627
Epoch 3/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.8934 - loss: 0.3173 - val_accuracy: 0.8766 - val_loss: 0.3091
Epoch 4/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.9098 - loss: 0.2588 - val_accuracy: 0.8792 - val_loss: 0.2878
Epoch 5/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9192 - loss: 0.2310 - val_accuracy: 0.8800 - val_loss: 0.2805


<keras.src.callbacks.history.History at 0x79651334a6b0>

In [120]:
#Evaluate the model
NN_loss,NN_accuracy = model1.evaluate(x_test,y_test)
print('Neural Network Accuracy',NN_accuracy)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8789 - loss: 0.2777
Neural Network Accuracy 0.8799999952316284


Both the Logistic regression model and Neural network model performed well on the sentimental analysis.Logistic regression slightly well performed than neural network,making it a strong choice for this specific dataset