In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [10]:
# Load the dataset in TSV format
data = pd.read_csv("/content/labeledTrainData.tsv", sep="\t")

In [11]:
data


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
17855,11083_3,0,This film was on late at night when I saw it. ...
17856,8436_4,0,David Mamet is a very interesting and a very u...
17857,5182_4,0,Like most of the festivals entries Hamiltons m...
17858,1459_1,0,This is by far one of the most boring and horr...


In [12]:
data.shape


(17860, 3)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17860 entries, 0 to 17859
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         17860 non-null  object
 1   sentiment  17860 non-null  int64 
 2   review     17860 non-null  object
dtypes: int64(1), object(2)
memory usage: 418.7+ KB


In [14]:
data.describe()

Unnamed: 0,sentiment
count,17860.0
mean,0.499384
std,0.500014
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [15]:
data.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [16]:
# Remove missing values if any
data.dropna(inplace=True)

In [20]:
# Convert text to lowercase
data['review'] = data['review'].str.lower()
data['review']

0        stuff going moment mj ive started listening mu...
1        classic war worlds timothy hines entertaining ...
2        film starts manager nicholas bell giving welco...
3        must assumed praised film greatest filmed oper...
4        superbly trashy wondrously unpretentious explo...
                               ...                        
17855    film late night saw interesting start didnt co...
17856    david mamet interesting unequal director first...
17857    like festivals entries hamiltons makes interes...
17858    far one boring horribly acted accounts early d...
17859    excited seeing film anticipating visual excurs...
Name: review, Length: 17860, dtype: object

In [21]:
# Remove special characters and punctuation
data['review'] = data['review'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
data['review']

0        stuff going moment mj ive started listening mu...
1        classic war worlds timothy hines entertaining ...
2        film starts manager nicholas bell giving welco...
3        must assumed praised film greatest filmed oper...
4        superbly trashy wondrously unpretentious explo...
                               ...                        
17855    film late night saw interesting start didnt co...
17856    david mamet interesting unequal director first...
17857    like festivals entries hamiltons makes interes...
17858    far one boring horribly acted accounts early d...
17859    excited seeing film anticipating visual excurs...
Name: review, Length: 17860, dtype: object

In [22]:
# Tokenization and removing stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
data['review']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        stuff going moment mj ive started listening mu...
1        classic war worlds timothy hines entertaining ...
2        film starts manager nicholas bell giving welco...
3        must assumed praised film greatest filmed oper...
4        superbly trashy wondrously unpretentious explo...
                               ...                        
17855    film late night saw interesting start didnt co...
17856    david mamet interesting unequal director first...
17857    like festivals entries hamiltons makes interes...
17858    far one boring horribly acted accounts early d...
17859    excited seeing film anticipating visual excurs...
Name: review, Length: 17860, dtype: object

In [23]:
# Stemming
stemmer = PorterStemmer()
data['review'] = data['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
data['review']

0        stuff go moment mj ive start listen music watc...
1        classic war world timothi hine entertain film ...
2        film start manag nichola bell give welcom inve...
3        must assum prais film greatest film opera ever...
4        superbl trashi wondrous unpretenti exploit hoo...
                               ...                        
17855    film late night saw interest start didnt convi...
17856    david mamet interest unequ director first movi...
17857    like festiv entri hamilton make interest watch...
17858    far one bore horribl act account earli day ado...
17859    excit see film anticip visual excursu relat ar...
Name: review, Length: 17860, dtype: object

In [24]:
# Split data into features (X) and labels (y)
X = data['review']
y = data['sentiment']

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Initialize the CountVectorizer
vectorizer = CountVectorizer(max_features=5000)  # You can adjust the max_features as needed

# Fit and transform the preprocessed text data
X_bow = vectorizer.fit_transform(X)

# Convert the sparse matrix to a dense numpy array
X_bow = X_bow.toarray()

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)  # run for 1000 iterations
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)





Accuracy: 0.8460246360582306
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.84      1745
           1       0.85      0.85      0.85      1827

    accuracy                           0.85      3572
   macro avg       0.85      0.85      0.85      3572
weighted avg       0.85      0.85      0.85      3572



In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features as needed

# Fit and transform the preprocessed text data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)  # run for 1000 iterations
model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8762597984322509
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.87      1745
           1       0.88      0.88      0.88      1827

    accuracy                           0.88      3572
   macro avg       0.88      0.88      0.88      3572
weighted avg       0.88      0.88      0.88      3572



In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Initialize a list to store the models and their names
models = [
    ("Naive Bayes", MultinomialNB()),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("SVM", SVC(kernel='linear', C=1)),
    ("XGBoost", XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100))
]

# Loop through each model and evaluate its performance
for model_name, model in models:
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Accuracy: {accuracy:.4f}")






Naive Bayes Accuracy: 0.8418
Random Forest Accuracy: 0.8306
SVM Accuracy: 0.8726
XGBoost Accuracy: 0.7993
