In [1]:
import numpy as np
import pandas as pd 
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv('./call_of_duty_reviews_50000.csv')
df.head()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df['content'] = df['content'].apply(preprocess_text)

In [4]:
def assign_sentiment_label(score):
    if score >= 4:
        return 'positive'
    elif score == 3:
        return 'neutral'
    else:
        return 'negative'

# Apply the sentiment labeling
df['sentiment'] = df['score'].apply(assign_sentiment_label)
df.head()

Unnamed: 0,userName,content,score,sentiment
0,User101,best mobile game ever,5,positive
1,User102,really good game im going rate ad thank god v ...,5,positive
2,User103,call duty mobile interesting game gaming mostl...,5,positive
3,User104,love game entirety thing dont like censorship ...,3,neutral
4,User105,unplayable size portable update huge still mak...,1,negative


In [29]:
df.shape

(50000, 4)

In [31]:
df.isnull().sum()

userName     0
content      0
score        0
sentiment    0
dtype: int64

In [5]:
# (X) and labels (y)
X = df['content']
y = df['sentiment']

# 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
X_train

38094    call duty mobile season best game world want s...
40624    keep logging account every update short circui...
49425                                           game liked
35734                                      worst game ever
41708    play people pay weapon get super nerfed people...
                               ...                        
11284                                             love cod
44732    little christmas money went buy cod point game...
38158    game outstanding nowadays face problem auto cl...
860                                 best mobile game world
15795    download got started show happens got download...
Name: content, Length: 35000, dtype: object

In [7]:
 X_test

33553    game used best ive always loyal codm sometimes...
9427                    best mobile game ever cp expensive
199      gone hill fast try joining game hasnt started ...
12447                                     competitive game
39489    thr best game ever given star cuz sometimes tg...
                               ...                        
15168                                          outstanding
49241    anyone help reset login detail since last upda...
39317    please solve network errror problem even speed...
42191                                                     
15109    call duty mobile interesting game gaming mostl...
Name: content, Length: 15000, dtype: object

In [8]:
y_train

38094    positive
40624     neutral
49425    positive
35734    negative
41708    negative
           ...   
11284    positive
44732    positive
38158    positive
860      positive
15795    negative
Name: sentiment, Length: 35000, dtype: object

In [9]:
 y_test

33553    negative
9427     positive
199      negative
12447    positive
39489     neutral
           ...   
15168    positive
49241    positive
39317    negative
42191    positive
15109    positive
Name: sentiment, Length: 15000, dtype: object

In [10]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_train_tfidf, X_test_tfidf

(<35000x561 sparse matrix of type '<class 'numpy.float64'>'
 	with 343552 stored elements in Compressed Sparse Row format>,
 <15000x561 sparse matrix of type '<class 'numpy.float64'>'
 	with 149280 stored elements in Compressed Sparse Row format>)

In [11]:
# Initialize Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))
print(f"Naive Bayes Accuracy: {accuracy_nb * 100:.2f}%")

Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       1.00      0.98      0.99      3830
     neutral       1.00      1.00      1.00      1331
    positive       0.99      1.00      1.00      9839

    accuracy                           0.99     15000
   macro avg       1.00      0.99      1.00     15000
weighted avg       0.99      0.99      0.99     15000

Naive Bayes Accuracy: 99.47%


In [12]:
# Initialize Logistic Regression classifier
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))
print(f"Logistic Regression Accuracy: {accuracy_lr * 100:.2f}%")

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      3830
     neutral       1.00      1.00      1.00      1331
    positive       1.00      1.00      1.00      9839

    accuracy                           1.00     15000
   macro avg       1.00      1.00      1.00     15000
weighted avg       1.00      1.00      1.00     15000

Logistic Regression Accuracy: 100.00%


In [13]:
with open('naive_bayes_model.pkl', 'wb') as f:
    pickle.dump(nb_model, f)

with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("Models and vectorizer saved successfully!")

Models and vectorizer saved successfully!
