In [2]:
import pandas as pd 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score
import joblib

In [3]:
df = pd.read_csv('../artifacts/train.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,good movie,positive
1,movie good,positive
2,terrific movie watch watch gena davis samuel j...,positive
3,i have hundreds silent movies classics nosfera...,positive
4,good engaging cinematic firefights great prese...,positive


In [5]:
df['sentiment'].value_counts(normalize=True)

sentiment
positive    0.500334
negative    0.499666
Name: proportion, dtype: float64

In [6]:
df.shape

(8990, 2)

In [7]:
df.isna().any()

review       False
sentiment    False
dtype: bool

In [8]:
df.duplicated().sum()

0

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
# split the dataset train and test 

train, test = train_test_split(df, test_size = 0.3, stratify = df[['sentiment']], random_state = 0)

In [11]:
# count vectorizer 

def split_into_words(i):
    return [word for word in i.split(" ")]


review_bow = CountVectorizer(analyzer = split_into_words).fit(df.review)

In [12]:
# save the bow model 

joblib.dump(review_bow, '../model/preprocessedBow')

['../model/preprocessedBow']

In [13]:
preprocessedBow = joblib.load('../model/preprocessedBow')

In [14]:
# Convert to count vector 

# train
train_matrix = preprocessedBow.transform(train.review)


# test 
test_matrix = preprocessedBow.transform(test.review)

# Train the model 

In [15]:
navieModel = MultinomialNB()

In [16]:
navieModel.fit(train_matrix, train.sentiment)

# Evaluation 

In [17]:
# test data 

TestPred = navieModel.predict(test_matrix)

In [18]:
print(classification_report(test['sentiment'], TestPred))

              precision    recall  f1-score   support

    negative       0.83      0.85      0.84      1348
    positive       0.85      0.82      0.83      1349

    accuracy                           0.84      2697
   macro avg       0.84      0.84      0.84      2697
weighted avg       0.84      0.84      0.84      2697



In [19]:
# train data 

TrainPred = navieModel.predict(train_matrix)

In [20]:
print(classification_report(train['sentiment'], TrainPred))

              precision    recall  f1-score   support

    negative       0.91      0.97      0.94      3144
    positive       0.97      0.91      0.94      3149

    accuracy                           0.94      6293
   macro avg       0.94      0.94      0.94      6293
weighted avg       0.94      0.94      0.94      6293



# Grid search cv 

In [21]:
model = MultinomialNB()

In [22]:
# Adjust the alpha value 

param_grid = {
    
    'alpha': [0.1, 0.5, 1.0, 5.0]  # Adjust alpha values as needed

}
 

In [23]:
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv = 5)

grid_search.fit(train_matrix, train.sentiment)

In [24]:
TestPred = grid_search.predict(test_matrix)

print(classification_report(test['sentiment'], TestPred))

              precision    recall  f1-score   support

    negative       0.83      0.86      0.85      1348
    positive       0.85      0.83      0.84      1349

    accuracy                           0.84      2697
   macro avg       0.84      0.84      0.84      2697
weighted avg       0.84      0.84      0.84      2697



In [25]:
TrainPred = grid_search.predict(train_matrix)

print(classification_report(train['sentiment'], TrainPred))

              precision    recall  f1-score   support

    negative       0.89      0.94      0.92      3144
    positive       0.94      0.89      0.91      3149

    accuracy                           0.91      6293
   macro avg       0.91      0.91      0.91      6293
weighted avg       0.91      0.91      0.91      6293



# test data 

In [27]:
testData = pd.read_csv('../artifacts/testData.csv')

In [28]:
testData.shape

(1000, 2)

In [29]:
testX = preprocessedBow.transform(testData['review'])

In [30]:
Test_Pred = grid_search.predict(testX)

print(classification_report(testData['sentiment'], Test_Pred))

              precision    recall  f1-score   support

    negative       0.82      0.85      0.84       500
    positive       0.84      0.82      0.83       500

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000



In [41]:
testTestData = preprocessedBow.transform(['The movie was not bad at all'])

In [42]:
grid_search.predict(testTestData)

array(['negative'], dtype='<U8')

In [40]:
# Save the model 

joblib.dump(grid_search, '../model/sentiment.joblib')

['../model/sentiment.joblib']