In [None]:
# pip install --upgrade pandas scipy pyarrow numexpr bottleneck nltk

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

  from pandas.core import (


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


In [3]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load dataset
file_path = "final_data_preprocessed.csv"
df = pd.read_csv(file_path)

In [5]:
 df['Sentiment'].value_counts()

Sentiment
 1.0    89844
 0.0    87124
-1.0    79651
Name: count, dtype: int64

In [6]:
# Ensure there are no missing values
df.dropna(subset=['Sentence', 'Sentiment'], inplace=True)


In [7]:
regex_pattern = re.compile(r'[^a-zA-Z\s]')

# Preprocess function
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = regex_pattern.sub('', text)  # Remove special characters
    words = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

In [8]:
# from joblib import Parallel, delayed

# Parallelize preprocessing
# n_jobs = -1 -> Tells to joblib to use all the available CPU cores for parallel processing
# df['cleaned_text'] = Parallel(n_jobs=-1)(delayed(preprocess_text)(text) for text in df['Sentence'])
df['cleaned_text'] = df['Sentence'].apply(preprocess_text)

In [9]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(df['Sentence'], df['Sentiment'], test_size=0.2, random_state=42, stratify=df['Sentiment'])

Replace **TfidfVectorizer** with more efficient alternatives like **HashingVectorizer** or **CountVectorizer** if memory usage is a concern. **HashingVectorizer is particularly useful for large datasets because it doesn't store the vocabulary in memory.**

In [10]:
# Build a pipeline
logistic_model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression(max_iter=500))  # Increased max_iter to 500
])


In [11]:
# Train the model
logistic_model.fit(X_train, y_train)


In [12]:
# Predictions
y_pred = logistic_model.predict(X_test)

In [13]:
# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8953121346738369
Classification Report:
               precision    recall  f1-score   support

        -1.0       0.91      0.85      0.88     15930
         0.0       0.88      0.92      0.90     17425
         1.0       0.90      0.92      0.91     17969

    accuracy                           0.90     51324
   macro avg       0.90      0.89      0.89     51324
weighted avg       0.90      0.90      0.90     51324



In [18]:
import pickle

with open("model.pkl", "wb") as file:
    pickle.dump(logistic_model, file)

In [14]:
# Test on new text
new_text = ["The product was excellent and exceeded my expectations.", "I had a terrible experience with the service."]
new_text_cleaned = [preprocess_text(text) for text in new_text]
predictions_logistic = logistic_model.predict(new_text_cleaned)
print("Predictions:", predictions_logistic )


Predictions: [ 1. -1.]


In [15]:

# Test on positive new text
new_text = ["The product was excellent and exceeded my expectations.","I absolutely loved the product! It was amazing.",
"The service was excellent and very fast.","I had a wonderful experience, and I would highly recommend it."]
new_text_cleaned = [preprocess_text(text) for text in new_text]
predictions_logistic_p = logistic_model.predict(new_text_cleaned)
print("Predictions:", predictions_logistic_p )

Predictions: [1. 1. 1. 1.]


In [16]:

# Test on negavite new text
new_text = ["The worst experience I have ever had. Totally disappointed.","I will never buy from this brand again. Terrible quality.",
"The customer service was rude and unhelpful.", "I had a terrible experience with the service."]
new_text_cleaned = [preprocess_text(text) for text in new_text]
predictions_logistic_n = logistic_model.predict(new_text_cleaned)
print("Predictions:", predictions_logistic_n)


Predictions: [-1. -1.  0. -1.]


In [17]:
# Test on new text
new_text = ["The product is not good as expected","It was an average experience, not too bad, not too great.",
"The delivery was on time, but the packaging was ordinary."]
new_text_cleaned = [preprocess_text(text) for text in new_text]
predictions_logistic_N = logistic_model.predict(new_text_cleaned)
print("Predictions:", predictions_logistic_N)

Predictions: [ 1. -1. -1.]
