In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

dataset_path =r"P:\IBM\Restaurant_Reviews 1.tsv"
data = pd.read_csv(dataset_path, delimiter='\t', quoting=3)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r"[^a-zA-Z]", " ", text) 
    text = text.lower() 
    text = text.split() 
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]  
    return ' '.join(text)

data['Cleaned_Review'] = data['Review'].apply(clean_text)

vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2)) 
X = vectorizer.fit_transform(data['Cleaned_Review']).toarray()
y = data['Liked']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = LogisticRegression(random_state=42)

param_grid = {
    'C': [0.01, 0.1, 1, 10],  
    'penalty': ['l2'],        
    'solver': ['lbfgs']   
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

import joblib
joblib.dump(best_model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 76.00%
Confusion Matrix:
[[72 28]
 [20 80]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75       100
           1       0.74      0.80      0.77       100

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200



['tfidf_vectorizer.pkl']