In [1]:
import pandas as pd
import numpy as np

In [90]:
# Load the dataset used by the modelling cells from a CSV in the working directory.
df6 = pd.read_csv("final_cleaned_dataset.csv")

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# Features come from the 'cleaned_text' column, targets from 'sentiment_label'
# (the recorded output shows two label values, 0 and 1).
X = df6['cleaned_text']
y = df6['sentiment_label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training text only,
# then encode the held-out text with that same fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Oversample the training split with SMOTE; the test split is not resampled.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# Show the label counts before and after resampling.
print("Original class distribution:", y_train.value_counts())
print("Balanced class distribution:", pd.Series(y_train_balanced).value_counts())

Original class distribution: sentiment_label
1    3147
0     783
Name: count, dtype: int64
Balanced class distribution: sentiment_label
1    3147
0    3147
Name: count, dtype: int64


In [59]:
# Train logistic regression on the SMOTE-balanced TF-IDF features.
log_reg_model = LogisticRegression(random_state=42)
log_reg_model.fit(X_train_balanced, y_train_balanced)

# Predict on the held-out TF-IDF matrix, which was not resampled.
y_pred = log_reg_model.predict(X_test_tfidf)

# Compute every metric first, then print them in a fixed order.
logreg_test_accuracy = accuracy_score(y_test, y_pred)
logreg_test_report = classification_report(y_test, y_pred)
logreg_test_confusion = confusion_matrix(y_test, y_pred)

print("\nAccuracy:", logreg_test_accuracy)
print("\nClassification Report:")
print(logreg_test_report)
print("\nConfusion Matrix:")
print(logreg_test_confusion)


Accuracy: 0.844354018311292

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.75      0.65       193
           1       0.93      0.87      0.90       790

    accuracy                           0.84       983
   macro avg       0.76      0.81      0.78       983
weighted avg       0.86      0.84      0.85       983


Confusion Matrix:
[[145  48]
 [105 685]]


In [84]:
import joblib
# Persist the fitted classifier and the fitted vectorizer to the working directory.
# Both files are needed together: the model only accepts features produced by
# this exact vectorizer.
joblib.dump(value=log_reg_model, filename='logistic_regression_smote_model.pkl')
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')

def predict_analysis(text, vectorizer=None, model=None):
    """Classify one piece of text and describe the predicted sentiment.

    Parameters
    ----------
    text : str
        Text to classify. It is handed to the vectorizer as-is; no cleaning
        is applied inside this function.
    vectorizer : optional
        Fitted object exposing ``transform``. When omitted, the notebook-level
        ``tfidf_vectorizer`` is used.
    model : optional
        Fitted object exposing ``predict``. When omitted, the notebook-level
        ``log_reg_model`` is used.

    Returns
    -------
    str
        ``"The sentiment of the input text is: Positive"`` when the predicted
        label equals 1, otherwise the same sentence ending in ``Negative``.
    """
    # Fall back to the in-memory notebook objects only when nothing is passed in,
    # so the function also works with artifacts reloaded from disk.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if model is None:
        model = log_reg_model

    # The vectorizer expects an iterable of documents, hence the one-item list.
    text_tfidf = vectorizer.transform([text])

    # One document in, so only the first prediction is relevant.
    prediction = model.predict(text_tfidf)

    # Label 1 maps to Positive; any other label maps to Negative.
    sentiment = "Positive" if prediction[0] == 1 else "Negative"

    return f"The sentiment of the input text is: {sentiment}"



In [86]:
# Spot check on a single misspelled word; the recorded output is Negative.
# NOTE(review): presumably 'awsome' is absent from the fitted TF-IDF vocabulary,
# in which case the prediction rests on the model intercept alone — TODO confirm.
predict_analysis('awsome')

'The sentiment of the input text is: Negative'

In [87]:
# Spot check on a single-word input; the recorded output is Positive.
predict_analysis('Well')

'The sentiment of the input text is: Positive'

In [88]:
# Spot check on a negated phrase; the recorded output is Positive, the same as for 'Well'.
# NOTE(review): presumably the negation is lost because the vectorizer is built with
# its default (unigram) n-gram setting — TODO confirm; ngram_range=(1, 2) may help.
predict_analysis('Not Well')

'The sentiment of the input text is: Positive'

In [89]:
# Save the same fitted vectorizer a second time, under a file name specific to
# the logistic-regression model (it is also written to 'tfidf_vectorizer.pkl').
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizerLogR.pkl')

['tfidf_vectorizerLogR.pkl']