In [7]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Data preparation
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf.fit_transform(data['cleaned']).toarray()
y = data['Sentiment']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
lr_model = LogisticRegression(random_state=123, solver='lbfgs', max_iter=5000)

# Train the model
print("Training the Logistic Regression model...")
lr_model.fit(X_train, y_train)

# Evaluate the model on the test set
print("Evaluating the model...")
y_pred = lr_model.predict(X_test)

# Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Visualize some predictions
sample_indices = np.random.choice(len(X_test), 5, replace=False)
for idx in sample_indices:
    print(f"Original Text: {data['Comments'].iloc[idx]}")
    print(f"Cleaned Text: {data['cleaned'].iloc[idx]}")
    print(f"Actual Sentiment: {y_test.iloc[idx]}")
    print(f"Predicted Sentiment: {y_pred[idx]}")
    print("-" * 50)

# Save the model and vectorizer
with open("logistic_model.pkl", "wb") as f:
    pickle.dump(lr_model, f)
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

print("Model and vectorizer saved successfully!")


KeyError: "None of [Index(['Comments', 'Sentiment'], dtype='object')] are in the [columns]"