In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Visualize sentiment distribution: one horizontal bar per sentiment class,
# with bar length equal to the number of rows carrying that label.
sns.set(font_scale=1.4)
sentiment_counts = data['Sentiment'].value_counts()
ax = sentiment_counts.plot(kind='barh', figsize=(9, 3), color='skyblue')
ax.set_xlabel("Number of Comments")
ax.set_ylabel("Sentiment Class")
ax.set_title("Dataset Distribution")
plt.show()

# Feature extraction using TF-IDF.
#
# The raw text is split into train/test BEFORE the vectorizer is fitted, so the
# vocabulary and IDF weights are learned from the training rows only. Fitting
# on the full corpus would leak test-set statistics into the features and make
# the evaluation below look better than it really is.
y = data['Sentiment']

# Split data into train and test sets (80/20); random_state pins the shuffle
# so the split is reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned'], y, test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# The vectorizer returns SciPy sparse matrices. They are deliberately left
# sparse: a dense (n_samples x 5000) array is mostly zeros and can exhaust
# memory, while scikit-learn's linear models accept sparse input directly.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Feature matrix for the whole corpus, built with the train-fitted vectorizer.
# Kept under its original name in case code outside this section uses `X`.
X = tfidf.transform(data['cleaned'])

# Fit a logistic-regression classifier (scikit-learn defaults) on the training
# split; `fit` returns the estimator itself, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out split: overall accuracy plus the per-class report.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# Persist the fitted vectorizer and the trained model so they can be reloaded
# together later. Both are written with pickle into the current working
# directory; pickle files should only ever be loaded from a trusted source.
import pickle

for path, artifact in (
    ("tfidf_vectorizer.pkl", tfidf),
    ("logistic_model.pkl", model),
):
    with open(path, "wb") as f:
        pickle.dump(artifact, f)
