In [None]:
# 1. Libraries
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

# 2. Load Dataset
# Read the raw tweets, keep only the tweet text and its sentiment annotation,
# and expose the annotation under the shorter column name 'label'.
df = (
    pd.read_csv('../data/Tweets.csv')[['text', 'airline_sentiment']]
    .rename(columns={'airline_sentiment': 'label'})
)

# 3. Preprocessing
# Compiled once at import time instead of being looked up on every call.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_NON_LETTER_PATTERN = re.compile(r"[^a-z\s]")


def clean_text(text):
    """Normalize a raw tweet into lowercase letters and whitespace.

    Steps, in order: lowercase the text, delete URL-like tokens (anything
    starting with ``http`` or ``www`` up to the next whitespace), then delete
    every character that is not ``a``-``z`` or whitespace. Removed characters
    are dropped, not replaced by a space, so surrounding whitespace is left
    exactly as it was (runs of spaces are not collapsed).

    Args:
        text: The raw tweet. Non-string values (for example ``None`` or the
            float NaN that pandas uses for a missing cell) are treated as
            empty text.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them would
    # raise AttributeError and abort the whole column-wise apply.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _URL_PATTERN.sub("", text)
    return _NON_LETTER_PATTERN.sub("", text)

# Store the cleaned version of every tweet alongside the raw text.
df['clean_text'] = df['text'].map(clean_text)

# 4. Visualizations
# Render one word cloud per sentiment class and write it to ../outputs.
# Only the 'positive' and 'negative' classes are rendered here.
# The output directory is created up front: saving the PNG fails if the
# folder does not exist (e.g. on a fresh checkout).
os.makedirs('../outputs', exist_ok=True)
for sentiment in ['positive', 'negative']:
    # Concatenate all cleaned tweets of this class into one document.
    text = " ".join(df.loc[df['label'] == sentiment, 'clean_text'])
    wordcloud = WordCloud(width=600, height=400, background_color='white').generate(text)
    wordcloud.to_file(f'../outputs/wordcloud_{sentiment}.png')

# 5. Vectorize and Split
# Split the cleaned text BEFORE vectorizing. Fitting TF-IDF on the full corpus
# would let the test tweets influence the vocabulary and IDF weights (data
# leakage), which makes the reported test metrics optimistic.
# The split is stratified on the label with a fixed seed, so the same rows
# land in each partition as when the feature matrix itself was split.
y = df['label']
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, stratify=y, test_size=0.2, random_state=42
)

# Learn vocabulary / IDF from the training tweets only, then reuse that fitted
# vectorizer to encode the held-out tweets.
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix, encoded with the train-fitted vectorizer; kept so the
# name `X` remains available to any code that runs after this section.
X = tfidf.transform(df['clean_text'])

# 6. Train Model
# Build the logistic-regression classifier (solver capped at 200 iterations)
# and fit it on the training split in one step; fit() returns the estimator.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# 7. Evaluate
# Predict labels for the held-out split and print the per-class
# precision / recall / F1 summary.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# 8. Save Model
# Persist the trained artifacts under ../models (created if missing, since
# open(..., 'wb') fails when the folder does not exist).
os.makedirs('../models', exist_ok=True)

# Classifier: same path and content as before.
with open('../models/sentiment_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# The classifier only accepts vectors produced by this exact fitted TF-IDF
# vectorizer, so it is saved as well; without it the pickled model cannot be
# applied to new raw text.
with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)


              precision    recall  f1-score   support

    negative       0.80      0.94      0.86      1835
     neutral       0.63      0.47      0.54       620
    positive       0.82      0.56      0.67       473

    accuracy                           0.78      2928
   macro avg       0.75      0.66      0.69      2928
weighted avg       0.77      0.78      0.76      2928

