In [None]:
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Load the preprocessed dataset; the rest of the script reads its 'Text' and
# 'Sentiment' columns.
df = pd.read_csv('data/preprocessed_data.csv')

# Show how many rows each sentiment label has before any adjustment.
print("Class distribution in the entire dataset:")
print(df['Sentiment'].value_counts())

# A classifier needs at least two classes. When fewer are present, pad the data
# with a few synthetic rows of a label that is actually missing. Always adding
# 'negative' would leave a negative-only dataset with a single class.
synthetic_examples = (
    ('negative', 'This is a bad product.'),
    ('positive', 'This is a good product.'),
)
# NaN labels are ignored, matching what value_counts() reports above.
present_labels = set(df['Sentiment'].dropna().unique())
if len(present_labels) < 2:
    print("The dataset contains only one class. Adding a few examples of a second class for testing purposes.")
    padding = []
    for label, text in synthetic_examples:
        if len(present_labels) >= 2:
            # Two classes are enough; do not add more than necessary.
            break
        if label in present_labels:
            continue
        padding.extend([{'Text': text, 'Sentiment': label}] * 10)
        present_labels.add(label)
    new_rows = pd.DataFrame(padding)
    df = pd.concat([df, new_rows], ignore_index=True)
    print("Updated class distribution:")
    print(df['Sentiment'].value_counts())

# Drop rows that lack either the text or its label; both are required below.
df.dropna(subset=['Text', 'Sentiment'], inplace=True)

y = df['Sentiment']

# Split the raw texts first (stratified, so both splits keep the class ratio),
# then fit TF-IDF on the training texts only. Fitting the vectorizer on the
# whole corpus would leak test-set vocabulary and IDF statistics into training
# and make the evaluation below optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['Text'], y, test_size=0.2, random_state=42, stratify=y
)

# Feature extraction: at most 5000 TF-IDF terms, learned from the training split.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train).toarray()
X_test = tfidf.transform(text_test).toarray()

# Full-corpus feature matrix under the train-fitted vocabulary, kept so that
# any later code that still refers to `X` continues to work.
X = tfidf.transform(df['Text']).toarray()

# Ensure that the training set has at least two classes before fitting.
if pd.Series(y_train).nunique() < 2:
    raise ValueError("The training set contains only one class after the split. Ensure you have a balanced dataset.")

# Upsample the minority class in the training set.
#
# The rows of X_train are positional, whereas y_train still carries the
# shuffled index of the original DataFrame. Concatenating the two along
# axis=1 aligns on index labels, which pairs features with the wrong labels
# and introduces NaN rows. Attach the labels by position instead.
train_data = pd.DataFrame(X_train)
train_data['Sentiment'] = pd.Series(y_train).to_numpy()

# Separate majority and minority classes (labels are assumed to be the
# strings 'positive' and 'negative').
df_majority = train_data[train_data['Sentiment'] == 'positive']
df_minority = train_data[train_data['Sentiment'] == 'negative']

if df_minority.empty:
    print("The minority class is empty. Cannot perform upsampling.")
elif df_majority.empty:
    # With no majority rows the resample target size would be 0 and the model
    # would be fitted on an empty set; report it like the case above.
    print("The majority class is empty. Cannot perform upsampling.")
else:
    # Draw minority rows with replacement until both classes have the same
    # number of training rows; the fixed seed keeps the draw reproducible.
    df_minority_upsampled = resample(
        df_minority,
        replace=True,
        n_samples=len(df_majority),
        random_state=123,
    )

    # Combine majority class with upsampled minority class.
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    # Separate features and labels again for scikit-learn.
    X_train_upsampled = df_upsampled.drop(columns='Sentiment').values
    y_train_upsampled = df_upsampled['Sentiment'].values

    # Model training on the balanced training set.
    model = LogisticRegression()
    model.fit(X_train_upsampled, y_train_upsampled)

    # Model evaluation on the untouched (not upsampled) test split.
    y_pred = model.predict(X_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))

    # Save model; create the target directory first so the dump does not fail
    # on a fresh checkout.
    # NOTE(review): only the classifier is persisted here. Predicting on new
    # text also needs the fitted `tfidf` vectorizer - confirm it is saved
    # elsewhere, or persist it alongside the model.
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/sentiment_model.pkl')