In [26]:
# quick_train.py - Standalone training script to create models for API
# Run this first to generate the ./best_model/ directory

import pandas as pd
import numpy as np
import re
import warnings
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb


In [27]:

# Silence every library warning and pin NumPy's global RNG state.
warnings.filterwarnings('ignore')
np.random.seed(42)

print(" Quick Model Training for API Deployment", "=" * 50, sep="\n")

# Read the raw dataset from the working directory
print(" Loading dataset...")
df = pd.read_csv('reply_classification_dataset.csv')

# Expose the message column under the name 'text'. rename() ignores mapping
# keys that are absent, so a file without a 'reply' column keeps its columns.
df = df.rename(columns={'reply': 'text'})

print(f"Dataset shape: {df.shape}")

# Label standardization
def standardize_labels(label):
    """Map a raw label value onto its canonical lowercase class name.

    Matching is case-insensitive and ignores surrounding whitespace. The short
    forms ``pos``, ``neg`` and ``neu`` expand to ``positive``, ``negative`` and
    ``neutral``; every other value is returned lowercased and stripped.

    Args:
        label: Raw label value; it is converted with ``str()`` before matching.

    Returns:
        The canonical label string, or ``None`` when ``label`` is missing
        according to ``pd.isna``.
    """
    if pd.isna(label):
        return None

    normalized = str(label).lower().strip()
    # Full names are absent from the table on purpose: they fall through to
    # the default and come back unchanged.
    short_forms = {'pos': 'positive', 'neg': 'negative', 'neu': 'neutral'}
    return short_forms.get(normalized, normalized)


 Quick Model Training for API Deployment
 Loading dataset...
Dataset shape: (2129, 2)


In [28]:

# Text cleaning
def clean_text(text):
    """Normalize a raw message into lowercase, whitespace-normalized text.

    Steps, in order: lowercase; remove URLs (tokens starting with ``http`` or
    ``www.``); remove e-mail-like tokens (non-space characters around an
    ``@``); remove every character that is not a word character, whitespace,
    or one of ``! ? . ,``; collapse whitespace runs to single spaces and trim.

    Args:
        text: Raw value; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing according to
        ``pd.isna``.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # The dot is escaped so only a literal "www." prefix is treated as a URL;
    # unescaped it would also delete ordinary words that begin with "www".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^\w\s!?.,]', '', text)
    # Whitespace is normalized last: dropping symbols above can leave doubled,
    # leading or trailing spaces, and a value made only of removed symbols
    # must come back as "" rather than as a lone space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("🔧 Preprocessing data...")

# Canonicalize the labels and discard rows whose label is missing
df = df.assign(label=df['label'].apply(standardize_labels)).dropna(subset=['label'])

# Add the cleaned text column, then keep only rows that still have content
df = df.assign(cleaned_text=df['text'].apply(clean_text)).dropna(subset=['cleaned_text'])
has_content = df['cleaned_text'].str.len() > 0
df = df.loc[has_content]

counts = df['label'].value_counts()

print(f"Cleaned dataset shape: {df.shape}")
print(f"Label distribution:\n{counts}")

# Keep only classes that occur at least twice
valid_classes = [name for name, n_rows in counts.items() if n_rows >= 2]
df_filtered = df.loc[df['label'].isin(valid_classes)].copy()

print(f"Valid classes: {valid_classes}")


🔧 Preprocessing data...
Cleaned dataset shape: (2129, 3)
Label distribution:
label
positive    710
negative    710
neutral     709
Name: count, dtype: int64
Valid classes: ['positive', 'negative', 'neutral']


In [29]:

# Encode the string labels as integer class ids
label_encoder = LabelEncoder().fit(df_filtered['label'])
df_filtered['label_encoded'] = label_encoder.transform(df_filtered['label'])

# Features are the cleaned text; targets are the encoded labels
X, y = df_filtered['cleaned_text'], df_filtered['label_encoded']

# 80/20 split, stratified on the target, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {len(X_train)}, Test set: {len(X_test)}")

# Vectorize the text with TF-IDF over unigrams and bigrams
print("\n Training TF-IDF + Logistic Regression...")
tfidf_options = {
    'max_features': 5000,
    'stop_words': 'english',
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_df': 0.95,
}
tfidf = TfidfVectorizer(**tfidf_options)

# The vocabulary is fitted on the training split only; the test split is
# transformed with that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit the logistic regression classifier and score it on the test split
lr_model = LogisticRegression(random_state=42, max_iter=1000, C=1.0)
lr_model.fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)

lr_accuracy = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred, average='weighted')

print(f"Logistic Regression - Accuracy: {lr_accuracy:.4f}, F1: {lr_f1:.4f}")

# Fit a LightGBM classifier on the same TF-IDF features
print(" Training LightGBM...")
lgb_options = dict(
    random_state=42,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    num_leaves=31,
    verbose=-1,
)
lgb_model = lgb.LGBMClassifier(**lgb_options)
lgb_model.fit(X_train_tfidf, y_train)

# Score it on the test split with the same two metrics
lgb_pred = lgb_model.predict(X_test_tfidf)
lgb_accuracy = accuracy_score(y_test, lgb_pred)
lgb_f1 = f1_score(y_test, lgb_pred, average='weighted')

print(f"LightGBM - Accuracy: {lgb_accuracy:.4f}, F1: {lgb_f1:.4f}")

# Collect both candidates with their test-split metrics
models = {
    'Logistic Regression': {
        'accuracy': lr_accuracy,
        'f1': lr_f1,
        'model': lr_model,
    },
    'LightGBM': {
        'accuracy': lgb_accuracy,
        'f1': lgb_f1,
        'model': lgb_model,
    },
}

# The winner is the candidate with the highest weighted F1; on an exact tie
# max() keeps the first entry in insertion order.
best_model_name = max(models, key=lambda candidate: models[candidate]['f1'])
best_f1 = models[best_model_name]['f1']

print(f"\n Best Model: {best_model_name} (F1: {best_f1:.4f})")

# Persist the fitted artifacts under ./best_model/
print(" Saving model files...")
os.makedirs('./best_model', exist_ok=True)

# The encoder and vectorizer are written whichever classifier won
joblib.dump(label_encoder, './best_model/label_encoder.pkl')
joblib.dump(tfidf, './best_model/tfidf_vectorizer.pkl')

if best_model_name == 'LightGBM':
    joblib.dump(lgb_model, './best_model/lgb_model.pkl')
    stale_model_path = './best_model/lr_model.pkl'
    print(" LightGBM model saved!")
else:
    joblib.dump(lr_model, './best_model/lr_model.pkl')
    stale_model_path = './best_model/lgb_model.pkl'
    print(" Logistic Regression model saved!")

# A pickle of the other classifier left behind by an earlier run was fitted
# against a different vectorizer/encoder than the ones just written. Remove it
# so the directory only ever holds one matching set of artifacts.
if os.path.exists(stale_model_path):
    os.remove(stale_model_path)
    print(f" Removed stale model file: {stale_model_path}")

print("\n Model files saved in './best_model/' directory:")
# os.listdir() returns entries in arbitrary order; sort for a stable listing
for file in sorted(os.listdir('./best_model')):
    print(f"   - {file}")

print("\nTraining completed! You can now run your FastAPI server.")
print("Model Summary:")
print(f"   - Best Model: {best_model_name}")
print(f"   - F1 Score: {best_f1:.4f}")
print(f"   - Classes: {list(label_encoder.classes_)}")
print(f"   - Total samples: {len(df_filtered)}")


Training set: 1703, Test set: 426

 Training TF-IDF + Logistic Regression...
Logistic Regression - Accuracy: 0.9930, F1: 0.9930
 Training LightGBM...
LightGBM - Accuracy: 0.9930, F1: 0.9929

 Best Model: Logistic Regression (F1: 0.9930)
 Saving model files...
 Logistic Regression model saved!

 Model files saved in './best_model/' directory:
   - label_encoder.pkl
   - lr_model.pkl
   - tfidf_vectorizer.pkl

Training completed! You can now run your FastAPI server.
Model Summary:
   - Best Model: Logistic Regression
   - F1 Score: 0.9930
   - Classes: ['negative', 'neutral', 'positive']
   - Total samples: 2129


In [30]:

# Smoke test: push one sample sentence through the in-memory pipeline
print(f"\n Testing prediction...")
test_text = "This product is amazing, I love it!"

# Apply the same cleaning and vectorization used for the training data
cleaned_test = clean_text(test_text)
text_tfidf = tfidf.transform([cleaned_test])

# Predict with whichever classifier was selected as best
chosen_model = lgb_model if best_model_name == 'LightGBM' else lr_model
pred = chosen_model.predict(text_tfidf)[0]

# Translate the integer class id back into its label string
predicted_label = label_encoder.inverse_transform([pred])[0]
print(f"Test text: '{test_text}'")
print(f"Predicted: {predicted_label}")



 Testing prediction...
Test text: 'This product is amazing, I love it!'
Predicted: positive
