In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🤖 Sentiment Analysis - Model Training\n",
    "\n",
    "This notebook trains and evaluates multiple ML models for sentiment classification.\n",
    "\n",
    "## Models:\n",
    "- Naive Bayes\n",
    "- Logistic Regression\n",
    "- Support Vector Machine (SVM)\n",
    "- Random Forest\n",
    "- LSTM (Deep Learning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup: extend the import path with the parent directory, then load the\n",
    "# analysis stack used by the rest of the notebook.\n",
    "import sys\n",
    "sys.path.append('..')\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "from sklearn.metrics import (classification_report, confusion_matrix, \n",
    "                            accuracy_score, precision_recall_fscore_support)\n",
    "import warnings\n",
    "\n",
    "# NOTE(review): this suppresses every warning for the rest of the session;\n",
    "# narrow the filter if a specific warning needs to stay visible.\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print(\"✅ Libraries imported\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load and Prepare Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.data_loader import DataLoader\n",
    "from src.preprocess import TextPreprocessor, download_nltk_data\n",
    "from src.features import SimpleFeatureExtractor\n",
    "\n",
    "# Download NLTK data via the project helper before any preprocessing happens\n",
    "download_nltk_data()\n",
    "\n",
    "# Load the Twitter dataset through the project's DataLoader\n",
    "loader = DataLoader()\n",
    "df = loader.load_twitter_data()\n",
    "\n",
    "# Report the row count; the trailing expression previews the first rows\n",
    "print(f\"Loaded {len(df)} samples\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess the raw `text` column; the feature step below reads the\n",
    "# `processed_text` column of the returned frame.\n",
    "preprocessor = TextPreprocessor(remove_stopwords=True, lemmatize=True)\n",
    "df = preprocessor.preprocess_dataframe(df, text_column='text')\n",
    "\n",
    "# Extract TF-IDF features (capped at 5000) and take the sentiment labels\n",
    "extractor = SimpleFeatureExtractor(method='tfidf', max_features=5000)\n",
    "X = extractor.fit_transform(df['processed_text'])\n",
    "y = df['sentiment'].values\n",
    "\n",
    "print(f\"Feature matrix shape: {X.shape}\")\n",
    "\n",
    "# Split data: 80/20, seeded, stratified on the label so both splits keep\n",
    "# the same class proportions\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42, stratify=y\n",
    ")\n",
    "\n",
    "print(f\"Train: {X_train.shape[0]}, Test: {X_test.shape[0]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Train Traditional ML Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.models import TraditionalModels\n",
    "\n",
    "# The same train/test split is handed to every trainer; each call is\n",
    "# unpacked into a (model, metrics) pair.\n",
    "models = TraditionalModels()\n",
    "split_data = (X_train, y_train, X_test, y_test)\n",
    "\n",
    "print(\"Training Naive Bayes...\")\n",
    "nb_model, nb_metrics = models.train_naive_bayes(*split_data)\n",
    "\n",
    "print(\"\\nTraining Logistic Regression...\")\n",
    "lr_model, lr_metrics = models.train_logistic_regression(*split_data)\n",
    "\n",
    "print(\"\\nTraining SVM...\")\n",
    "svm_model, svm_metrics = models.train_svm(*split_data)\n",
    "\n",
    "print(\"\\nTraining Random Forest...\")\n",
    "rf_model, rf_metrics = models.train_random_forest(*split_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Model Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare all trained models using the trainer's own report\n",
    "models.compare_all()\n",
    "\n",
    "# Get best model; both the name and the fitted estimator are reused by the\n",
    "# evaluation, cross-validation and saving cells below\n",
    "best_name, best_model = models.get_best_model()\n",
    "print(f\"\\n🏆 Best Model: {best_name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Visualize comparison: one bar group per metric, one bar per model.\n",
    "metric_keys = ['accuracy', 'precision', 'recall', 'f1']\n",
    "model_metrics = {\n",
    "    'Naive Bayes': nb_metrics,\n",
    "    'Logistic Regression': lr_metrics,\n",
    "    'SVM': svm_metrics,\n",
    "    'Random Forest': rf_metrics,\n",
    "}\n",
    "results_df = pd.DataFrame(\n",
    "    {name: [metrics[key] for key in metric_keys]\n",
    "     for name, metrics in model_metrics.items()},\n",
    "    index=['Accuracy', 'Precision', 'Recall', 'F1-Score'],\n",
    ")\n",
    "\n",
    "results_df.plot(kind='bar', figsize=(12, 6))\n",
    "plt.title('Model Performance Comparison', fontsize=14, fontweight='bold')\n",
    "plt.ylabel('Score')\n",
    "plt.ylim(0, 1)\n",
    "plt.legend(loc='lower right')\n",
    "plt.xticks(rotation=0)\n",
    "plt.tight_layout()\n",
    "\n",
    "# savefig does not create missing directories, so make sure the target\n",
    "# exists before writing (otherwise a fresh checkout fails here).\n",
    "os.makedirs('../outputs/visualizations', exist_ok=True)\n",
    "plt.savefig('../outputs/visualizations/model_comparison.png', dpi=150)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Detailed Evaluation - Best Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Get predictions from best model on the held-out TF-IDF test split\n",
    "y_pred = best_model.predict(X_test)\n",
    "\n",
    "# Classification report\n",
    "print(\"Classification Report:\")\n",
    "print(classification_report(y_test, y_pred))\n",
    "\n",
    "# Confusion matrix\n",
    "# NOTE(review): the tick labels assume exactly these three classes, in the\n",
    "# sorted order confusion_matrix uses by default -- confirm against the\n",
    "# dataset's actual label values.\n",
    "class_names = ['negative', 'neutral', 'positive']\n",
    "cm = confusion_matrix(y_test, y_pred)\n",
    "plt.figure(figsize=(8, 6))\n",
    "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n",
    "            xticklabels=class_names,\n",
    "            yticklabels=class_names)\n",
    "plt.title(f'Confusion Matrix - {best_name}', fontsize=14, fontweight='bold')\n",
    "plt.ylabel('True Label')\n",
    "plt.xlabel('Predicted Label')\n",
    "plt.tight_layout()\n",
    "\n",
    "# savefig does not create missing directories, so make sure the target\n",
    "# exists before writing.\n",
    "os.makedirs('../outputs/visualizations', exist_ok=True)\n",
    "plt.savefig('../outputs/visualizations/confusion_matrix.png', dpi=150)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Perform cross-validation on the full feature matrix for the best model.\n",
    "# The fold count is defined once so the call, the bar positions and the\n",
    "# title cannot drift apart.\n",
    "n_folds = 5\n",
    "cv_scores = models.cross_validate(X, y, model_name=best_name, cv=n_folds)\n",
    "fold_numbers = range(1, len(cv_scores) + 1)\n",
    "mean_score = cv_scores.mean()\n",
    "\n",
    "plt.figure(figsize=(8, 5))\n",
    "plt.bar(fold_numbers, cv_scores, color='steelblue', alpha=0.7)\n",
    "plt.axhline(y=mean_score, color='r', linestyle='--',\n",
    "            label=f'Mean: {mean_score:.3f}')\n",
    "plt.xlabel('Fold')\n",
    "# NOTE(review): the axis label assumes cross_validate scores with F1 -- confirm.\n",
    "plt.ylabel('F1 Score')\n",
    "plt.title(f'{n_folds}-Fold Cross-Validation - {best_name}', fontsize=14, fontweight='bold')\n",
    "plt.legend()\n",
    "plt.ylim(0, 1)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Mean with a two-standard-deviation spread across folds\n",
    "print(f\"CV Mean: {mean_score:.4f} (+/- {cv_scores.std() * 2:.4f})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Train LSTM (Deep Learning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "tf.config.set_visible_devices([], 'GPU')  # Disable GPU for notebook\n",
    "\n",
    "from src.models import LSTMModel\n",
    "\n",
    "# Prepare sequences from the preprocessed text and the sentiment labels\n",
    "lstm = LSTMModel(max_words=5000, max_len=50, embedding_dim=128)\n",
    "X_lstm, y_lstm = lstm.prepare_sequences(df['processed_text'], df['sentiment'])\n",
    "\n",
    "# Split with the same size and seed as the TF-IDF split, and stratify on the\n",
    "# labels as well so both model families are evaluated on class-balanced\n",
    "# test sets.\n",
    "# NOTE(review): assumes y_lstm has one row per sample (integer or one-hot\n",
    "# encoded) -- confirm against prepare_sequences.\n",
    "X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(\n",
    "    X_lstm, y_lstm, test_size=0.2, random_state=42, stratify=y_lstm\n",
    ")\n",
    "\n",
    "print(f\"Sequence shape: {X_train_l.shape}\")\n",
    "print(f\"Classes: {lstm.classes}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build and train LSTM\n",
    "# The output size is taken from lstm.classes (presumably populated by\n",
    "# prepare_sequences -- confirm).\n",
    "lstm.build_model(num_classes=len(lstm.classes), architecture='bidirectional')\n",
    "\n",
    "# NOTE(review): the held-out test split is passed right after the training\n",
    "# data. If train() uses it as validation data (the history plotted later has\n",
    "# val_* keys), the final test metrics are not from unseen data -- confirm.\n",
    "history = lstm.train(\n",
    "    X_train_l, y_train_l,\n",
    "    X_test_l, y_test_l,\n",
    "    epochs=10,\n",
    "    batch_size=32\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Plot training history: one panel per metric, train vs. validation curves.\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
    "\n",
    "# (history key, axis label) for each panel; the validation series uses the\n",
    "# same key with a `val_` prefix.\n",
    "panels = [('accuracy', 'Accuracy'), ('loss', 'Loss')]\n",
    "for ax, (metric, label) in zip(axes, panels):\n",
    "    ax.plot(history.history[metric], label='Train')\n",
    "    ax.plot(history.history[f'val_{metric}'], label='Validation')\n",
    "    ax.set_title(f'Model {label}')\n",
    "    ax.set_xlabel('Epoch')\n",
    "    ax.set_ylabel(label)\n",
    "    ax.legend()\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    "# savefig does not create missing directories, so make sure the target\n",
    "# exists before writing.\n",
    "os.makedirs('../outputs/visualizations', exist_ok=True)\n",
    "plt.savefig('../outputs/visualizations/lstm_training.png', dpi=150)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate LSTM\n",
    "lstm_metrics = lstm.evaluate(X_test_l, y_test_l)\n",
    "\n",
    "# Score the model that was actually selected as best instead of assuming it\n",
    "# is Logistic Regression.\n",
    "# NOTE(review): F1 is weighted-averaged here; confirm this matches the\n",
    "# averaging used inside src.models before comparing with earlier tables.\n",
    "best_pred = best_model.predict(X_test)\n",
    "best_accuracy = accuracy_score(y_test, best_pred)\n",
    "best_f1 = precision_recall_fscore_support(\n",
    "    y_test, best_pred, average='weighted', zero_division=0\n",
    ")[2]\n",
    "\n",
    "# F1 as the harmonic mean of the LSTM's reported precision and recall;\n",
    "# falls back to 0.0 when both are zero instead of dividing by zero.\n",
    "lstm_pr_sum = lstm_metrics['precision'] + lstm_metrics['recall']\n",
    "lstm_f1 = (2 * (lstm_metrics['precision'] * lstm_metrics['recall']) / lstm_pr_sum\n",
    "           if lstm_pr_sum else 0.0)\n",
    "\n",
    "# Compare with traditional models\n",
    "comparison = pd.DataFrame({\n",
    "    'Traditional ML (Best)': [best_accuracy, best_f1],\n",
    "    'LSTM Deep Learning': [lstm_metrics['accuracy'], lstm_f1]\n",
    "}, index=['Accuracy', 'F1-Score'])\n",
    "\n",
    "comparison.plot(kind='bar', figsize=(8, 5))\n",
    "plt.title('Traditional ML vs Deep Learning', fontsize=14, fontweight='bold')\n",
    "plt.ylabel('Score')\n",
    "plt.ylim(0, 1)\n",
    "plt.xticks(rotation=0)\n",
    "plt.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(comparison)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Save Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import os\n",
    "\n",
    "# Create models directory\n",
    "os.makedirs('../models', exist_ok=True)\n",
    "\n",
    "# Save best traditional model; the file is named after the selected model\n",
    "with open(f'../models/{best_name}.pkl', 'wb') as f:\n",
    "    pickle.dump(best_model, f)\n",
    "\n",
    "# Save preprocessor and extractor so new text can be transformed the same way\n",
    "with open('../models/preprocessor.pkl', 'wb') as f:\n",
    "    pickle.dump(preprocessor, f)\n",
    "\n",
    "extractor.save('../models/feature_extractor.pkl')\n",
    "\n",
    "# Save LSTM (model file plus its tokenizer)\n",
    "lstm.save('../models/lstm_model.h5', '../models/lstm_tokenizer.pkl')\n",
    "\n",
    "print(\"✅ All models saved!\")\n",
    "print(\"\\nSaved files:\")\n",
    "# os.listdir returns entries in arbitrary order; sort for a stable listing.\n",
    "# A dedicated loop name avoids reusing `f`, the file handle above.\n",
    "for saved_name in sorted(os.listdir('../models')):\n",
    "    print(f\"  - {saved_name}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report the numbers of the model that was actually selected as best rather\n",
    "# than assuming it is Logistic Regression.\n",
    "# NOTE(review): F1 is weighted-averaged here; confirm this matches the\n",
    "# averaging used inside src.models.\n",
    "best_pred = best_model.predict(X_test)\n",
    "best_accuracy = accuracy_score(y_test, best_pred)\n",
    "best_f1 = precision_recall_fscore_support(\n",
    "    y_test, best_pred, average='weighted', zero_division=0\n",
    ")[2]\n",
    "\n",
    "print(\"=\"*60)\n",
    "print(\"TRAINING SUMMARY\")\n",
    "print(\"=\"*60)\n",
    "print(f\"Best Traditional Model: {best_name}\")\n",
    "print(f\"  Accuracy:  {best_accuracy:.4f}\")\n",
    "print(f\"  F1-Score:  {best_f1:.4f}\")\n",
    "print(f\"\\nLSTM Deep Learning:\")\n",
    "print(f\"  Accuracy:  {lstm_metrics['accuracy']:.4f}\")\n",
    "print(f\"  Precision: {lstm_metrics['precision']:.4f}\")\n",
    "print(f\"  Recall:    {lstm_metrics['recall']:.4f}\")\n",
    "print(\"=\"*60)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}