In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 📊 Sentiment Analysis - Data Exploration\n",
    "\n",
    "This notebook explores the dataset, visualizes distributions, and prepares data for modeling.\n",
    "\n",
    "## Objectives:\n",
    "- Load and inspect data\n",
    "- Visualize sentiment distributions\n",
    "- Analyze text statistics\n",
    "- Explore word frequencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup: imports, import path, plot styling and the figure output folder.\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Later cells import from the `src` package (presumably located one level up).\n",
    "# The guard keeps sys.path free of duplicate entries when this cell is re-run.\n",
    "if '..' not in sys.path:\n",
    "    sys.path.append('..')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from collections import Counter\n",
    "import warnings\n",
    "\n",
    "# Silences every warning category for the rest of the session.\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Set style. Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';\n",
    "# fall back to the legacy name when the new one is not available.\n",
    "style_name = 'seaborn-v0_8-darkgrid'\n",
    "if style_name not in plt.style.available:\n",
    "    style_name = 'seaborn-darkgrid'\n",
    "plt.style.use(style_name)\n",
    "sns.set_palette(\"husl\")\n",
    "\n",
    "# The plotting cells save figures into this folder; savefig does not create it.\n",
    "Path('../outputs/visualizations').mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "print(\"✅ Libraries imported successfully\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.data_loader import DataLoader\n",
    "\n",
    "# Instantiate the project loader and read the Twitter dataset\n",
    "# (per the original note, this can be changed to IMDB/Amazon).\n",
    "loader = DataLoader()\n",
    "df = loader.load_twitter_data()\n",
    "\n",
    "# Single print call; the newline separator reproduces the three-line layout.\n",
    "print(\n",
    "    f\"Dataset shape: {df.shape}\",\n",
    "    f\"\\nColumns: {df.columns.tolist()}\",\n",
    "    \"\\nFirst few rows:\",\n",
    "    sep=\"\\n\",\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Basic Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset info: row count, missing cells across all columns, fully duplicated rows\n",
    "print(\"Dataset Info:\")\n",
    "for label, value in (\n",
    "    (\"Total samples\", len(df)),\n",
    "    (\"Missing values\", df.isnull().sum().sum()),\n",
    "    (\"Duplicate rows\", df.duplicated().sum()),\n",
    "):\n",
    "    print(f\"{label}: {value}\")\n",
    "\n",
    "# Sentiment distribution: absolute counts first, then shares in percent.\n",
    "# `sentiment_counts` is reused by later cells.\n",
    "print(\"\\nSentiment Distribution:\")\n",
    "sentiment_series = df['sentiment']\n",
    "sentiment_counts = sentiment_series.value_counts()\n",
    "print(sentiment_counts)\n",
    "print(\"\\nPercentages:\")\n",
    "print(sentiment_series.value_counts(normalize=True) * 100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Visualize Distributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One palette shared by both panels; colours are applied positionally, i.e. in the\n",
    "# order of `sentiment_counts`.\n",
    "# NOTE(review): that order comes from value_counts (by frequency), so a colour is not\n",
    "# tied to a particular sentiment label - confirm this is intended.\n",
    "colors = ['#28a745', '#dc3545', '#ffc107']\n",
    "\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n",
    "ax_bar, ax_pie = axes\n",
    "\n",
    "# Left panel: absolute counts per sentiment\n",
    "sentiment_counts.plot(kind='bar', ax=ax_bar, color=colors)\n",
    "ax_bar.set_title('Sentiment Distribution (Count)', fontsize=14, fontweight='bold')\n",
    "ax_bar.set(xlabel='Sentiment', ylabel='Count')\n",
    "ax_bar.tick_params(axis='x', rotation=0)\n",
    "\n",
    "# Right panel: share of each sentiment\n",
    "ax_pie.pie(\n",
    "    sentiment_counts,\n",
    "    labels=sentiment_counts.index,\n",
    "    autopct='%1.1f%%',\n",
    "    colors=colors,\n",
    "    startangle=90,\n",
    ")\n",
    "ax_pie.set_title('Sentiment Distribution (%)', fontsize=14, fontweight='bold')\n",
    "\n",
    "plt.tight_layout()\n",
    "fig.savefig('../outputs/visualizations/sentiment_distribution.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Text Length Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-row size features: character count and whitespace-separated token count\n",
    "df['text_length'] = df['text'].str.len()\n",
    "df['word_count'] = df['text'].str.split().str.len()\n",
    "\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
    "\n",
    "# Top row: overall histograms, one (column, colour, title, x-label) spec per panel\n",
    "hist_specs = [\n",
    "    ('text_length', 'skyblue', 'Distribution of Text Length (Characters)', 'Length'),\n",
    "    ('word_count', 'lightcoral', 'Distribution of Word Count', 'Word Count'),\n",
    "]\n",
    "for ax, (column, color, title, xlabel) in zip(axes[0], hist_specs):\n",
    "    df[column].hist(bins=50, ax=ax, color=color, edgecolor='black')\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel('Frequency')\n",
    "\n",
    "# Bottom row: the same two features as box plots split by sentiment\n",
    "box_specs = [\n",
    "    ('text_length', 'Text Length by Sentiment'),\n",
    "    ('word_count', 'Word Count by Sentiment'),\n",
    "]\n",
    "for ax, (column, title) in zip(axes[1], box_specs):\n",
    "    df.boxplot(column=column, by='sentiment', ax=ax)\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel('Sentiment')\n",
    "\n",
    "plt.suptitle('')  # Remove automatic title\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Summary statistics of the character length, per sentiment\n",
    "print(\"Text Length Statistics by Sentiment:\")\n",
    "print(df.groupby('sentiment')['text_length'].describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Word Frequency Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.preprocess import TextPreprocessor\n",
    "\n",
    "# Run every text through the project preprocessor, configured with\n",
    "# remove_stopwords=True and lemmatize=True; results go into df['processed'].\n",
    "preprocessor = TextPreprocessor(remove_stopwords=True, lemmatize=True)\n",
    "df['processed'] = df['text'].apply(preprocessor.preprocess)\n",
    "\n",
    "# Pool all processed texts into one token list and count occurrences.\n",
    "# `word_freq` is reused by later cells.\n",
    "corpus_text = ' '.join(df['processed'])\n",
    "all_words = corpus_text.split()\n",
    "word_freq = Counter(all_words)\n",
    "\n",
    "print(f\"Total words: {len(all_words)}\")\n",
    "print(f\"Unique words: {len(word_freq)}\")\n",
    "print(\"\\nTop 20 most common words:\")\n",
    "top_20 = word_freq.most_common(20)\n",
    "for token, occurrences in top_20:\n",
    "    print(f\"  {token}: {occurrences}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bar chart of the 20 most frequent words, most common first\n",
    "top_words = dict(word_freq.most_common(20))\n",
    "bar_labels = list(top_words.keys())\n",
    "bar_heights = list(top_words.values())\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "ax.bar(bar_labels, bar_heights, color='steelblue')\n",
    "ax.set_title('Top 20 Most Frequent Words', fontsize=14, fontweight='bold')\n",
    "ax.set_xlabel('Words')\n",
    "ax.set_ylabel('Frequency')\n",
    "# Slanted, right-aligned tick labels keep the words from overlapping\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "plt.tight_layout()\n",
    "fig.savefig('../outputs/visualizations/top_words.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Word Cloud"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from wordcloud import WordCloud\n",
    "\n",
    "# Build the cloud from all processed texts joined into a single string\n",
    "cloud_text = ' '.join(df['processed'])\n",
    "cloud_builder = WordCloud(\n",
    "    width=800,\n",
    "    height=400,\n",
    "    background_color='white',\n",
    "    max_words=100,\n",
    ")\n",
    "wordcloud = cloud_builder.generate(cloud_text)\n",
    "\n",
    "# Render the cloud as an image without axes, then save and show it\n",
    "fig = plt.figure(figsize=(15, 7))\n",
    "plt.imshow(wordcloud, interpolation='bilinear')\n",
    "plt.axis('off')\n",
    "plt.title('Word Cloud of Processed Text', fontsize=16, fontweight='bold')\n",
    "plt.tight_layout()\n",
    "fig.savefig('../outputs/visualizations/wordcloud.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Sentiment by Text Length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create length categories from the character count.\n",
    "# include_lowest=True closes the first bin on the left, so zero-length texts land in\n",
    "# 'Very Short' instead of becoming NaN and silently dropping out of the crosstab.\n",
    "df['length_category'] = pd.cut(df['text_length'],\n",
    "                               bins=[0, 50, 100, 200, float('inf')],\n",
    "                               labels=['Very Short', 'Short', 'Medium', 'Long'],\n",
    "                               include_lowest=True)\n",
    "\n",
    "# Cross-tabulation: share of each sentiment within a length category, in percent\n",
    "crosstab = pd.crosstab(df['length_category'], df['sentiment'], normalize='index') * 100\n",
    "\n",
    "# DataFrame.plot opens its own figure unless it is handed an Axes, which would leave\n",
    "# a blank figure behind and ignore the requested size; pass `ax` explicitly.\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "crosstab.plot(kind='bar', stacked=True, ax=ax, color=['#28a745', '#dc3545', '#ffc107'])\n",
    "ax.set_title('Sentiment Distribution by Text Length Category', fontsize=14, fontweight='bold')\n",
    "ax.set_xlabel('Length Category')\n",
    "ax.set_ylabel('Percentage')\n",
    "ax.legend(title='Sentiment')\n",
    "ax.tick_params(axis='x', rotation=0)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Raw counts behind the percentages above\n",
    "print(\"\\nCross-tabulation (counts):\")\n",
    "print(pd.crosstab(df['length_category'], df['sentiment']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Save Processed Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "# Save processed dataframe (the columns added in the earlier cells are written too)\n",
    "output_path = '../data/processed_data.csv'\n",
    "# to_csv does not create missing folders, so make sure the target folder exists\n",
    "Path(output_path).parent.mkdir(parents=True, exist_ok=True)\n",
    "df.to_csv(output_path, index=False)\n",
    "print(f\"✅ Processed data saved to: {output_path}\")\n",
    "\n",
    "# Summary of the figures computed throughout the notebook\n",
    "print(\"\\n\" + \"=\"*60)\n",
    "print(\"DATA EXPLORATION SUMMARY\")\n",
    "print(\"=\"*60)\n",
    "print(f\"Total samples: {len(df)}\")\n",
    "# to_dict() yields plain Python numbers, avoiding NumPy scalar reprs in the output\n",
    "print(f\"Sentiment distribution: {sentiment_counts.to_dict()}\")\n",
    "print(f\"Average text length: {df['text_length'].mean():.1f} characters\")\n",
    "print(f\"Average word count: {df['word_count'].mean():.1f} words\")\n",
    "print(f\"Vocabulary size: {len(word_freq)} unique words\")\n",
    "print(\"=\"*60)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}