In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploratory Data Analysis for Amazon Fake Review Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "import nltk\n",
    "from wordcloud import WordCloud\n",
    "\n",
    "# Fetch the NLTK data used later in the notebook: the VADER lexicon for the\n",
    "# sentiment analyzer and the stopword corpus for the word clouds\n",
    "for nltk_resource in ('vader_lexicon', 'stopwords'):\n",
    "    nltk.download(nltk_resource)\n",
    "\n",
    "# Plot styling: matplotlib's ggplot sheet first, then seaborn's whitegrid style\n",
    "plt.style.use('ggplot')\n",
    "sns.set(style='whitegrid')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the reviews CSV into a DataFrame\n",
    "reviews_csv = '../data/amazon_reviews.csv'\n",
    "df = pd.read_csv(reviews_csv)\n",
    "\n",
    "# Report (rows, columns), then show the first five rows as the cell output\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count nulls per column and print only the columns that have at least one\n",
    "missing_values = df.isna().sum()\n",
    "print(\"Missing values per column:\")\n",
    "print(missing_values.loc[missing_values > 0])\n",
    "\n",
    "# Tally how many reviews carry each label value\n",
    "label_counts = df['LABEL'].value_counts()\n",
    "print(\"\\nLabel distribution:\")\n",
    "print(label_counts)\n",
    "\n",
    "# Bar chart of the number of reviews per label\n",
    "fig, ax = plt.subplots(figsize=(8, 5))\n",
    "sns.countplot(data=df, x='LABEL', ax=ax)\n",
    "ax.set_title('Distribution of Fake vs. Genuine Reviews')\n",
    "ax.set_xlabel('Label (0=Genuine, 1=Fake)')\n",
    "ax.set_ylabel('Count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Review Length Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive character and word counts from the review text; missing text is\n",
    "# treated as an empty string, so it counts as length 0\n",
    "cleaned_text = df['REVIEW_TEXT'].fillna('').astype(str)\n",
    "df['review_length'] = cleaned_text.map(len)\n",
    "df['word_count'] = cleaned_text.map(lambda text: len(text.split()))\n",
    "\n",
    "# One box plot per derived feature, split by label: (column, title, y-axis label)\n",
    "length_plots = [\n",
    "    ('review_length', 'Review Length by Label', 'Review Length (characters)'),\n",
    "    ('word_count', 'Word Count by Label', 'Word Count'),\n",
    "]\n",
    "for feature_column, plot_title, y_axis_label in length_plots:\n",
    "    plt.figure(figsize=(12, 6))\n",
    "    sns.boxplot(x='LABEL', y=feature_column, data=df)\n",
    "    plt.title(plot_title)\n",
    "    plt.xlabel('Label (0=Genuine, 1=Fake)')\n",
    "    plt.ylabel(y_axis_label)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rating Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count reviews at each rating value, with one bar per label value\n",
    "plt.figure(figsize=(10, 6))\n",
    "ax = sns.countplot(x='RATING', hue='LABEL', data=df)\n",
    "ax.set_title('Rating Distribution by Label')\n",
    "ax.set_xlabel('Rating')\n",
    "ax.set_ylabel('Count')\n",
    "\n",
    "# Rename the legend entries while reusing the handles seaborn registered for\n",
    "# the hue levels. Passing only labels= to plt.legend() pairs the names with\n",
    "# artists by draw order, which can attach the wrong colour to each class.\n",
    "# Assumes LABEL is coded 0/1, so the hue levels come out as Genuine, Fake.\n",
    "hue_handles = ax.get_legend_handles_labels()[0]\n",
    "ax.legend(handles=hue_handles, labels=['Genuine', 'Fake'], title='Label')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sentiment Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# VADER sentiment analyzer from NLTK\n",
    "sia = SentimentIntensityAnalyzer()\n",
    "\n",
    "\n",
    "def compound_score(review):\n",
    "    # polarity_scores() returns a dict; only its 'compound' entry is kept\n",
    "    return sia.polarity_scores(review)['compound']\n",
    "\n",
    "\n",
    "# Score every review; missing text is scored as an empty string\n",
    "df['sentiment_score'] = df['REVIEW_TEXT'].fillna('').astype(str).map(compound_score)\n",
    "\n",
    "# Box plot of the sentiment score for each label value\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.boxplot(data=df, x='LABEL', y='sentiment_score', ax=ax)\n",
    "ax.set_title('Sentiment Score by Label')\n",
    "ax.set_xlabel('Label (0=Genuine, 1=Fake)')\n",
    "ax.set_ylabel('Sentiment Score (-1 to 1)')\n",
    "plt.show()\n",
    "\n",
    "# Box plot of the sentiment score at each rating value, split by label\n",
    "plt.figure(figsize=(12, 6))\n",
    "ax = sns.boxplot(x='RATING', y='sentiment_score', hue='LABEL', data=df)\n",
    "ax.set_title('Sentiment Score by Rating and Label')\n",
    "ax.set_xlabel('Rating')\n",
    "ax.set_ylabel('Sentiment Score (-1 to 1)')\n",
    "\n",
    "# Rename the legend entries while reusing the handles seaborn registered for\n",
    "# the hue levels. Passing only labels= to plt.legend() pairs the names with\n",
    "# artists by draw order, which can attach the wrong colour to each class.\n",
    "# Assumes LABEL is coded 0/1, so the hue levels come out as Genuine, Fake.\n",
    "hue_handles = ax.get_legend_handles_labels()[0]\n",
    "ax.legend(handles=hue_handles, labels=['Genuine', 'Fake'], title='Label')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verified Purchase Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count reviews for each VERIFIED_PURCHASE value, with one bar per label value\n",
    "plt.figure(figsize=(10, 6))\n",
    "ax = sns.countplot(x='VERIFIED_PURCHASE', hue='LABEL', data=df)\n",
    "ax.set_title('Verified Purchase Distribution by Label')\n",
    "ax.set_xlabel('Verified Purchase (Y/N)')\n",
    "ax.set_ylabel('Count')\n",
    "\n",
    "# Rename the legend entries while reusing the handles seaborn registered for\n",
    "# the hue levels. Passing only labels= to plt.legend() pairs the names with\n",
    "# artists by draw order, which can attach the wrong colour to each class.\n",
    "# Assumes LABEL is coded 0/1, so the hue levels come out as Genuine, Fake.\n",
    "hue_handles = ax.get_legend_handles_labels()[0]\n",
    "ax.legend(handles=hue_handles, labels=['Genuine', 'Fake'], title='Label')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Word Clouds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# WordCloud is already imported in the first code cell, so it is not\n",
    "# re-imported here; only the stopword corpus reader is new.\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "# English stopwords as a set, passed to WordCloud so these words are excluded\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "def create_wordcloud(text, title):\n",
    "    \"\"\"Render a word cloud of `text` under the heading `title`.\n",
    "\n",
    "    Words in the notebook-level `stop_words` set are excluded and at most 100\n",
    "    words are drawn. Blank input is reported and skipped, because\n",
    "    WordCloud.generate() raises ValueError when it is given no words.\n",
    "\n",
    "    Args:\n",
    "        text: The text to visualise, as a single string.\n",
    "        title: Title shown above the figure.\n",
    "    \"\"\"\n",
    "    # Guard: an empty class (e.g. no rows for a label) would otherwise crash the cell\n",
    "    if not text.strip():\n",
    "        print(f'No text available for: {title}')\n",
    "        return\n",
    "\n",
    "    wordcloud = WordCloud(width=800, height=400, background_color='white',\n",
    "                          stopwords=stop_words, max_words=100).generate(text)\n",
    "\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    plt.imshow(wordcloud, interpolation='bilinear')\n",
    "    plt.axis('off')  # the axes carry no meaning for an image of words\n",
    "    plt.title(title)\n",
    "    plt.show()\n",
    "\n",
    "# Join the review text of each class into one string; missing text becomes empty\n",
    "all_review_text = df['REVIEW_TEXT'].fillna('').astype(str)\n",
    "genuine_text = ' '.join(all_review_text[df['LABEL'] == 0])\n",
    "fake_text = ' '.join(all_review_text[df['LABEL'] == 1])\n",
    "\n",
    "# Draw one cloud per class, genuine first\n",
    "cloud_inputs = (\n",
    "    (genuine_text, 'Word Cloud for Genuine Reviews'),\n",
    "    (fake_text, 'Word Cloud for Fake Reviews'),\n",
    ")\n",
    "for class_text, cloud_title in cloud_inputs:\n",
    "    create_wordcloud(class_text, cloud_title)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlation Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add more features for correlation analysis\n",
    "import string\n",
    "\n",
    "df['capital_letters'] = df['REVIEW_TEXT'].fillna('').astype(str).apply(lambda x: sum(1 for c in x if c.isupper()))\n",
    "df['punctuation_count'] = df['REVIEW_TEXT'].fillna('').astype(str).apply(lambda x: sum(1 for c in x if c in string.punctuation))\n",
    "df['exclamation_count'] = df['REVIEW_TEXT'].fill
   ]
  }