# 🚀 Product Intelligence Engine - Data Exploration

Notebook ini digunakan untuk eksplorasi data hasil scraping dan processing.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): the blanket 'ignore' filter also hides deprecation warnings
# from the plotting libraries — confirm that is intended.
warnings.filterwarnings('ignore')

# Set style: the matplotlib style sheet is applied first, then the seaborn
# palette, so the palette is not reset by the style sheet.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print('‚úÖ Libraries imported successfully!')

## 📂 Load Data

In [None]:
# Data folders, resolved relative to the notebook's working directory
DATA_DIR = Path('../data')
RAW_DIR, PROCESSED_DIR = DATA_DIR / 'raw', DATA_DIR / 'processed'

# Print the CSV files found in each folder, one titled section per folder
print('üìÅ Available files:')
for section_title, folder in (
    ('\nRaw data files:', RAW_DIR),
    ('\nProcessed data files:', PROCESSED_DIR),
):
    print(section_title)
    for csv_path in folder.glob('*.csv'):
        print(f'   - {csv_path.name}')

In [None]:
# Load the most recently modified processed CSV into `df`.
#
# 'insights_summary.csv' is excluded from the candidates: it is the
# per-category aggregate this notebook writes into the same folder, so on a
# re-run it would be the newest file and would be loaded in place of the
# review-level data (it has no 'rating', 'sentiment' or 'priority' columns).
processed_files = [
    path for path in PROCESSED_DIR.glob('*.csv')
    if path.name != 'insights_summary.csv'
]

if processed_files:
    # Rank by st_mtime (last content modification). st_ctime is the
    # metadata-change time on Unix and the creation time on Windows, so it
    # does not reliably identify the latest pipeline output.
    latest_file = max(processed_files, key=lambda path: path.stat().st_mtime)
    df = pd.read_csv(latest_file)
    print(f'‚úÖ Loaded: {latest_file.name}')
    print(f'üìä Shape: {df.shape}')
    print(f'\nColumns: {list(df.columns)}')
else:
    print('‚ùå No processed data found. Please run the pipeline first.')
    # Empty frame so the `if not df.empty` guards in later cells skip cleanly
    df = pd.DataFrame()

In [None]:
# Quick overview: show the first rows, then the column/dtype summary.
# Skipped entirely when no data was loaded (df is empty).
if not df.empty:
    # `display` is not imported in this file; presumably the notebook's
    # built-in rich renderer — confirm when running outside Jupyter.
    display(df.head())
    print('\nüìã Data Info:')
    df.info()

## 📊 Data Analysis

In [None]:
# Basic statistics: review count, rating summary and covered date range.
if not df.empty:
    print('üìà Basic Statistics:\n')
    print(f'Total Reviews: {len(df):,}')

    # Guard on the column, as the plotting cells do, so a dataset without
    # ratings skips this section instead of raising KeyError.
    if 'rating' in df.columns:
        ratings = df['rating']
        print(f'Average Rating: {ratings.mean():.2f}/5.0')
        print(f'Median Rating: {ratings.median():.1f}')
        print(f'Rating Std Dev: {ratings.std():.2f}')

    if 'date' in df.columns:
        # Count values that were already missing so that only genuine parse
        # failures are reported below.
        missing_before = int(df['date'].isna().sum())
        # errors='coerce' turns unparseable values into NaT instead of
        # aborting the whole cell on a single malformed date.
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        unparseable = int(df['date'].isna().sum()) - missing_before

        valid_dates = df['date'].dropna()
        # min()/max() of an all-NaT column is NaT, which has no usable
        # .date(); only print the range when at least one date is valid.
        if not valid_dates.empty:
            print(f'\nDate Range: {valid_dates.min().date()} to {valid_dates.max().date()}')
        if unparseable > 0:
            print(f'Unparseable date values: {unparseable:,}')

### 📊 Category Distribution

In [None]:
if 'category' in df.columns and not df.empty:
    # One row, two panels: horizontal bars (left) and pie (right)
    fig, (bar_ax, pie_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    # Number of rows per category, most frequent first
    category_counts = df['category'].value_counts()

    # Left panel: absolute counts
    category_counts.plot(kind='barh', ax=bar_ax, color='skyblue')
    bar_ax.set_title('üìä Category Distribution', fontsize=14, fontweight='bold')
    bar_ax.set_xlabel('Count')
    bar_ax.set_ylabel('Category')

    # Right panel: share of each category
    category_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title('üìä Category Percentage', fontsize=14, fontweight='bold')
    pie_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

    # Text version of the same breakdown; percentages are relative to all rows
    print('\nüìã Category Breakdown:')
    total_rows = len(df)
    for cat, count in category_counts.items():
        share = (count / total_rows) * 100
        print(f'   {cat}: {count} ({share:.1f}%)')

### 😊 Sentiment Analysis

In [None]:
if 'sentiment' in df.columns and not df.empty:
    # Two side-by-side panels: bar chart and pie chart of the same counts
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    bar_ax, pie_ax = axes

    sentiment_counts = df['sentiment'].value_counts()

    # Fixed color per sentiment label; any other label is drawn in gray
    palette = {'positive': '#2ecc71', 'neutral': '#f39c12', 'negative': '#e74c3c'}
    sentiment_colors = [palette.get(label, 'gray') for label in sentiment_counts.index]

    # Left panel: counts per sentiment
    sentiment_counts.plot(kind='bar', ax=bar_ax, color=sentiment_colors)
    bar_ax.set_title('üòä Sentiment Distribution', fontsize=14, fontweight='bold')
    bar_ax.set_xlabel('Sentiment')
    bar_ax.set_ylabel('Count')
    bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: share per sentiment, using the same colors
    sentiment_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', colors=sentiment_colors)
    pie_ax.set_title('üòä Sentiment Percentage', fontsize=14, fontweight='bold')
    pie_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

### ⭐ Rating Analysis

In [None]:
if 'rating' in df.columns and not df.empty:
    # Two side-by-side panels: per-rating counts and a box plot
    fig, (count_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: number of reviews per rating value, ordered by rating
    per_rating = df['rating'].value_counts().sort_index()
    per_rating.plot(kind='bar', ax=count_ax, color='gold')
    count_ax.set_title('‚≠ê Rating Distribution', fontsize=14, fontweight='bold')
    count_ax.set_xlabel('Rating')
    count_ax.set_ylabel('Count')
    count_ax.tick_params(axis='x', rotation=0)

    # Right panel: spread of the rating column
    df.boxplot(column='rating', ax=box_ax)
    box_ax.set_title('‚≠ê Rating Box Plot', fontsize=14, fontweight='bold')
    box_ax.set_ylabel('Rating')

    plt.tight_layout()
    plt.show()

### ⚠️ Priority Analysis

In [None]:
if 'priority' in df.columns and not df.empty:
    # Rows per priority level, most frequent first
    priority_counts = df['priority'].value_counts()

    # Fixed color per priority level; any other level is drawn in gray
    level_palette = {'high': '#e74c3c', 'medium': '#f39c12', 'low': '#2ecc71'}
    priority_colors = []
    for level in priority_counts.index:
        priority_colors.append(level_palette.get(level, 'gray'))

    # Single bar chart drawn on a fresh 10x6 figure
    plt.figure(figsize=(10, 6))
    priority_counts.plot(kind='bar', color=priority_colors)
    plt.title('‚ö†Ô∏è Priority Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Priority Level')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 🔍 Deep Dive Analysis

In [None]:
# Category by Sentiment: stacked bar chart of sentiment counts per category.
if not df.empty and 'category' in df.columns and 'sentiment' in df.columns:
    category_sentiment = pd.crosstab(df['category'], df['sentiment'])

    # An empty table (no row has both values) cannot be plotted
    if not category_sentiment.empty:
        # crosstab sorts its columns by label (negative, neutral, positive),
        # so colors are looked up per label rather than passed positionally;
        # a positional green/orange/red list would paint negative green and
        # positive red. Labels outside the map are drawn in gray.
        sentiment_palette = {'positive': '#2ecc71', 'neutral': '#f39c12', 'negative': '#e74c3c'}
        stack_colors = [sentiment_palette.get(label, 'gray') for label in category_sentiment.columns]

        # DataFrame.plot opens its own figure when no `ax` is given, so no
        # separate plt.figure() call is made (it would leave an empty figure).
        category_sentiment.plot(kind='bar', stacked=True,
                                color=stack_colors,
                                figsize=(12, 6))
        plt.title('üìä Category vs Sentiment', fontsize=14, fontweight='bold')
        plt.xlabel('Category')
        plt.ylabel('Count')
        # Anchor the legend outside the axes so it does not cover the bars
        plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1))
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

In [None]:
# Top issues by category: count the rows whose priority is 'high' per
# category and show a sample of them.
if not df.empty and 'category' in df.columns and 'priority' in df.columns:
    print('üî¥ High Priority Issues by Category:\n')
    high_priority = df[df['priority'] == 'high']

    if high_priority.empty:
        # Say so explicitly instead of leaving the heading with nothing under it
        print('No high priority issues found.')
    else:
        category_high_priority = high_priority['category'].value_counts()
        for cat, count in category_high_priority.items():
            print(f'{cat}: {count} issues')

        # Show sample high priority issues. Only 'category' is guaranteed by
        # the guard above, so the other columns are included only when
        # present; selecting a missing column would raise KeyError.
        print('\nüìã Sample High Priority Issues:')
        sample_columns = [
            col for col in ('category', 'rating', 'summary')
            if col in high_priority.columns
        ]
        display(high_priority[sample_columns].head(10))

## 💡 Export Insights

In [None]:
# Generate summary report: headline figures, with 'N/A' for any figure whose
# source column is absent from the loaded data.
if not df.empty:
    available = set(df.columns)

    # Most common category: mode() returns an empty Series when the column
    # holds no non-null values, so the first entry is read only if it exists.
    most_common_category = 'N/A'
    if 'category' in available:
        category_modes = df['category'].mode()
        if not category_modes.empty:
            most_common_category = category_modes.iloc[0]

    summary = {
        'Total Reviews': len(df),
        # Guarded like the other figures; the original read df['rating']
        # unconditionally and raised KeyError on data without ratings.
        'Average Rating': round(df['rating'].mean(), 2) if 'rating' in available else 'N/A',
        'Positive Reviews': int((df['sentiment'] == 'positive').sum()) if 'sentiment' in available else 'N/A',
        'Negative Reviews': int((df['sentiment'] == 'negative').sum()) if 'sentiment' in available else 'N/A',
        'High Priority Issues': int((df['priority'] == 'high').sum()) if 'priority' in available else 'N/A',
        'Most Common Category': most_common_category,
    }

    print('üìä Summary Report:')
    print('=' * 50)
    for key, value in summary.items():
        print(f'{key}: {value}')
    print('=' * 50)

In [None]:
# Save insights for dashboard: write one row per category with its review
# count, mean rating and number of high-priority rows.
if not df.empty:
    # The aggregation reads all three columns; report what is missing rather
    # than failing with a KeyError inside groupby/agg.
    required_columns = ('category', 'rating', 'priority')
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        print(f'Insights not saved; missing columns: {missing_columns}')
    else:
        # Create aggregated data for Looker Studio
        insights_file = PROCESSED_DIR / 'insights_summary.csv'

        # Named aggregation gives each output column its final name directly,
        # instead of renaming flattened MultiIndex columns by position.
        # total_reviews counts non-null ratings per category.
        category_summary = df.groupby('category').agg(
            total_reviews=('rating', 'count'),
            avg_rating=('rating', 'mean'),
            high_priority_count=('priority', lambda levels: (levels == 'high').sum()),
        ).reset_index()

        category_summary.to_csv(insights_file, index=False)

        print(f'‚úÖ Insights saved to: {insights_file}')