# Exploratory Data Analysis & Visualization - AI/ML Market Analysis

This notebook explores the processed AI market datasets through static and interactive visualizations, backed by statistical analysis, and summarizes the resulting insights.

## Objectives:
- Perform detailed correlation analysis
- Identify trends and patterns over time
- Analyze regional patterns and differences
- Conduct statistical tests and hypothesis validation
- Create interactive visualizations for exploration
- Generate business insights and recommendations

## 1. Import Libraries and Load Data

In [None]:
# Import comprehensive visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Statistical analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose

# Utilities
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot defaults applied to every matplotlib/seaborn figure in this notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Read the two pre-processed CSV files that the rest of the notebook analyses
processed_dir = Path('../data/processed')
market_csv = processed_dir / 'ai_market_engineered.csv'
popularity_csv = processed_dir / 'ai_popularity_clean.csv'
market_df = pd.read_csv(market_csv)
popularity_df = pd.read_csv(popularity_csv)

# Report the (rows, columns) shape of each dataset as a quick sanity check
print("✅ Data loaded for EDA!")
print(f"Market data (engineered): {market_df.shape}")
print(f"Popularity data: {popularity_df.shape}")

## 2. Market Evolution Timeline Analysis

In [None]:
# Build a 3x2 grid of line charts showing how the headline market metrics evolve by year
fig = make_subplots(
    rows=3, cols=2,
    subplot_titles=(
        'AI Revenue Growth (2018-2025)',
        'Market Value Explosion', 
        'Adoption Rate Progression',
        'Job Market Transformation',
        'Organizational AI Integration',
        'Technology Impact Metrics'
    ),
    vertical_spacing=0.08,
    horizontal_spacing=0.1
)

# One entry per trace, in drawing order:
# (source column, trace name, line colour, marker size or None, subplot row, subplot col).
# The two job series share the (2, 2) panel; only the first three traces set a marker size.
timeline_traces = [
    ('ai_software_revenue_in_billions', 'AI Revenue', '#1f77b4', 8, 1, 1),
    ('global_ai_market_value_in_billions', 'Market Value', '#ff7f0e', 8, 1, 2),
    ('ai_adoption', 'Adoption Rate', '#2ca02c', 8, 2, 1),
    ('estimated_jobs_eliminated_by_ai_millions', 'Jobs Eliminated', '#d62728', None, 2, 2),
    ('estimated_new_jobs_created_by_ai_millions', 'Jobs Created', '#9467bd', None, 2, 2),
    ('organizations_using_ai', 'Orgs Using AI', '#8c564b', None, 3, 1),
    ('americans_using_voice_assistants', 'Voice Assistant Usage', '#e377c2', None, 3, 2),
]

for source_col, trace_name, line_colour, marker_size, grid_row, grid_col in timeline_traces:
    # Columns missing from the engineered dataset simply leave their panel empty
    if source_col not in market_df.columns:
        continue

    scatter_options = dict(
        x=market_df['year'],
        y=market_df[source_col],
        mode='lines+markers',
        name=trace_name,
        line=dict(color=line_colour, width=3),
    )
    if marker_size is not None:
        scatter_options['marker'] = dict(size=marker_size)

    fig.add_trace(go.Scatter(**scatter_options), row=grid_row, col=grid_col)

# The legend is hidden because each subplot title already names its metric
fig.update_layout(
    height=900,
    title_text="AI Market Evolution Timeline (2018-2025)",
    title_x=0.5,
    showlegend=False
)

fig.show()

## 3. Advanced Correlation Analysis

In [None]:
# Correlation matrix helper
def advanced_correlation_analysis(df, method='pearson'):
    """
    Compute the pairwise correlation matrix of the numeric columns of ``df``.

    Args:
        df: DataFrame to analyse; non-numeric columns are ignored.
        method: 'pearson' or 'spearman'. Any other value is treated as
            'kendall'.

    Returns:
        Square DataFrame of correlation coefficients, indexed and labelled
        by the numeric column names.
    """
    # Anything that is not explicitly Pearson or Spearman falls back to Kendall's tau
    chosen_method = method if method in ('pearson', 'spearman') else 'kendall'
    return df.select_dtypes(include=[np.number]).corr(method=chosen_method)

# Pearson and Spearman matrices for the market data (both are exported later in the notebook)
corr_pearson = advanced_correlation_analysis(market_df, 'pearson')
corr_spearman = advanced_correlation_analysis(market_df, 'spearman')

# Interactive heatmap of the Pearson matrix, annotated with coefficients rounded to 2 decimals
pearson_values = corr_pearson.values
pearson_labels = corr_pearson.columns
pearson_heatmap = go.Heatmap(
    z=pearson_values,
    x=pearson_labels,
    y=pearson_labels,
    colorscale='RdBu',
    zmid=0,  # centre the diverging colour scale on zero correlation
    text=np.round(pearson_values, 2),
    texttemplate="%{text}",
    textfont={"size": 8},
    hoverongaps=False
)
fig = go.Figure(data=pearson_heatmap)

fig.update_layout(
    title='AI Market Data - Correlation Matrix (Pearson)',
    width=1000,
    height=800,
    xaxis_tickangle=-45
)

fig.show()

# Find and display strongest correlations
def find_strong_correlations(corr_matrix, threshold=0.7):
    """
    List the variable pairs whose absolute correlation exceeds ``threshold``.

    Only the upper triangle of the matrix is scanned, so each pair is reported
    once and the diagonal is skipped. NaN coefficients are ignored.

    Args:
        corr_matrix: Square correlation DataFrame whose rows follow the same
            order as its columns.
        threshold: Minimum absolute correlation (exclusive) for a pair to be
            reported.

    Returns:
        DataFrame with columns Variable_1, Variable_2, Correlation and
        Abs_Correlation, sorted by Abs_Correlation in descending order. The
        frame is empty (but still has these columns) when no pair qualifies.
    """
    result_columns = ['Variable_1', 'Variable_2', 'Correlation', 'Abs_Correlation']
    labels = list(corr_matrix.columns)

    strong_corr = []
    for i, first_label in enumerate(labels):
        for j in range(i + 1, len(labels)):
            corr_val = corr_matrix.iloc[i, j]
            if abs(corr_val) > threshold and not np.isnan(corr_val):
                strong_corr.append({
                    'Variable_1': first_label,
                    'Variable_2': labels[j],
                    'Correlation': corr_val,
                    'Abs_Correlation': abs(corr_val)
                })

    # Passing the column names explicitly keeps 'Abs_Correlation' present even when
    # no pair qualified; without it, sorting an empty frame raises KeyError.
    strong_frame = pd.DataFrame(strong_corr, columns=result_columns)
    return strong_frame.sort_values('Abs_Correlation', ascending=False)

strong_correlations = find_strong_correlations(corr_pearson, threshold=0.7)

print("🔗 STRONG CORRELATIONS (|r| > 0.7):")
print("=" * 60)
if strong_correlations.empty:
    print("   No strong correlations found above threshold")
else:
    # Show the ten strongest pairs; names longer than 30 characters are cut and suffixed with '...'
    strongest_pairs = strong_correlations.head(10)
    pair_rows = zip(
        strongest_pairs['Variable_1'],
        strongest_pairs['Variable_2'],
        strongest_pairs['Correlation'],
    )
    for first_name, second_name, coefficient in pair_rows:
        var1 = first_name if len(first_name) <= 30 else first_name[:30] + '...'
        var2 = second_name if len(second_name) <= 30 else second_name[:30] + '...'
        print(f"   {var1} ↔ {var2}: {coefficient:.3f}")

## 4. Time Series Decomposition and Trend Analysis

In [None]:
# Seasonal decomposition of a single yearly metric
def analyze_time_series_components(df, target_col, year_col='year'):
    """
    Split one metric into observed, trend, seasonal and residual parts and plot them.

    Args:
        df: DataFrame holding the metric and the year column.
        target_col: Name of the metric column to decompose.
        year_col: Name of the column used as the series index.

    Returns:
        The object returned by ``seasonal_decompose``, or None when a column
        is missing, fewer than 4 complete rows are available, or the
        decomposition/plotting raises.
    """
    columns_present = target_col in df.columns and year_col in df.columns
    if not columns_present:
        print(f"❌ Columns not found: {target_col} or {year_col}")
        return None

    # Keep only complete (year, value) rows and index the values by year
    series = df[[year_col, target_col]].dropna().set_index(year_col)[target_col]
    n_points = len(series)

    if n_points < 4:
        print(f"⚠️ Not enough data points for decomposition: {n_points}")
        return None

    try:
        # Additive model; the period is capped at 4 and never exceeds half the series length
        decomposition = seasonal_decompose(series, model='additive', period=min(4, n_points // 2))

        fig, axes = plt.subplots(4, 1, figsize=(15, 12))
        pretty_name = target_col.replace("_", " ").title()

        # (component, panel title, extra plot options), top to bottom
        panels = (
            (decomposition.observed, f'Original - {pretty_name}', {}),
            (decomposition.trend, 'Trend Component', {'color': 'red'}),
            (decomposition.seasonal, 'Seasonal Component', {'color': 'green'}),
            (decomposition.resid, 'Residual Component', {'color': 'purple'}),
        )
        for axis, (component, panel_title, plot_options) in zip(axes, panels):
            component.plot(ax=axis, title=panel_title, **plot_options)
            axis.grid(True, alpha=0.3)

        plt.suptitle(f'Time Series Decomposition: {pretty_name}', 
                    fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()

        return decomposition

    except Exception as e:
        # Best effort: report the failure and let the caller skip this metric
        print(f"⚠️ Decomposition failed for {target_col}: {e}")
        return None

# Decompose each headline metric that is present in the market data
key_metrics = [
    'ai_software_revenue_in_billions',
    'global_ai_market_value_in_billions',
    'ai_adoption'
]

# Maps metric name -> decomposition result; metrics that could not be decomposed are left out
decompositions = {}
for metric in key_metrics:
    if metric not in market_df.columns:
        continue

    print(f"\n📈 Analyzing {metric.replace('_', ' ').title()}...")
    decomp = analyze_time_series_components(market_df, metric)
    if decomp is None:
        continue
    decompositions[metric] = decomp

## 5. Growth Rate and Momentum Analysis

In [None]:
# Growth-rate comparison: every column whose name contains 'growth'
growth_cols = [col for col in market_df.columns if 'growth' in col]

if not growth_cols:
    print("⚠️ No growth features found in the dataset")
else:
    # Overlay the first six growth columns on a single chart
    fig = go.Figure()
    palette = px.colors.qualitative.Set3

    for position, col in enumerate(growth_cols[:6]):
        growth_series = market_df[col]
        if growth_series.isna().all():
            continue  # nothing to draw for an entirely empty column
        fig.add_trace(
            go.Scatter(
                x=market_df['year'],
                y=growth_series,
                mode='lines+markers',
                name=col.replace('_', ' ').title(),
                # The colour follows the column's position, so skipped columns keep their slot
                line=dict(color=palette[position % len(palette)], width=2)
            )
        )

    fig.update_layout(
        title='Growth Rates Comparison Over Time',
        xaxis_title='Year',
        yaxis_title='Growth Rate (%)',
        height=600,
        hovermode='x unified'
    )

    fig.show()

    # Text summary for the first five growth columns
    print("📊 GROWTH RATE STATISTICS:")
    print("=" * 50)
    for col in growth_cols[:5]:
        growth_series = market_df[col]
        if growth_series.isna().all():
            continue

        mean_growth, std_growth = growth_series.mean(), growth_series.std()
        min_growth, max_growth = growth_series.min(), growth_series.max()

        # Display names are capped at 40 characters
        col_display = col.replace('_', ' ').title()[:40]
        print(f"📈 {col_display}:")
        print(f"   Mean: {mean_growth:.1f}%, Std: {std_growth:.1f}%")
        print(f"   Range: [{min_growth:.1f}%, {max_growth:.1f}%]")

## 6. Regional Analysis and Global Patterns

In [None]:
# Analyze global AI popularity patterns
def analyze_regional_patterns(df):
    """
    Rank countries by AI popularity, chart the top 15 and print the top 10.

    The country column is a column whose lower-cased name contains 'country';
    the popularity column is the first int64/float64 column whose lower-cased
    name contains 'popularity'. The two are detected independently, so the
    order in which they appear in ``df`` does not matter.

    Args:
        df: DataFrame expected to contain a country column and a numeric
            popularity column (column names must be strings).

    Returns:
        DataFrame with the country and popularity columns, restricted to rows
        where both are present and the country is not the empty string, or
        None when the columns cannot be found or no usable rows remain.
    """
    print("🌍 REGIONAL AI POPULARITY ANALYSIS")
    print("=" * 50)

    country_col = None
    popularity_col = None

    # Find relevant columns. Scanning continues until BOTH are known: stopping
    # as soon as the popularity column is seen would miss a country column
    # that is listed after it.
    for col in df.columns:
        lowered = col.lower()
        if 'country' in lowered:
            country_col = col
        elif (popularity_col is None and 'popularity' in lowered
              and df[col].dtype in ['int64', 'float64']):
            popularity_col = col

        if country_col is not None and popularity_col is not None:
            break

    if not (country_col and popularity_col):
        print("⚠️ Could not find country and popularity columns for regional analysis")
        return None

    # Clean data for analysis: drop incomplete rows and blank country names
    regional_data = df[[country_col, popularity_col]].dropna()
    regional_data = regional_data[regional_data[country_col] != '']

    if len(regional_data) == 0:
        # The columns exist, so report the real reason instead of "columns not found"
        print("⚠️ No usable rows for regional analysis after cleaning")
        return None

    # Top countries by AI popularity
    top_countries = regional_data.nlargest(15, popularity_col)

    # Horizontal bar chart, coloured by score
    fig = px.bar(
        top_countries,
        x=popularity_col,
        y=country_col,
        orientation='h',
        title='Top 15 Countries by AI Popularity',
        labels={popularity_col: 'AI Popularity Score', country_col: 'Country'},
        color=popularity_col,
        color_continuous_scale='viridis'
    )

    fig.update_layout(height=600, yaxis={'categoryorder': 'total ascending'})
    fig.show()

    print("\n🏆 TOP 10 COUNTRIES BY AI POPULARITY:")
    for rank, (_, row) in enumerate(top_countries.head(10).iterrows(), 1):
        country = row[country_col]
        score = row[popularity_col]
        print(f"{rank:2d}. {country}: {score}")

    return regional_data

# Perform regional analysis on the popularity dataset.
# regional_data is None when the function could not produce a regional table;
# the export step at the end of the notebook checks for that before saving it.
regional_data = analyze_regional_patterns(popularity_df)

## 7. Statistical Hypothesis Testing

In [None]:
# Statistical hypothesis testing
def perform_hypothesis_tests(df):
    """
    Run up to three significance tests on the market data and print each outcome.

    Each test is skipped when its columns are missing or fewer than 4 complete
    rows are available. Significance is judged at p < 0.05.

    Args:
        df: Market DataFrame.

    Returns:
        Dict with one entry per test that ran:
        'revenue_trend' (linear regression of AI software revenue on year),
        'adoption_revenue_corr' (Pearson correlation of adoption and revenue),
        'job_balance_test' (paired t-test of jobs created vs jobs eliminated).
    """
    print("🧪 STATISTICAL HYPOTHESIS TESTING")
    print("=" * 50)

    revenue_col = 'ai_software_revenue_in_billions'
    adoption_col = 'ai_adoption'
    created_col = 'estimated_new_jobs_created_by_ai_millions'
    eliminated_col = 'estimated_jobs_eliminated_by_ai_millions'
    alpha = 0.05
    min_rows = 4

    outcome = {}

    # Test 1: linear trend of AI revenue over the years
    if revenue_col in df.columns and 'year' in df.columns:
        trend_rows = df[['year', revenue_col]].dropna()
        if len(trend_rows) >= min_rows:
            fit = stats.linregress(trend_rows['year'], trend_rows[revenue_col])
            slope = fit.slope
            r_squared = fit.rvalue**2
            p_value = fit.pvalue
            trend_is_significant = p_value < alpha

            print(f"📈 AI Revenue Trend Test:")
            print(f"   Slope: {slope:.2f} billion/year")
            print(f"   R²: {r_squared:.3f}")
            print(f"   P-value: {p_value:.6f}")
            print(f"   Significant trend: {'Yes' if trend_is_significant else 'No'}")

            outcome['revenue_trend'] = {
                'slope': slope,
                'r_squared': r_squared,
                'p_value': p_value,
                'significant': trend_is_significant
            }

    # Test 2: Pearson correlation between adoption and revenue
    if adoption_col in df.columns and revenue_col in df.columns:
        paired_rows = df[[adoption_col, revenue_col]].dropna()
        if len(paired_rows) >= min_rows:
            corr_coef, p_value = pearsonr(paired_rows[adoption_col], paired_rows[revenue_col])
            corr_is_significant = p_value < alpha

            print(f"\n🔗 Adoption-Revenue Correlation Test:")
            print(f"   Correlation: {corr_coef:.3f}")
            print(f"   P-value: {p_value:.6f}")
            print(f"   Significant correlation: {'Yes' if corr_is_significant else 'No'}")

            outcome['adoption_revenue_corr'] = {
                'correlation': corr_coef,
                'p_value': p_value,
                'significant': corr_is_significant
            }

    # Test 3: paired t-test of jobs created against jobs eliminated
    if created_col in df.columns and eliminated_col in df.columns:
        job_rows = df[[created_col, eliminated_col]].dropna()
        if len(job_rows) >= min_rows:
            t_stat, p_value = stats.ttest_rel(job_rows[created_col], job_rows[eliminated_col])
            # Creation counts as higher only for a positive statistic AND a significant p-value
            creation_is_higher = t_stat > 0 and p_value < alpha

            print(f"\n👥 Job Creation vs Elimination Test:")
            print(f"   T-statistic: {t_stat:.3f}")
            print(f"   P-value: {p_value:.6f}")
            print(f"   Job creation > elimination: {'Yes' if creation_is_higher else 'Inconclusive'}")

            outcome['job_balance_test'] = {
                't_statistic': t_stat,
                'p_value': p_value,
                'jobs_creation_higher': creation_is_higher
            }

    return outcome

# Perform hypothesis tests on the market data.
# test_results maps a test name to its statistics; it is passed to the
# business-insights step and included in the analysis summary later on.
test_results = perform_hypothesis_tests(market_df)

# Only the tests that actually ran are counted
print(f"\n📋 Statistical tests completed: {len(test_results)} tests")

## 8. Interactive Visualization Dashboard

In [None]:
# Dashboard: headline numbers for the latest year plus a 2x2 grid of charts
def create_interactive_dashboard(df):
    """
    Print the latest-year key metrics and show a four-panel interactive figure.

    The panels are: revenue vs market value (dual y-axis), up to three numeric
    adoption columns, net job impact as coloured bars, and the year-over-year
    revenue growth rate. A panel stays empty when its column is missing.

    Args:
        df: Market DataFrame; the charts read the 'year' column for the x-axis.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Key metrics cards (simulated with printed lines)
    if 'year' in df.columns:
        latest_year = df['year'].max()
        latest_data = df[df['year'] == latest_year].iloc[0]

        print(f"📊 AI MARKET DASHBOARD - {int(latest_year)}")
        print("=" * 60)

        # display name -> (source column, unit suffix)
        metrics = {
            'AI Software Revenue': ('ai_software_revenue_in_billions', 'B'),
            'Market Value': ('global_ai_market_value_in_billions', 'B'),
            'Adoption Rate': ('ai_adoption', '%'),
            'Organizations Using AI': ('organizations_using_ai', '%'),
            'Expected Revenue Increase': ('estimated_revenue_increase_from_ai_trillions_usd', 'T')
        }

        for metric_name, (col_name, unit) in metrics.items():
            if col_name not in df.columns:
                continue
            value = latest_data[col_name]
            if pd.isna(value):
                continue
            # Monetary units (billions/trillions) get a dollar sign; everything else is a plain figure
            if unit in ('T', 'B'):
                print(f"💰 {metric_name}: ${value:.1f}{unit}")
            else:
                print(f"📊 {metric_name}: {value:.1f}{unit}")

    def line_trace(column, label, line=None):
        """Build a lines+markers trace of ``column`` against year."""
        options = dict(x=df['year'], y=df[column], mode='lines+markers', name=label)
        if line is not None:
            options['line'] = line
        return go.Scatter(**options)

    # Only the top-left panel needs a secondary y-axis
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Revenue vs Market Value', 'Adoption Trends', 
                       'Job Market Impact', 'Growth Momentum'),
        specs=[[{"secondary_y": True}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Plot 1: Revenue on the primary axis, market value on the secondary axis
    if 'ai_software_revenue_in_billions' in df.columns:
        fig.add_trace(
            line_trace('ai_software_revenue_in_billions', 'Software Revenue',
                       line=dict(color='blue', width=3)),
            row=1, col=1
        )

    if 'global_ai_market_value_in_billions' in df.columns:
        fig.add_trace(
            line_trace('global_ai_market_value_in_billions', 'Market Value',
                       line=dict(color='red', width=3)),
            row=1, col=1, secondary_y=True
        )

    # Plot 2: Adoption trends (first three numeric columns mentioning 'adoption')
    adoption_cols = [
        col for col in df.columns
        if 'adoption' in col.lower() and df[col].dtype in ['int64', 'float64']
    ]
    for col in adoption_cols[:3]:
        fig.add_trace(line_trace(col, col.replace('_', ' ').title()), row=1, col=2)

    # Plot 3: Job impact, red bars for negative values and green otherwise
    if 'net_job_impact' in df.columns:
        bar_colours = ['red' if x < 0 else 'green' for x in df['net_job_impact']]
        fig.add_trace(
            go.Bar(x=df['year'], y=df['net_job_impact'],
                  name='Net Job Impact',
                  marker_color=bar_colours),
            row=2, col=1
        )

    # Plot 4: Growth momentum
    if 'ai_software_revenue_in_billions_yoy_growth' in df.columns:
        fig.add_trace(
            line_trace('ai_software_revenue_in_billions_yoy_growth', 'Revenue Growth Rate',
                       line=dict(color='purple', width=3)),
            row=2, col=2
        )

    fig.update_layout(height=800, title_text="AI Market Interactive Dashboard")
    fig.show()

# Create the dashboard from the engineered market data
# (prints the latest-year metrics, then shows the four-panel figure)
create_interactive_dashboard(market_df)

## 9. Market Anomaly Detection

In [None]:
# Detect anomalies in market data
def _anomaly_labels(df, source_rows, positions):
    """Return the 'year' of each anomaly, or the raw positions when df has no 'year' column."""
    if 'year' not in df.columns:
        return positions
    # positions index the non-null observations; source_rows maps them back to rows of df
    return df['year'].to_numpy()[source_rows[positions]]


def detect_market_anomalies(df, target_cols):
    """
    Flag outliers in each target column with a z-score rule and an IQR rule.

    A value is a z-score anomaly when its absolute z-score exceeds 2, and an
    IQR anomaly when it lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. Columns
    that are missing from ``df`` or have 3 or fewer non-null values are
    skipped. Detected anomalies are printed, labelled by year when ``df`` has
    a 'year' column.

    Args:
        df: DataFrame holding the metrics (and optionally a 'year' column).
        target_cols: Iterable of column names to check.

    Returns:
        Dict mapping each analysed column to a dict with keys
        'z_score_anomalies' and 'iqr_anomalies' (lists of positions within the
        column's non-null values), 'z_score_threshold' and 'iqr_bounds'.
    """
    anomalies = {}

    for col in target_cols:
        if col not in df.columns:
            continue

        data = df[col].dropna()
        if len(data) <= 3:
            continue

        # Row positions in df of the non-null observations. Anomaly positions are
        # relative to `data`, so they must be translated through this array before
        # looking up years in df; indexing df directly reports the wrong rows
        # whenever the column contains missing values.
        source_rows = np.flatnonzero(df[col].notna().to_numpy())

        # Z-score method
        z_scores = np.abs(stats.zscore(data))
        z_anomalies = np.where(z_scores > 2)[0]  # 2 standard deviations

        # IQR method
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        iqr_anomalies = np.where((data < lower_bound) | (data > upper_bound))[0]

        anomalies[col] = {
            'z_score_anomalies': z_anomalies.tolist(),
            'iqr_anomalies': iqr_anomalies.tolist(),
            'z_score_threshold': 2,
            'iqr_bounds': (lower_bound, upper_bound)
        }

        if len(z_anomalies) > 0 or len(iqr_anomalies) > 0:
            print(f"\n⚠️ Anomalies detected in {col.replace('_', ' ').title()}:")
            if len(z_anomalies) > 0:
                years = _anomaly_labels(df, source_rows, z_anomalies)
                print(f"   Z-score anomalies: {years}")
            if len(iqr_anomalies) > 0:
                years = _anomaly_labels(df, source_rows, iqr_anomalies)
                print(f"   IQR anomalies: {years}")

    return anomalies

# Detect anomalies in key metrics
key_metrics = [
    'ai_software_revenue_in_billions',
    'global_ai_market_value_in_billions',
    'ai_adoption'
]

anomaly_results = detect_market_anomalies(market_df, key_metrics)

# Every analysed metric gets a non-empty result dict, so testing the dicts'
# truthiness says nothing about anomalies; inspect the anomaly lists instead.
metrics_with_anomalies = [
    metric for metric, found in anomaly_results.items()
    if found.get('z_score_anomalies') or found.get('iqr_anomalies')
]

if not metrics_with_anomalies:
    print("✅ No significant anomalies detected in key metrics")
else:
    print(f"\n📊 Anomaly detection completed for {len(anomaly_results)} metrics")

## 10. Business Insights and Recommendations

In [None]:
# Generate business insights
def generate_business_insights(df, test_results, anomaly_results):
    """
    Turn the statistical results into printed insights and recommendations.

    Args:
        df: Market DataFrame; 'ai_adoption' and 'market_efficiency_ratio' are
            used when present.
        test_results: Dict of test outcomes; the 'revenue_trend' and
            'job_balance_test' entries are used when present.
        anomaly_results: Accepted for interface compatibility; not used by
            this function.

    Returns:
        Tuple ``(insights, recommendations)``: two lists of strings. The
        recommendations are a fixed list; the insights depend on the data.
    """
    print("💡 BUSINESS INSIGHTS & RECOMMENDATIONS")
    print("=" * 60)

    insights = []

    # Market growth insights: only for a positive and highly linear (R² > 0.8) trend
    if 'revenue_trend' in test_results:
        slope = test_results['revenue_trend']['slope']
        r_squared = test_results['revenue_trend']['r_squared']

        if slope > 0 and r_squared > 0.8:
            insights.append(f"🚀 Strong Revenue Growth: AI software revenue is growing at ${slope:.1f}B/year with {r_squared:.1%} consistency")
            insights.append("💼 Investment Opportunity: Predictable growth pattern suggests stable investment returns")

    # Adoption insights: use the last non-missing value, so an empty column or a
    # trailing NaN neither raises IndexError nor produces a "nan%" insight
    if 'ai_adoption' in df.columns:
        adoption_values = df['ai_adoption'].dropna()
        if not adoption_values.empty:
            latest_adoption = adoption_values.iloc[-1]
            if latest_adoption > 50:
                insights.append(f"📈 Market Maturity: {latest_adoption:.0f}% adoption indicates mainstream acceptance")
            elif latest_adoption > 30:
                insights.append(f"⚡ Growth Phase: {latest_adoption:.0f}% adoption shows market is in rapid expansion")
            else:
                insights.append(f"🌱 Early Stage: {latest_adoption:.0f}% adoption indicates significant growth potential")

    # Job market insights
    if 'job_balance_test' in test_results:
        if test_results['job_balance_test']['jobs_creation_higher']:
            insights.append("👥 Positive Job Impact: AI creates more jobs than it eliminates (statistically significant)")
        else:
            insights.append("⚖️ Job Market Transition: Mixed impact on employment requires careful monitoring")

    # Market efficiency insights
    if 'market_efficiency_ratio' in df.columns:
        avg_efficiency = df['market_efficiency_ratio'].mean()
        if avg_efficiency < 0.3:
            insights.append(f"💎 High Growth Potential: Low revenue-to-market ratio ({avg_efficiency:.2f}) suggests unrealized value")

    # Display insights
    for i, insight in enumerate(insights, 1):
        print(f"{i:2d}. {insight}")

    # Generate recommendations
    print("\n🎯 STRATEGIC RECOMMENDATIONS:")
    print("=" * 40)

    recommendations = [
        "🏢 For Businesses: Accelerate AI adoption to gain competitive advantage",
        "💰 For Investors: Consider AI software companies with strong revenue growth",
        "👨‍💼 For Workforce: Invest in AI-complementary skills and reskilling programs",
        "🏛️ For Policymakers: Support transition programs for displaced workers",
        "🔬 For Researchers: Focus on ethical AI and human-AI collaboration"
    ]

    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. {rec}")

    return insights, recommendations

# Generate insights from the market data and the earlier test/anomaly results.
# Both returned lists are written to business_insights.csv by the export step.
insights, recommendations = generate_business_insights(market_df, test_results, anomaly_results)

## 11. Export Analysis Results

In [None]:
# Save EDA results and insights
import json

results_dir = Path('../results')
# parents=True so a missing intermediate directory does not abort the export
results_dir.mkdir(parents=True, exist_ok=True)

# Create comprehensive analysis summary
analysis_summary = {
    'analysis_timestamp': pd.Timestamp.now().isoformat(),
    'dataset_info': {
        'market_data_shape': market_df.shape,
        'popularity_data_shape': popularity_df.shape,
        'total_features_analyzed': len(market_df.columns)
    },
    'statistical_tests': test_results,
    # Per metric: total number of flagged points across both detection methods
    'anomaly_detection': {k: len(v.get('z_score_anomalies', [])) + len(v.get('iqr_anomalies', []))
                         for k, v in anomaly_results.items()},
    'business_insights': insights,
    'recommendations': recommendations
}


def _json_default(value):
    """Convert NumPy scalars (e.g. np.bool_, np.int64) to built-in Python values for JSON."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Persist the summary; previously it was assembled but never written anywhere
summary_path = results_dir / 'analysis_summary.json'
summary_path.write_text(
    json.dumps(analysis_summary, indent=2, ensure_ascii=False, default=_json_default),
    encoding='utf-8',
)
print("💾 Saved: analysis_summary.json")

# Save correlation matrices (guarded in case the correlation cell was not run)
if 'corr_pearson' in locals():
    corr_pearson.to_csv(results_dir / 'correlation_matrix_pearson.csv')
    print("💾 Saved: correlation_matrix_pearson.csv")

if 'corr_spearman' in locals():
    corr_spearman.to_csv(results_dir / 'correlation_matrix_spearman.csv')
    print("💾 Saved: correlation_matrix_spearman.csv")

# Save strong correlations
if 'strong_correlations' in locals() and not strong_correlations.empty:
    strong_correlations.to_csv(results_dir / 'strong_correlations.csv', index=False)
    print("💾 Saved: strong_correlations.csv")

# Save regional analysis
if 'regional_data' in locals() and regional_data is not None:
    regional_data.to_csv(results_dir / 'regional_ai_popularity.csv', index=False)
    print("💾 Saved: regional_ai_popularity.csv")

# Save insights summary: one row per insight/recommendation, tagged by type
insights_df = pd.DataFrame({
    'type': ['insight'] * len(insights) + ['recommendation'] * len(recommendations),
    'content': insights + recommendations,
    'timestamp': pd.Timestamp.now().isoformat()
})
insights_df.to_csv(results_dir / 'business_insights.csv', index=False)

print("💾 Saved: business_insights.csv")
print(f"\n📊 EDA Analysis Summary:")
print(f"   ✅ {len(insights)} business insights generated")
print(f"   ✅ {len(recommendations)} strategic recommendations")
print(f"   ✅ {len(test_results)} statistical tests performed")
print(f"   ✅ {len(anomaly_results)} metrics analyzed for anomalies")

print("\n🚀 NEXT STEPS:")
print("   1. Move to 05_model_development.ipynb for ML modeling")
print("   2. Use insights to guide model selection and feature engineering")
print("   3. Focus on high-correlation features for predictions")

print("\n✅ EDA & Visualization Phase Complete!")