In [2]:
import html
import warnings
from pathlib import Path
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')

In [3]:
# --- Input / output locations -------------------------------------------
INPUT_FILE = "../ml/moretoo/Ecommerce/train/cleaned_Customers.csv"
OUTPUT_FOLDER = "../ml/graph_output/"

# --- Plot tuning knobs --------------------------------------------------
# Frames with more rows than this are down-sampled before plotting.
SAMPLE_SIZE = 10_000
# Upper bound on the number of categories drawn per categorical chart.
MAX_CATEGORIES = 20
# Plotly theme name; alternatives: plotly, plotly_white, plotly_dark, ggplot2.
PLOTLY_TEMPLATE = "plotly_white"

# Charts are written under OUTPUT_FOLDER, so make sure it exists up front.
Path(OUTPUT_FOLDER).mkdir(exist_ok=True, parents=True)

print(" AUTOMATIC GRAPH GENERATOR", "=" * 70, sep="\n")

 AUTOMATIC GRAPH GENERATOR


In [4]:
print(" Loading data...")
df = pd.read_csv(INPUT_FILE)

# Plot from a random sample when the frame is larger than SAMPLE_SIZE; the
# fixed random_state keeps the sampled rows identical across re-runs.
if len(df) > SAMPLE_SIZE:
    print(f"⚠️  Dataset has {len(df):,} rows. Sampling {SAMPLE_SIZE:,} for faster visualization...")
    df_viz = df.sample(n=SAMPLE_SIZE, random_state=42)
else:
    df_viz = df.copy()

print(f" Loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f" Visualizing: {df_viz.shape[0]:,} rows")

# Identify column types.
# 'number' covers every integer/float width (and pandas nullable numerics),
# not just int64/float64, so narrower or down-cast columns are still plotted.
numeric_cols = df_viz.select_dtypes(include=['number']).columns.tolist()
# Categorical and boolean columns are plotted the same way as free-text ones.
categorical_cols = df_viz.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
# Both tz-naive and tz-aware datetimes count as datetime columns.
# NOTE(review): read_csv is called without parse_dates, so date-like text
# columns arrive as object dtype and land in categorical_cols instead.
datetime_cols = df_viz.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()

print(f" Numeric columns: {len(numeric_cols)}")
print(f" Categorical columns: {len(categorical_cols)}")
print(f" Datetime columns: {len(datetime_cols)}")


 Loading data...
⚠️  Dataset has 89,316 rows. Sampling 10,000 for faster visualization...
 Loaded: 89,316 rows × 4 columns
 Visualizing: 10,000 rows
 Numeric columns: 1
 Categorical columns: 3
 Datetime columns: 0


In [5]:
def generate_profiling_report(data=None, output_folder=None, title="Company Data Analysis Report"):
    """Write a ydata-profiling HTML report for a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to profile. Defaults to the notebook-level ``df_viz``.
    output_folder : str or Path, optional
        Directory that receives ``data_profile_report.html``. Defaults to the
        notebook-level ``OUTPUT_FOLDER``.
    title : str
        Title passed to ``ProfileReport``.

    Returns
    -------
    bool
        True when the report was written, False when ydata-profiling cannot
        be imported.
    """
    # ydata-profiling is an optional dependency, hence the local import. The
    # try body is limited to the import so that an ImportError raised later,
    # while building or rendering the report, is not misreported as
    # "not installed" and silently swallowed.
    try:
        from ydata_profiling import ProfileReport
    except ImportError:
        print("\n⚠️  ydata-profiling not installed. Skipping profiling report.")
        print("   Install with: pip install ydata-profiling")
        return False

    # Resolve the notebook-level defaults only once they are actually needed.
    if data is None:
        data = df_viz
    if output_folder is None:
        output_folder = OUTPUT_FOLDER

    print("\n🔍 Generating comprehensive profiling report...")
    profile = ProfileReport(
        data,
        title=title,
        explorative=True,
        minimal=False
    )

    # Join with Path so the result does not depend on a trailing slash.
    report_path = Path(output_folder) / "data_profile_report.html"
    profile.to_file(report_path)
    print(f"✅ Profiling report saved: {report_path}")
    print("   📖 Open this HTML file to see ALL automatic visualizations!")
    return True


generate_profiling_report()



🔍 Generating comprehensive profiling report...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:00<00:00, 78.15it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Profiling report saved: ../ml/graph_output/data_profile_report.html
   📖 Open this HTML file to see ALL automatic visualizations!


True

In [6]:
print("\n" + "=" * 70)
print("GENERATING OVERVIEW VISUALIZATIONS")
print("=" * 70)

# 3.1 Dataset Overview: three headline numbers side by side, computed on the
# full frame (df) rather than the plotting sample.
fig = go.Figure()
fig.add_trace(go.Indicator(
    # Plain "number": no delta reference is supplied, so "number+delta" would
    # only add a meaningless zero delta under the row count.
    mode="number",
    value=len(df),
    title={"text": "Total Rows"},
    domain={'x': [0, 0.33], 'y': [0.5, 1]}
))
fig.add_trace(go.Indicator(
    mode="number",
    value=len(df.columns),
    title={"text": "Total Columns"},
    domain={'x': [0.33, 0.66], 'y': [0.5, 1]}
))
fig.add_trace(go.Indicator(
    mode="number",
    # deep=True also measures the contents of object columns; bytes -> MB.
    value=df.memory_usage(deep=True).sum() / 1024**2,
    title={"text": "Size (MB)"},
    number={'suffix': " MB"},
    domain={'x': [0.66, 1], 'y': [0.5, 1]}
))
# Use the configured template like the other figures in this notebook.
fig.update_layout(title="Dataset Overview", height=300, template=PLOTLY_TEMPLATE)
fig.write_html(f"{OUTPUT_FOLDER}01_overview.html")
print(" 01_overview.html")

# 3.2 Missing data: percentage of nulls per column, measured on the sample.
missing_data = df_viz.isnull().sum()
missing_pct = (missing_data / len(df_viz) * 100).sort_values(ascending=False)

# The chart is only written when at least one value is missing.
if missing_pct.sum() > 0:
    worst_missing = missing_pct.head(20)
    fig = px.bar(
        x=worst_missing.index,
        y=worst_missing.values,
        title="Top 20 Columns by Missing Data %",
        labels={'x': 'Column', 'y': 'Missing %'},
        template=PLOTLY_TEMPLATE
    )
    fig.update_layout(xaxis_tickangle=-45)
    fig.write_html(f"{OUTPUT_FOLDER}02_missing_data.html")
    print(" 02_missing_data.html")



GENERATING OVERVIEW VISUALIZATIONS
 01_overview.html


In [7]:
print(" Generating numeric visualizations...")


def _numeric_summary_figure(values, col_name):
    """Build the 2x2 summary figure for one numeric column.

    The panels are a 50-bin histogram, a box plot, a violin plot with an inner
    box, and a table of ``values.describe()`` rounded to two decimals.
    """
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            f'{col_name} - Distribution',
            f'{col_name} - Box Plot',
            f'{col_name} - Violin Plot',
            f'{col_name} - Statistics'
        ),
        specs=[[{"type": "histogram"}, {"type": "box"}],
               [{"type": "violin"}, {"type": "table"}]]
    )

    fig.add_trace(go.Histogram(x=values, name='Distribution', nbinsx=50), row=1, col=1)
    fig.add_trace(go.Box(y=values, name='Box'), row=1, col=2)
    fig.add_trace(go.Violin(y=values, name='Violin', box_visible=True), row=2, col=1)

    stats = values.describe()
    fig.add_trace(
        go.Table(
            header=dict(values=['Statistic', 'Value']),
            cells=dict(values=[stats.index, stats.values.round(2)])
        ),
        row=2, col=2
    )

    fig.update_layout(
        title=f"Numeric Analysis: {col_name}",
        showlegend=False,
        height=800,
        template=PLOTLY_TEMPLATE
    )
    return fig


# 4.1 One summary page per numeric column. File names keep the column's
# position (idx) even when earlier columns are skipped.
numeric_written = 0
for idx, col in enumerate(numeric_cols, 1):
    try:
        # Skip if all nulls
        if df_viz[col].isnull().all():
            continue

        fig = _numeric_summary_figure(df_viz[col], col)
        fig.write_html(f"{OUTPUT_FOLDER}numeric_{idx:02d}_{col[:30]}.html")
        numeric_written += 1

        # Only the first three written files are listed individually.
        if numeric_written <= 3:
            print(f"✅ numeric_{idx:02d}_{col[:30]}.html")

    except Exception as e:
        # Best effort: one bad column must not stop the remaining ones.
        print(f"⚠️  Skipped {col}: {str(e)}")

# Summarise after the loop and count written files only, so the line is not
# lost when the last column is skipped and never counts skipped columns.
if numeric_written > 3:
    print(f"... and {numeric_written - 3} more numeric visualizations")

# 4.2 Correlation Heatmap
if len(numeric_cols) > 1:
    corr_matrix = df_viz[numeric_cols].corr()

    fig = px.imshow(
        corr_matrix,
        text_auto='.2f',
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title="Correlation Heatmap - All Numeric Columns",
        template=PLOTLY_TEMPLATE
    )
    fig.write_html(f"{OUTPUT_FOLDER}correlation_heatmap.html")
    print("✅ correlation_heatmap.html")

# 4.3 Pairwise Scatter Matrix (first 5 numeric columns)
if len(numeric_cols) >= 2:
    scatter_cols = numeric_cols[:5]
    fig = px.scatter_matrix(
        df_viz,
        dimensions=scatter_cols,
        title=f"Scatter Matrix: Top {len(scatter_cols)} Numeric Columns",
        template=PLOTLY_TEMPLATE
    )
    fig.update_traces(diagonal_visible=False)
    fig.write_html(f"{OUTPUT_FOLDER}scatter_matrix.html")
    print(" scatter_matrix.html")


 Generating numeric visualizations...
✅ numeric_01_customer_zip_code_prefix.html


In [8]:
print(" Generating categorical visualizations...")

# One bar chart and one pie chart per categorical column. File names keep the
# column's position (idx) even when earlier columns are skipped.
categorical_written = 0
for idx, col in enumerate(categorical_cols, 1):
    try:
        # Skip if all nulls
        if df_viz[col].isnull().all():
            continue

        # value_counts() sorts by frequency, so head() keeps the most common.
        all_counts = df_viz[col].value_counts()
        value_counts = all_counts.head(MAX_CATEGORIES)
        # Label with the number of categories actually drawn; it can be
        # smaller than MAX_CATEGORIES.
        shown = len(value_counts)

        # 5.1 Bar Chart
        fig = px.bar(
            x=value_counts.index,
            y=value_counts.values,
            title=f"Categorical Distribution: {col} (Top {shown})",
            labels={'x': col, 'y': 'Count'},
            template=PLOTLY_TEMPLATE
        )
        fig.update_layout(xaxis_tickangle=-45)
        fig.write_html(f"{OUTPUT_FOLDER}categorical_{idx:02d}_{col[:30]}_bar.html")

        # 5.2 Pie Chart
        # A pie shows shares of a whole. Without the tail, the top categories
        # would appear to sum to 100%, so the remainder becomes one "Other"
        # slice.
        pie_counts = value_counts
        remainder = int(all_counts.iloc[MAX_CATEGORIES:].sum())
        if remainder > 0:
            pie_counts = pd.concat([value_counts, pd.Series({'Other': remainder})])

        fig = px.pie(
            names=pie_counts.index,
            values=pie_counts.values,
            title=f"Categorical Proportion: {col} (Top {shown})",
            template=PLOTLY_TEMPLATE
        )
        fig.write_html(f"{OUTPUT_FOLDER}categorical_{idx:02d}_{col[:30]}_pie.html")
        categorical_written += 1

        # Only the first three written columns are listed individually.
        if categorical_written <= 3:
            print(f"✅ categorical_{idx:02d}_{col[:30]}_bar.html & _pie.html")

    except Exception as e:
        # Best effort: one bad column must not stop the remaining ones.
        print(f" Skipped {col}: {str(e)}")

# Summarise after the loop and count written columns only, so the line is not
# lost when the last column is skipped and never counts skipped columns.
if categorical_written > 3:
    print(f"... and {categorical_written - 3} more categorical visualizations")


 Generating categorical visualizations...
✅ categorical_01_customer_id_bar.html & _pie.html
✅ categorical_02_customer_city_bar.html & _pie.html
✅ categorical_03_customer_state_bar.html & _pie.html


In [9]:
print(" Generating relationship visualizations...")

# 6.1 Numeric vs Numeric (scatter plots)
# Each of the first three numeric columns is paired with the column that
# follows it, giving at most three plots.
if len(numeric_cols) >= 2:
    numeric_pairs_written = 0
    for col1, col2 in zip(numeric_cols[:3], numeric_cols[1:]):
        try:
            fig = px.scatter(
                df_viz,
                x=col1,
                y=col2,
                title=f"Relationship: {col1} vs {col2}",
                # The OLS trendline needs statsmodels. If it is missing, the
                # pair is reported as skipped instead of aborting the cell.
                trendline="ols",
                template=PLOTLY_TEMPLATE
            )
            fig.write_html(f"{OUTPUT_FOLDER}relationship_num_{col1[:20]}_vs_{col2[:20]}.html")
            numeric_pairs_written += 1
        except Exception as e:
            # Same best-effort handling as the per-column cells.
            print(f" Skipped {col1} vs {col2}: {str(e)}")

    print(f" Generated {numeric_pairs_written} numeric relationship plots")

# 6.2 Categorical vs Numeric (box plots)
if len(categorical_cols) >= 1 and len(numeric_cols) >= 1:
    for cat_col in categorical_cols[:2]:
        # Keep only the 10 most frequent categories. This depends on cat_col
        # alone, so it is computed once per categorical column.
        top_cats = df_viz[cat_col].value_counts().head(10).index
        df_filtered = df_viz[df_viz[cat_col].isin(top_cats)]

        for num_col in numeric_cols[:2]:
            try:
                fig = px.box(
                    df_filtered,
                    x=cat_col,
                    y=num_col,
                    title=f"{num_col} by {cat_col} (Top 10 categories)",
                    template=PLOTLY_TEMPLATE
                )
                fig.update_layout(xaxis_tickangle=-45)
                fig.write_html(f"{OUTPUT_FOLDER}relationship_cat_{cat_col[:20]}_vs_{num_col[:20]}.html")
            except Exception as e:
                print(f" Skipped {num_col} by {cat_col}: {str(e)}")

    print(" Generated categorical vs numeric relationship plots")


 Generating relationship visualizations...
 Generated categorical vs numeric relationship plots


In [10]:
print(" Generating advanced visualizations...")

# 7.1 Data Quality Dashboard
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Column Type Distribution',
        'Missing Data by Column Type',
        'Data Density',
        'Memory Usage by Column Type'
    ),
    specs=[[{"type": "pie"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "pie"}]]
)

# One mapping drives all three per-type panels so they always report the same
# set of types.
columns_by_type = {
    'Numeric': numeric_cols,
    'Categorical': categorical_cols,
    'Datetime': datetime_cols,
}

# Column types
type_counts = {kind: len(cols) for kind, cols in columns_by_type.items()}
fig.add_trace(
    go.Pie(labels=list(type_counts.keys()), values=list(type_counts.values())),
    row=1, col=1
)

# Missing data by type (count of null cells)
missing_by_type = {
    kind: int(df_viz[cols].isnull().sum().sum()) if cols else 0
    for kind, cols in columns_by_type.items()
}
fig.add_trace(
    go.Bar(x=list(missing_by_type.keys()), y=list(missing_by_type.values())),
    row=1, col=2
)

# Data density (non-null percentage). An empty frame has no cells, so the
# division is guarded and the density reported as 0.
total_cells = len(df_viz) * len(df_viz.columns)
missing_cells = df_viz.isnull().sum().sum()
density = (total_cells - missing_cells) / total_cells * 100 if total_cells else 0.0
fig.add_trace(
    go.Bar(x=['Data Density'], y=[density], text=[f'{density:.1f}%']),
    row=2, col=1
)

# Memory usage in MB. index=False keeps the shared row index out of every
# group; otherwise its bytes would be added to each type's total.
memory_by_type = {
    kind: df_viz[cols].memory_usage(index=False, deep=True).sum() / 1024**2 if cols else 0
    for kind, cols in columns_by_type.items()
}
fig.add_trace(
    go.Pie(labels=list(memory_by_type.keys()), values=list(memory_by_type.values())),
    row=2, col=2
)

# Use the configured template like the other figures in this notebook.
fig.update_layout(
    title="Data Quality Dashboard",
    height=800,
    showlegend=False,
    template=PLOTLY_TEMPLATE
)
fig.write_html(f"{OUTPUT_FOLDER}data_quality_dashboard.html")
print("✅ data_quality_dashboard.html")

 Generating advanced visualizations...
✅ data_quality_dashboard.html


In [11]:
print(" Generating statistical visualizations...")

# Both plots compare numeric columns with each other, so at least two are needed.
if len(numeric_cols) >= 2:
    # Pair plot for first 4 numeric columns
    plot_cols = numeric_cols[:4]

    # sns.pairplot creates its own figure and returns the grid that owns it.
    # Calling plt.figure() first would only leave an unused figure open, so
    # the grid's own handle is used to save and close.
    pair_grid = sns.pairplot(df_viz[plot_cols].dropna(), diag_kind='kde', corner=True)
    pair_grid.savefig(f"{OUTPUT_FOLDER}seaborn_pairplot.png", dpi=300, bbox_inches='tight')
    plt.close(pair_grid.figure)
    print("✅ seaborn_pairplot.png")

    # Correlation heatmap with seaborn, drawn on an explicit figure/axes pair
    # so that exactly this figure is saved and closed.
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        df_viz[numeric_cols].corr(),
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        ax=heatmap_ax
    )
    heatmap_ax.set_title('Correlation Heatmap (Seaborn)')
    heatmap_fig.savefig(f"{OUTPUT_FOLDER}seaborn_correlation.png", dpi=300, bbox_inches='tight')
    plt.close(heatmap_fig)
    print("✅ seaborn_correlation.png")


 Generating statistical visualizations...


In [12]:
print(" Generating summary report...")

output_dir = Path(OUTPUT_FOLDER)

# Shown in a section whose charts were all skipped for this dataset.
NO_CHARTS_ITEM = '<li>No charts were generated for this section.</li>'


def _link(filename, label):
    """Return an ``<a>`` tag for ``filename``, or '' when that file is not in the output folder.

    The href is URL-quoted and the label HTML-escaped, because both can
    contain text taken from column names.
    """
    if not (output_dir / filename).is_file():
        return ''
    return f'<a href="{quote(filename)}">{html.escape(label)}</a>'


def _list_items(entries):
    """Render (filename, label) pairs as ``<li>`` links, dropping entries whose file is missing."""
    anchors = (_link(filename, label) for filename, label in entries)
    return ''.join(f'<li>{anchor}</li>' for anchor in anchors if anchor)


def _categorical_item(position, col):
    """Render one ``<li>`` with the bar and pie links of a categorical column ('' if neither exists)."""
    stem = f"categorical_{position:02d}_{col[:30]}"
    anchors = [
        anchor
        for anchor in (_link(f"{stem}_bar.html", f"{col} (Bar)"), _link(f"{stem}_pie.html", "(Pie)"))
        if anchor
    ]
    return f"<li>{' | '.join(anchors)}</li>" if anchors else ''


# Only files that exist are linked: several charts are conditional (missing
# data, correlation, scatter matrix, seaborn plots) and may not have been
# written. NOTE: files left over from an earlier run are linked as well.
overview_items = _list_items([
    ("01_overview.html", "Dataset Overview"),
    ("02_missing_data.html", "Missing Data Analysis"),
    ("data_quality_dashboard.html", "Data Quality Dashboard"),
    ("data_profile_report.html", "Full Profiling Report"),
])

# File names mirror the numeric / categorical cells: 1-based position plus
# the first 30 characters of the column name. At most 20 columns are listed.
numeric_items = _list_items(
    (f"numeric_{position:02d}_{col[:30]}.html", col)
    for position, col in enumerate(numeric_cols[:20], 1)
)
numeric_more = f'<li>... and {len(numeric_cols) - 20} more</li>' if len(numeric_cols) > 20 else ''

numeric_extra_items = _list_items([
    ("correlation_heatmap.html", "Correlation Heatmap"),
    ("scatter_matrix.html", "Scatter Matrix"),
    ("seaborn_correlation.png", "Seaborn Correlation"),
    ("seaborn_pairplot.png", "Seaborn Pair Plot"),
])
numeric_extra_block = f'<ul>{numeric_extra_items}</ul>' if numeric_extra_items else ''

categorical_items = ''.join(
    _categorical_item(position, col)
    for position, col in enumerate(categorical_cols[:20], 1)
)
categorical_more = f'<li>... and {len(categorical_cols) - 20} more</li>' if len(categorical_cols) > 20 else ''

# Relationship plots are found on disk rather than re-derived from column names.
relationship_items = _list_items(
    (path.name, path.stem) for path in sorted(output_dir.glob('relationship_*.html'))
)

# Count the chart files actually present (HTML and PNG), not the columns.
# The index page itself is excluded so a re-run reports the same number.
generated_count = sum(
    1
    for pattern in ('*.html', '*.png')
    for path in output_dir.glob(pattern)
    if path.name != 'index.html'
)

summary_html = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Data Visualization Summary</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; }}
        .container {{ background: white; padding: 30px; border-radius: 10px; }}
        h1 {{ color: #2c3e50; }}
        h2 {{ color: #3498db; border-bottom: 2px solid #3498db; padding-bottom: 10px; }}
        .stats {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 20px; margin: 20px 0; }}
        .stat-box {{ background: #ecf0f1; padding: 20px; border-radius: 5px; text-align: center; }}
        .stat-value {{ font-size: 2em; font-weight: bold; color: #2c3e50; }}
        .stat-label {{ color: #7f8c8d; margin-top: 10px; }}
        ul {{ line-height: 2; }}
        a {{ color: #3498db; text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}
    </style>
</head>
<body>
    <div class="container">
        <h1> Automated Data Visualization Report</h1>
        
        <div class="stats">
            <div class="stat-box">
                <div class="stat-value">{len(df):,}</div>
                <div class="stat-label">Total Rows</div>
            </div>
            <div class="stat-box">
                <div class="stat-value">{len(df.columns)}</div>
                <div class="stat-label">Total Columns</div>
            </div>
            <div class="stat-box">
                <div class="stat-value">{generated_count}</div>
                <div class="stat-label">Visualizations Generated</div>
            </div>
        </div>
        
        <h2> Overview</h2>
        <ul>
            {overview_items or NO_CHARTS_ITEM}
        </ul>
        
        <h2> Numeric Columns ({len(numeric_cols)} columns)</h2>
        <ul>
            {numeric_items or NO_CHARTS_ITEM}
            {numeric_more}
        </ul>
        {numeric_extra_block}
        
        <h2> Categorical Columns ({len(categorical_cols)} columns)</h2>
        <ul>
            {categorical_items or NO_CHARTS_ITEM}
            {categorical_more}
        </ul>
        
        <h2> Relationships</h2>
        <ul>
            {relationship_items or NO_CHARTS_ITEM}
        </ul>
        
        <h2> Tips</h2>
        <ul>
            <li>Interactive Plotly graphs: Hover for details, zoom, pan, double-click to reset</li>
            <li>Large datasets are sampled to {SAMPLE_SIZE:,} rows for visualization</li>
            <li>Modify the script configuration to adjust sample size or templates</li>
        </ul>
    </div>
</body>
</html>
"""

# Write as UTF-8 explicitly (matching the charset meta tag) so non-ASCII
# column names do not depend on the platform's default encoding.
(output_dir / "index.html").write_text(summary_html, encoding="utf-8")

print(" index.html (main summary page)")


 Generating summary report...
 index.html (main summary page)


In [13]:
# Closing banner: where the output went, how many chart files (HTML plus PNG)
# are in that folder, and what to do next. The whole message is emitted
# through a single print call, one argument per output line.
print(
    "\n" + "=" * 70,
    " VISUALIZATION GENERATION COMPLETE!",
    "=" * 70,
    f" Output folder: {OUTPUT_FOLDER}",
    f" Total visualizations: {sum(1 for pattern in ('*.html', '*.png') for _ in Path(OUTPUT_FOLDER).glob(pattern))}",
    f" Open '{OUTPUT_FOLDER}index.html' in your browser to see all visualizations!",
    " Next steps:",
    "   1. Open index.html to navigate all visualizations",
    "   2. Explore interactive Plotly graphs",
    "   3. Adjust SAMPLE_SIZE and MAX_CATEGORIES in config if needed",
    "   4. Install ydata-profiling for automatic comprehensive report",
    "=" * 70,
    sep="\n",
)


 VISUALIZATION GENERATION COMPLETE!
 Output folder: ../ml/graph_output/
 Total visualizations: 13
 Open '../ml/graph_output/index.html' in your browser to see all visualizations!
 Next steps:
   1. Open index.html to navigate all visualizations
   2. Explore interactive Plotly graphs
   3. Adjust SAMPLE_SIZE and MAX_CATEGORIES in config if needed
   4. Install ydata-profiling for automatic comprehensive report
