# CSV Smart Summary Tool
*Automatic Data Exploration and Insight Generation*

This notebook automatically analyzes any uploaded CSV dataset and generates comprehensive statistical summaries, visualizations, correlations, and natural-language insights.

## Section 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings so library notices do not clutter the notebook output.
# NOTE(review): this is a blanket filter and also hides potentially useful
# warnings (e.g. numeric runtime warnings) — confirm that is intended.
warnings.filterwarnings('ignore')

# Show every column and up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# Default plot appearance: seaborn "whitegrid" style and 12x6 figures.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

## Section 2: Define CSV Loading and Column Categorization Functions

In [None]:
def load_csv(file_path):
    """Read a CSV file into a DataFrame and print a short load report.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) to read.

    Returns:
        The loaded DataFrame, or ``None`` if anything went wrong; in that
        case the error message is printed instead of being raised.
    """
    try:
        frame = pd.read_csv(file_path)
        n_rows, n_cols = frame.shape
        report = [
            "CSV loaded successfully!",
            f"Shape: {n_rows} rows x {n_cols} columns",
            "\nColumn types detected:",
        ]
        report.extend(
            f"  - {name}: {kind}" for name, kind in frame.dtypes.items()
        )
        print("\n".join(report))
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading CSV: {err}")
        return None
    return frame

def categorize_columns(df):
    """Group the DataFrame's column names by broad dtype family.

    Args:
        df: DataFrame whose columns should be classified.

    Returns:
        A dict with the keys ``'numerical'``, ``'categorical'`` and
        ``'datetime'``, each mapping to a list of column names selected by
        dtype. Columns of any other dtype (e.g. bool) appear in no list.
    """
    dtype_families = (
        ('numerical', [np.number]),
        ('categorical', ['object', 'category']),
        ('datetime', ['datetime64']),
    )
    return {
        label: df.select_dtypes(include=dtypes).columns.tolist()
        for label, dtypes in dtype_families
    }

## Section 3: Implement Column-Level Summary Analysis

In [None]:
def get_missing_pct(series):
    """Return the share of missing values in ``series`` as a percentage string.

    Args:
        series: pandas Series to inspect.

    Returns:
        The percentage of missing (NaN/None) entries formatted with two
        decimals and a trailing ``%``, e.g. ``"12.50%"``. An empty series
        yields ``"0.00%"``.
    """
    total = len(series)
    if total == 0:
        # Nothing can be missing from an empty series; this also avoids the
        # 0/0 division that would otherwise render as "nan%".
        return "0.00%"
    return f"{(series.isna().sum() / total) * 100:.2f}%"

def summarize_numerical_columns(df, numerical_cols):
    """Build a table of descriptive statistics, one row per numerical column.

    Args:
        df: Source DataFrame.
        numerical_cols: Names of the columns to summarize.

    Returns:
        A DataFrame with the columns ``Column``, ``Count``, ``Mean``,
        ``Median``, ``Std Dev``, ``Min``, ``Max`` and ``Missing %``.
    """
    rows = []
    for name in numerical_cols:
        values = df[name]
        rows.append({
            'Column': name,
            'Count': values.count(),
            'Mean': values.mean(),
            'Median': values.median(),
            'Std Dev': values.std(),
            'Min': values.min(),
            'Max': values.max(),
            'Missing %': get_missing_pct(values),
        })
    return pd.DataFrame(rows)

def summarize_categorical_columns(df, categorical_cols):
    """Build a table describing each categorical column.

    Args:
        df: Source DataFrame.
        categorical_cols: Names of the columns to summarize.

    Returns:
        A DataFrame with the columns ``Column``, ``Unique Values``,
        ``Top Value``, ``Top Count`` and ``Missing %``. For a column with no
        non-missing values, ``Top Value`` is ``None`` and ``Top Count`` is 0.
    """
    def describe(name):
        values = df[name]
        counts = values.value_counts()
        if len(counts) > 0:
            # value_counts() sorts by frequency, so the first entry is the mode.
            top_value, top_count = counts.index[0], counts.values[0]
        else:
            top_value, top_count = None, 0
        return {
            'Column': name,
            'Unique Values': values.nunique(),
            'Top Value': top_value,
            'Top Count': top_count,
            'Missing %': get_missing_pct(values),
        }

    return pd.DataFrame([describe(name) for name in categorical_cols])

def display_column_summaries(df, col_categories):
    """Print and display the numerical and categorical summary tables.

    Args:
        df: Source DataFrame.
        col_categories: Dict with ``'numerical'`` and ``'categorical'`` lists
            of column names; a section is skipped when its list is empty.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("COLUMN-LEVEL SUMMARIES")
    print(heavy_rule)

    sections = (
        ('numerical', "\nNUMERICAL COLUMNS:", summarize_numerical_columns),
        ('categorical', "\nCATEGORICAL COLUMNS:", summarize_categorical_columns),
    )
    for key, heading, summarize in sections:
        columns = col_categories[key]
        if not columns:
            continue
        print(heading)
        print(light_rule)
        display(summarize(df, columns))

## Section 4: Create Distribution Visualizations

In [None]:
def visualize_distributions(df, col_categories):
    """Plot the distribution of every numerical and categorical column.

    Numerical columns get one figure of histograms and one figure of box
    plots, both laid out on a grid at most three plots wide. Each categorical
    column gets its own bar chart of its ten most frequent values.

    Args:
        df: Source DataFrame.
        col_categories: Dict with ``'numerical'`` and ``'categorical'`` lists
            of column names; a section is skipped when its list is empty.
    """
    numerical_cols = col_categories['numerical']
    categorical_cols = col_categories['categorical']

    if numerical_cols:
        print("\n" + "="*80)
        print("NUMERICAL DISTRIBUTIONS (Histograms & Box Plots)")
        print("="*80)

        n_plots = len(numerical_cols)
        n_cols = min(n_plots, 3)
        n_rows = (n_plots + n_cols - 1) // n_cols  # ceiling division

        _, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows))
        # subplots() returns a bare Axes, a 1-D or a 2-D array depending on
        # the grid shape; normalize to a flat array.
        axes = np.atleast_1d(axes).flatten()

        for ax, col in zip(axes, numerical_cols):
            ax.hist(df[col].dropna(), bins=30, color='skyblue', edgecolor='black', alpha=0.7)
            ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(True, alpha=0.3)

        # Hide the unused cells in the last grid row.
        for ax in axes[n_plots:]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

        # Box plots use the same wrapped grid. A single row of n_plots axes
        # makes the figure width grow without bound on wide datasets. For up
        # to three columns this figure size equals the single-row layout.
        _, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 5*n_rows))
        axes = np.atleast_1d(axes).flatten()

        for ax, col in zip(axes, numerical_cols):
            ax.boxplot(df[col].dropna())
            ax.set_title(f'Box Plot: {col}', fontsize=12, fontweight='bold')
            ax.set_ylabel(col)
            ax.grid(True, alpha=0.3)

        for ax in axes[n_plots:]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

    if categorical_cols:
        print("\n" + "="*80)
        print("CATEGORICAL DISTRIBUTIONS (Bar Charts)")
        print("="*80)

        for col in categorical_cols:
            top_values = df[col].value_counts().head(10)
            if top_values.empty:
                # A column with only missing values has nothing to plot.
                print(f"Skipping {col}: no non-missing values to plot")
                continue

            _, ax = plt.subplots(figsize=(10, 5))
            top_values.plot(kind='bar', color='coral', edgecolor='black', alpha=0.7, ax=ax)
            ax.set_title(f'Top 10 Categories in {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.grid(True, alpha=0.3, axis='y')
            # Rotate the labels on this axes explicitly instead of relying on
            # pyplot's "current axes" state.
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
            plt.tight_layout()
            plt.show()

## Section 5: Compute and Visualize Correlation Analysis

In [None]:
def extract_strong_correlations(corr_matrix, threshold=0.7):
    """List the variable pairs whose absolute correlation reaches ``threshold``.

    Only the upper triangle of the matrix is scanned, so each unordered pair
    is reported once and the diagonal is ignored.

    Args:
        corr_matrix: Square correlation DataFrame.
        threshold: Minimum absolute correlation (inclusive) to report.

    Returns:
        A list of dicts with the keys ``'Variable 1'``, ``'Variable 2'`` and
        ``'Correlation'``, in row-major order of the upper triangle.
    """
    labels = corr_matrix.columns
    # k=1 starts one step above the diagonal, i.e. only pairs with row < col.
    upper_rows, upper_cols = np.triu_indices(len(labels), k=1)
    candidates = (
        (labels[r], labels[c], corr_matrix.iloc[r, c])
        for r, c in zip(upper_rows, upper_cols)
    )
    # NaN correlations fail the >= comparison and are therefore left out.
    return [
        {'Variable 1': first, 'Variable 2': second, 'Correlation': value}
        for first, second, value in candidates
        if abs(value) >= threshold
    ]

def compute_correlations(df, numerical_cols):
    """Compute, display and plot the correlation matrix of the numerical columns.

    Args:
        df: Source DataFrame.
        numerical_cols: Names of the numerical columns to correlate.

    Returns:
        The correlation matrix as a DataFrame, or ``None`` (after printing a
        notice) when fewer than two numerical columns are given.
    """
    if len(numerical_cols) < 2:
        print("Not enough numerical columns to compute correlations.")
        return None

    matrix = df[numerical_cols].corr()

    banner = "=" * 80
    print(f"\n{banner}")
    print("CORRELATION ANALYSIS")
    print(banner)
    print("\nCorrelation Matrix:")
    display(matrix.round(3))

    heatmap_options = {
        'annot': True,
        'cmap': 'coolwarm',
        'center': 0,
        'fmt': '.2f',
        'square': True,
        'linewidths': 1,
        'cbar_kws': {"shrink": 0.8},
    }
    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, **heatmap_options)
    plt.title('Correlation Matrix Heatmap', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

    return matrix

## Section 6: Generate Natural-Language Insights

In [None]:
def generate_insights(df, col_categories, corr_matrix=None, corr_threshold=0.7):
    """Print plain-language observations about the dataset.

    Covers the dataset size, overall missing data, skewness and IQR-based
    outliers of each numerical column, the cardinality of each categorical
    column and, when a correlation matrix is given, the strongest
    correlations.

    Args:
        df: Source DataFrame.
        col_categories: Dict with ``'numerical'`` and ``'categorical'`` lists
            of column names.
        corr_matrix: Optional correlation DataFrame; the correlation section
            is skipped when it is ``None``.
        corr_threshold: Minimum absolute correlation (inclusive) for a pair
            to be reported as strong.

    Returns:
        None. The insights are printed to standard output.
    """
    insights = [f"Dataset Overview: {df.shape[0]} rows and {df.shape[1]} columns"]

    total_missing = df.isna().sum().sum()
    if total_missing > 0:
        missing_pct = (total_missing / (df.shape[0] * df.shape[1])) * 100
        insights.append(f"Missing Data: {total_missing} missing values ({missing_pct:.2f}% of total)")
    else:
        insights.append("Data Quality: No missing values detected")

    if col_categories['numerical']:
        insights.append(f"\nNumerical Features: {len(col_categories['numerical'])} columns detected")
        for col in col_categories['numerical']:
            values = df[col]

            # Only clearly asymmetric columns (|skewness| > 1) are reported.
            skewness = values.skew()
            if abs(skewness) > 1:
                skew_type = "highly left-skewed" if skewness < 0 else "highly right-skewed"
                insights.append(f"  - {col} is {skew_type} (skewness: {skewness:.2f})")

            # Outliers are values more than 1.5 * IQR outside the quartiles.
            Q1, Q3 = values.quantile([0.25, 0.75])
            IQR = Q3 - Q1
            outliers = ((values < Q1 - 1.5*IQR) | (values > Q3 + 1.5*IQR)).sum()
            if outliers > 0:
                outlier_pct = (outliers / values.count()) * 100
                insights.append(f"  - {col} contains {outliers} outliers ({outlier_pct:.1f}% of data)")

    if col_categories['categorical']:
        insights.append(f"\nCategorical Features: {len(col_categories['categorical'])} columns detected")
        for col in col_categories['categorical']:
            unique_count = df[col].nunique()
            insights.append(f"  - {col} has {unique_count} unique values")

    if corr_matrix is not None:
        strong_corrs = extract_strong_correlations(corr_matrix, threshold=corr_threshold)
        if strong_corrs:
            # Rank by magnitude so the five pairs reported are the strongest
            # ones rather than the first five in column order.
            strong_corrs = sorted(
                strong_corrs, key=lambda pair: abs(pair['Correlation']), reverse=True
            )
            # The cut-off is inclusive, so the label says ">=".
            insights.append(f"\nStrong Correlations (|r| >= {corr_threshold}):")
            for corr in strong_corrs[:5]:
                insights.append(f"  - {corr['Variable 1']} -- {corr['Variable 2']}: {corr['Correlation']:.3f}")

    print("\n" + "="*80)
    print("NATURAL-LANGUAGE INSIGHTS")
    print("="*80)
    print("\n".join(insights))

## Section 7: Create Main Analysis Pipeline

In [None]:
def analyze_csv(file_path):
    """Run the complete six-step analysis pipeline on one CSV file.

    The steps are: load the file, categorize its columns, show column
    summaries, plot distributions, compute correlations and print insights.
    The run stops after step 1 if the file cannot be loaded.

    Args:
        file_path: Path of the CSV file to analyze.
    """
    divider = "=" * 80

    print(f"\n{divider}")
    print("CSV SMART SUMMARY TOOL - Analysis Started")
    print(divider)

    print("\n[Step 1/6] Loading CSV file...")
    frame = load_csv(file_path)
    if frame is None:
        # load_csv already reported the error; nothing more to do.
        return

    print("\n[Step 2/6] Categorizing columns...")
    groups = categorize_columns(frame)

    print("\n[Step 3/6] Generating column summaries...")
    display_column_summaries(frame, groups)

    print("\n[Step 4/6] Creating distribution visualizations...")
    visualize_distributions(frame, groups)

    print("\n[Step 5/6] Computing correlations...")
    numeric_columns = groups['numerical']
    if numeric_columns:
        correlations = compute_correlations(frame, numeric_columns)
    else:
        correlations = None

    print("\n[Step 6/6] Generating insights...")
    generate_insights(frame, groups, correlations)

    print(f"\n{divider}")
    print("Analysis Complete!")
    print(f"{divider}\n")

## Section 8: Test with Sample CSV Data

Below we'll create a sample dataset and test the complete analysis pipeline.

In [None]:
import os
import tempfile

# Fixed seed so the sample dataset is identical on every run.
np.random.seed(42)

# NOTE: the generated values depend on the order of the np.random calls
# below; reordering the entries changes the dataset.
sample_data = {
    'Age': np.random.randint(18, 80, 150),
    'Income': np.random.normal(50000, 20000, 150),
    'Experience_Years': np.random.randint(0, 40, 150),
    'Department': np.random.choice(['Sales', 'Engineering', 'Marketing', 'HR', 'Finance'], 150),
    'Education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], 150),
    'Performance_Score': np.random.uniform(1, 5, 150),
}

# Make income rise with experience so the correlation analysis has a
# relationship to pick up.
sample_data['Income'] = sample_data['Income'] + (sample_data['Experience_Years'] * 1000)
sample_df = pd.DataFrame(sample_data)

# Blank out a few values so the missing-data reporting is exercised.
sample_df.loc[np.random.choice(sample_df.index, 5, replace=False), 'Income'] = np.nan
sample_df.loc[np.random.choice(sample_df.index, 3, replace=False), 'Performance_Score'] = np.nan

# Write to the platform's temporary directory instead of a hard-coded
# '/tmp', which does not exist on Windows.
sample_csv_path = os.path.join(tempfile.gettempdir(), 'sample_employee_data.csv')
sample_df.to_csv(sample_csv_path, index=False)

print(f"Sample dataset created: {sample_csv_path}")
print(f"Shape: {sample_df.shape}")
print(f"\nFirst few rows:")
display(sample_df.head())

### Run Complete Analysis Pipeline

In [None]:
# Run the full six-step pipeline on the sample CSV written by the previous cell.
analyze_csv(sample_csv_path)

## How to Use This Tool

### For Your Own Dataset:

1. **Use your own data**: In the "Run Complete Analysis Pipeline" cell, replace `sample_csv_path` with the path to your own CSV file:
   ```python
   analyze_csv('path/to/your/file.csv')
   ```

2. **Supported features**:
   - Automatic detection of numerical and categorical columns
   - Comprehensive statistical summaries
   - Distribution visualizations (histograms, box plots, bar charts)
   - Correlation analysis with heatmaps
   - Natural-language insights about your data

3. **Missing values**: The tool gracefully handles missing values and reports them

### Customization:

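- Modify the `threshold` argument that `generate_insights()` passes to `extract_strong_correlations()` to change correlation sensitivity (changing only the function's default has no effect, because the argument is passed explicitly)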
- Adjust visualization parameters (e.g., `figsize`, `bins`, color schemes)
- Add new insight-generation logic to the `generate_insights()` function

---

## Key Features

- Automated: One-line analysis of any CSV
- Comprehensive: Covers statistics, distributions, correlations, and insights
- Visual: Rich plots and heatmaps for pattern discovery
- Insightful: Generates readable, actionable insights
- Robust: Handles missing values and various data types