# CSV Generation, Analysis, and Visualization

This notebook demonstrates:
- Creating a sample CSV with 3 numeric and 2 alphanumeric columns
- Loading data into a pandas DataFrame
- Calculating statistics (mean, median, standard deviation)
- Visualizing the data

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import string

# Render Matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

# Apply the white-grid style sheet to every figure created after this point.
# NOTE(review): the 'seaborn-v0_8-' prefix is the Matplotlib >= 3.6 spelling of this
# style name; older releases do not recognize it — confirm against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')

# Confirmation message so the reader can see that the setup cell completed
print("Libraries loaded successfully!")

## 2. Generate Sample CSV File

We'll create a CSV with:
- **2 alphanumeric columns**: `id` (random codes) and `category` (Greek letter names)
- **3 numeric columns**: `value_a` (integers), `value_b` (floats), `value_c` (larger integers)

In [None]:
# Configuration — the values a reader is most likely to tune
NUM_ROWS = 10                # number of rows written to the sample CSV
CSV_FILENAME = 'sample.csv'  # output file name (relative path)
SEED = 42                    # single seed shared by both random number generators

# Seed NumPy's global RNG and Python's `random` module with the same value so the
# generated data is identical on every "Restart & Run All"
np.random.seed(SEED)
random.seed(SEED)

In [None]:
def random_id(length=6):
    """Return a random ID made of `length` characters drawn from A-Z and 0-9.

    Characters are sampled with replacement from the global `random` module
    state, so the result is reproducible after a call to `random.seed(...)`.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# Category labels sampled (with replacement) for the `category` column
categories = ['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon']

# Build the table column by column; every column has NUM_ROWS entries.
# The two text columns draw from Python's `random` module, the three numeric
# columns from NumPy's global RNG (both seeded in the configuration cell).
data = {
    # Alphanumeric columns
    'id': [random_id() for _ in range(NUM_ROWS)],
    'category': [random.choice(categories) for _ in range(NUM_ROWS)],
    # Numeric columns
    'value_a': np.random.randint(10, 100, NUM_ROWS),                  # integers in [10, 100)
    'value_b': np.round(np.random.uniform(1.0, 50.0, NUM_ROWS), 2),   # floats, 2 decimals
    'value_c': np.random.randint(100, 1000, NUM_ROWS)                 # integers in [100, 1000)
}

# Create DataFrame and save to CSV; index=False keeps the row index out of the file
df_generated = pd.DataFrame(data)
df_generated.to_csv(CSV_FILENAME, index=False)

# The check mark was mis-encoded (mojibake) in the original string literal
print(f"✓ Created '{CSV_FILENAME}' with {NUM_ROWS} rows")
print("\nGenerated Data Preview:")
df_generated

## 3. Load CSV into Pandas DataFrame

Now we'll read the CSV file back into a pandas DataFrame and inspect its structure.

In [None]:
# Read the CSV back from disk so the rest of the notebook works on the
# round-tripped data (dtypes are re-inferred by pandas while parsing)
df = pd.read_csv(CSV_FILENAME)

# The check mark and multiplication sign were mis-encoded (mojibake) in the original strings
print("✓ Loaded CSV into pandas DataFrame")
print(f"\nShape: {df.shape[0]} rows × {df.shape[1]} columns")
df

In [None]:
# Show the dtype of each column as loaded from the CSV
print("Column Data Types:", "-" * 30, sep="\n")
df.dtypes

In [None]:
# Print pandas' concise summary of the DataFrame (column names, non-null counts, dtypes)
df.info()

## 4. Calculate Statistics for Numerical Columns

We'll calculate:
- **Average (Mean)**: The arithmetic mean
- **Median**: The middle value when sorted
- **Standard Deviation**: Measure of data spread

In [None]:
# Collect the names of all numeric columns; the statistics and plots below iterate over this list
numeric_cols = list(df.select_dtypes(include='number').columns)
print(f"Numeric columns: {numeric_cols}")

In [None]:
# Compute mean, median and standard deviation for every numeric column.
# Each result is printed at full precision (formatted to 2 decimals) and a
# rounded copy is collected in `stats_list` for the summary table.
stats_list = []

for col in numeric_cols:
    avg = df[col].mean()
    median = df[col].median()
    std = df[col].std()  # pandas default (ddof=1): sample standard deviation

    stats_list.append({
        'Column': col,
        'Average': round(avg, 2),
        'Median': round(median, 2),
        'Std Dev': round(std, 2)
    })

    # The chart emoji was mis-encoded (mojibake) in the original string literal
    print(f"\n📊 {col}:")
    print(f"   Average (Mean): {avg:.2f}")
    print(f"   Median:         {median:.2f}")
    print(f"   Std Deviation:  {std:.2f}")

In [None]:
# Turn the per-column records into a table, one row per numeric column
stats_df = pd.DataFrame(stats_list)

# Banner: blank line, rule, title, rule
rule = "=" * 50
print("\n" + rule, "SUMMARY TABLE", rule, sep="\n")
stats_df

In [None]:
# Cross-check with pandas' built-in describe(), which produces summary statistics in a single call
print("Pandas describe() output:")
df.loc[:, numeric_cols].describe()

## 5. Data Visualization

Let's create several plots to visualize our data.

In [None]:
# Shared palette so a given column keeps the same color in every chart
colors = [
    '#3498db',  # blue
    '#e74c3c',  # red
    '#2ecc71',  # green
]

### 5.1 Mean Values Bar Chart

In [None]:
fig, ax = plt.subplots(figsize=(8, 5))

# One bar per numeric column; bar height is that column's mean
means = [df[col].mean() for col in numeric_cols]
bars = ax.bar(numeric_cols, means, color=colors, edgecolor='black', alpha=0.8)

ax.set_title('Mean Values by Column', fontsize=14, fontweight='bold')
ax.set_ylabel('Mean Value')
ax.set_xlabel('Column')

# Add value labels on bars. bar_label pads in points, so the gap above each bar
# is independent of the data scale (the previous fixed "+ 5" was in data units).
ax.bar_label(bars, fmt='%.1f', padding=3, fontsize=11)

plt.tight_layout()
plt.show()

### 5.2 Distribution Box Plots

In [None]:
fig, ax = plt.subplots(figsize=(8, 5))

# One box per numeric column; patch_artist=True returns fillable patches for the boxes
column_values = [df[name] for name in numeric_cols]
bp = ax.boxplot(column_values, tick_labels=numeric_cols, patch_artist=True)

# Fill each box with its palette color, slightly transparent
for box, fill in zip(bp['boxes'], colors):
    box.set_facecolor(fill)
    box.set_alpha(0.7)

ax.set_ylabel('Value')
ax.set_title('Distribution of Numeric Columns (Box Plot)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

### 5.3 Line Plot - Values Across Rows

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

# One line per numeric column, plotted against the row position (0 .. len(df) - 1)
for i, col in enumerate(numeric_cols):
    # Cycle through the palette so the loop cannot raise IndexError when there
    # are more numeric columns than palette entries
    ax.plot(range(len(df)), df[col], marker='o', label=col,
            color=colors[i % len(colors)], linewidth=2, markersize=8)

ax.set_title('Values Across All Rows', fontsize=14, fontweight='bold')
ax.set_xlabel('Row Index')
ax.set_ylabel('Value')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 5.4 Statistical Measures Comparison (Grouped Bar Chart)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(numeric_cols))  # one group position per numeric column
width = 0.25                      # width of each bar within a group

means = [df[col].mean() for col in numeric_cols]
medians = [df[col].median() for col in numeric_cols]
stds = [df[col].std() for col in numeric_cols]

# (legend label, values, color) for the bars of a group, left to right
measures = [
    ('Mean', means, '#3498db'),
    ('Median', medians, '#e74c3c'),
    ('Std Dev', stds, '#2ecc71'),
]
for k, (label, values, color) in enumerate(measures):
    # Offsets of -width, 0 and +width center the three bars on the group's tick
    ax.bar(x + (k - 1) * width, values, width, label=label, color=color, alpha=0.8)

ax.set_title('Statistical Measures Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('Column')
ax.set_ylabel('Value')
ax.set_xticks(x)
ax.set_xticklabels(numeric_cols)
ax.legend()

plt.tight_layout()
plt.show()

### 5.5 Complete Dashboard (All Plots Together)

In [None]:
# 2x2 dashboard that gathers the four chart types into a single figure
fig, axes = plt.subplots(2, 2, figsize=(14, 11))
fig.suptitle('Sample CSV Data Analysis Dashboard', fontsize=16, fontweight='bold')

# Per-column statistics, computed once and shared by plots 1 and 4
means = [df[col].mean() for col in numeric_cols]
medians = [df[col].median() for col in numeric_cols]
stds = [df[col].std() for col in numeric_cols]

# Plot 1: Bar chart of means
ax1 = axes[0, 0]
bars = ax1.bar(numeric_cols, means, color=colors, edgecolor='black', alpha=0.8)
ax1.set_title('Mean Values by Column')
ax1.set_ylabel('Mean Value')
ax1.set_xlabel('Column')
# bar_label pads in points, so the label gap does not depend on the data scale
ax1.bar_label(bars, fmt='%.1f', padding=3, fontsize=10)

# Plot 2: Box plots
ax2 = axes[0, 1]
bp = ax2.boxplot([df[col] for col in numeric_cols], tick_labels=numeric_cols, patch_artist=True)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
ax2.set_title('Distribution (Box Plot)')
ax2.set_ylabel('Value')

# Plot 3: Line plot
ax3 = axes[1, 0]
for i, col in enumerate(numeric_cols):
    # Cycle the palette so more numeric columns than colors cannot raise IndexError
    ax3.plot(range(len(df)), df[col], marker='o', label=col,
             color=colors[i % len(colors)], linewidth=2)
ax3.set_title('Values Across All Rows')
ax3.set_xlabel('Row Index')
ax3.set_ylabel('Value')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Plot 4: Grouped bar chart (mean / median / std dev side by side per column)
ax4 = axes[1, 1]
x = np.arange(len(numeric_cols))
width = 0.25
ax4.bar(x - width, means, width, label='Mean', color='#3498db', alpha=0.8)
ax4.bar(x, medians, width, label='Median', color='#e74c3c', alpha=0.8)
ax4.bar(x + width, stds, width, label='Std Dev', color='#2ecc71', alpha=0.8)
ax4.set_title('Statistical Measures Comparison')
ax4.set_xlabel('Column')
ax4.set_ylabel('Value')
ax4.set_xticks(x)
ax4.set_xticklabels(numeric_cols)
ax4.legend()

plt.tight_layout()
plt.show()

## 6. Save Visualization (Optional)

In [None]:
# Re-create and save the dashboard
# NOTE(review): this cell repeats the dashboard-drawing code from section 5.5;
# extracting a shared helper function would remove the duplication.
fig, axes = plt.subplots(2, 2, figsize=(14, 11))
fig.suptitle('Sample CSV Data Analysis Dashboard', fontsize=16, fontweight='bold')

# Per-column statistics, computed once and shared by plots 1 and 4
means = [df[col].mean() for col in numeric_cols]
medians = [df[col].median() for col in numeric_cols]
stds = [df[col].std() for col in numeric_cols]

# Plot 1: Bar chart of means
ax1 = axes[0, 0]
bars = ax1.bar(numeric_cols, means, color=colors, edgecolor='black', alpha=0.8)
ax1.set_title('Mean Values by Column')
ax1.set_ylabel('Mean Value')
ax1.set_xlabel('Column')
# bar_label pads in points, so the label gap does not depend on the data scale
ax1.bar_label(bars, fmt='%.1f', padding=3, fontsize=10)

# Plot 2: Box plots
ax2 = axes[0, 1]
bp = ax2.boxplot([df[col] for col in numeric_cols], tick_labels=numeric_cols, patch_artist=True)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
ax2.set_title('Distribution (Box Plot)')
ax2.set_ylabel('Value')

# Plot 3: Line plot
ax3 = axes[1, 0]
for i, col in enumerate(numeric_cols):
    # Cycle the palette so more numeric columns than colors cannot raise IndexError
    ax3.plot(range(len(df)), df[col], marker='o', label=col,
             color=colors[i % len(colors)], linewidth=2)
ax3.set_title('Values Across All Rows')
ax3.set_xlabel('Row Index')
ax3.set_ylabel('Value')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Plot 4: Grouped bar chart (mean / median / std dev side by side per column)
ax4 = axes[1, 1]
x = np.arange(len(numeric_cols))
width = 0.25
ax4.bar(x - width, means, width, label='Mean', color='#3498db', alpha=0.8)
ax4.bar(x, medians, width, label='Median', color='#e74c3c', alpha=0.8)
ax4.bar(x + width, stds, width, label='Std Dev', color='#2ecc71', alpha=0.8)
ax4.set_title('Statistical Measures Comparison')
ax4.set_xlabel('Column')
ax4.set_ylabel('Value')
ax4.set_xticks(x)
ax4.set_xticklabels(numeric_cols)
ax4.legend()

plt.tight_layout()
# Save before plt.show(); the figure is written through its own handle rather
# than the implicit "current figure"
fig.savefig('visualization.png', dpi=150, bbox_inches='tight')
# The check mark was mis-encoded (mojibake) in the original string literal
print("✓ Visualization saved to 'visualization.png'")
plt.show()

## Summary

In this notebook we:

1. **Generated** a sample CSV file with 10 rows containing:
   - 2 alphanumeric columns (`id`, `category`)
   - 3 numeric columns (`value_a`, `value_b`, `value_c`)

2. **Loaded** the CSV into a pandas DataFrame

3. **Calculated statistics** for numeric columns:
   - Average (Mean)
   - Median
   - Standard Deviation

4. **Visualized** the data with:
   - Bar charts
   - Box plots
   - Line plots
   - Grouped bar charts for statistical comparison