# Lab 1: Data Exploration & Visualization - SOLUTIONS

**Introduction to Data Science & Engineering - Day 1**

| Duration | Difficulty | Framework | Exercises |
|---|---|---|---|
| 90 min | Beginner | pandas, matplotlib, seaborn | 5 |

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Settings
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# Matplotlib style sheet; the 'seaborn-v0_8-' prefix is the naming used by recent matplotlib releases.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython/Jupyter magic: render figures inline below the cell (not valid in a plain .py script).
%matplotlib inline

# Simple confirmation that the imports and settings above ran without error.
print("Libraries loaded successfully!")

In [None]:
# Fix the global NumPy seed so every draw below is reproducible.
# NOTE: all draws share one random stream, so reordering statements in this
# cell changes the generated dataset.
np.random.seed(42)
n_samples = 2000

# Generate dates over 2 years
# Each order gets a random offset of 0-729 days from the start date
# (randint's upper bound is exclusive).
start_date = datetime(2023, 1, 1)
dates = [start_date + timedelta(days=np.random.randint(0, 730)) for _ in range(n_samples)]

# Customer segments
# About 10% of rows get None to simulate a missing segment.
segments = np.random.choice(['Premium', 'Standard', 'Basic', None], n_samples, p=[0.2, 0.4, 0.3, 0.1])

# Product categories
# No probabilities given, so the five categories are drawn uniformly.
categories = np.random.choice(['Electronics', 'Clothing', 'Home & Garden', 'Books', 'Sports'], n_samples)

data = {
    # Sequential ids 1..n_samples, unique before duplicates are injected below.
    'order_id': range(1, n_samples + 1),
    # Ids in 100..599, so customers repeat across orders.
    'customer_id': np.random.randint(100, 600, n_samples),
    'order_date': dates,
    'product_category': categories,
    # Quantities 1..9 (upper bound exclusive).
    'quantity': np.random.randint(1, 10, n_samples),
    'unit_price': np.round(np.random.uniform(5, 500, n_samples), 2),
    'customer_segment': segments,
    # Ages 18..74, cast to float so NaN can be injected later.
    'customer_age': np.random.randint(18, 75, n_samples).astype(float),
    # Scores 1-5 with about 10% NaN to simulate unanswered surveys.
    'satisfaction_score': np.random.choice([1, 2, 3, 4, 5, np.nan], n_samples, p=[0.05, 0.1, 0.2, 0.35, 0.2, 0.1]),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n_samples),
    # 1 = returned, drawn with probability 0.15.
    'is_returned': np.random.choice([0, 1], n_samples, p=[0.85, 0.15])
}

df = pd.DataFrame(data)

# Inject quality issues
# Each line picks its rows independently, so the affected row sets may overlap
# (e.g. a price blanked on the second line can be overwritten on the third).
df.loc[np.random.choice(df.index, 80, replace=False), 'customer_age'] = np.nan  # 80 missing ages
df.loc[np.random.choice(df.index, 40, replace=False), 'unit_price'] = np.nan  # 40 missing prices
df.loc[np.random.choice(df.index, 10, replace=False), 'unit_price'] = np.random.uniform(2000, 5000, 10)  # 10 price outliers
df.loc[np.random.choice(df.index, 5, replace=False), 'quantity'] = np.random.randint(50, 100, 5)  # 5 quantity outliers (50..99)

# Append copies of 15 existing rows so the frame contains exact duplicates
# (and therefore repeated order_id values).
dup_indices = np.random.choice(df.index, 15, replace=False)
duplicates = df.loc[dup_indices].copy()
df = pd.concat([df, duplicates], ignore_index=True)
# Derived column; it is NaN wherever unit_price is NaN.
df['total_amount'] = df['quantity'] * df['unit_price']

print(f"Dataset shape: {df.shape}")
# Last expression of the cell: the notebook displays the first 10 rows.
df.head(10)

## Exercise 1.1: Basic Exploration - SOLUTION

In [None]:
# Report the frame's dimensions, per-column dtypes, and deep memory footprint
frame_shape = df.shape
print(f"Shape: {frame_shape}")
print("\nData Types:", df.dtypes, sep="\n")
# deep=True also counts the contents of object columns, not just the pointers
total_bytes = df.memory_usage(deep=True).sum()
print(f"\nMemory Usage: {total_bytes / 1024:.1f} KB")

In [None]:
# Descriptive statistics
# Last expression of the cell, so the notebook renders the summary table
# (count, mean, spread and quartiles for the columns pandas summarises by default).
df.describe()

In [None]:
# Tabulate null counts per column together with their share of all rows,
# keeping only the columns that actually contain nulls
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(1)
missing_table = pd.DataFrame({'Missing': missing, 'Percent': missing_pct})
missing_table[missing_table['Missing'] > 0]

In [None]:
# Count fully identical rows, then rows that merely repeat an order_id
full_row_duplicates = df.duplicated().sum()
repeated_order_ids = df.duplicated(subset='order_id').sum()
print(f"Duplicate rows: {full_row_duplicates}")
print(f"Duplicate order_ids: {repeated_order_ids}")

## Exercise 2.1: Handle Missing Values - SOLUTION

In [None]:
def handle_missing_values(df):
    """Impute missing values and refresh the derived ``total_amount`` column.

    ``customer_age``, ``unit_price`` and ``satisfaction_score`` are filled with
    their own column median; ``customer_segment`` is filled with the label
    ``'Unknown'``. ``total_amount`` is then recomputed as
    ``quantity * unit_price`` so rows whose price was imputed get a value.

    Args:
        df: Orders DataFrame containing the columns named above plus
            ``quantity``. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: that form acts on a temporary
    # Series, emits a FutureWarning on pandas 2.x and silently does nothing
    # under Copy-on-Write, leaving the NaNs in place.
    for column in ('customer_age', 'unit_price', 'satisfaction_score'):
        df[column] = df[column].fillna(df[column].median())

    # Categorical column: use an explicit placeholder rather than a guess.
    df['customer_segment'] = df['customer_segment'].fillna('Unknown')

    # Recompute after imputing unit_price; otherwise total_amount stays NaN
    # for every row whose price was missing.
    df['total_amount'] = df['quantity'] * df['unit_price']
    return df

# Impute, then list the remaining null count per column as a sanity check
df = handle_missing_values(df)
print("Missing values after cleaning:", df.isnull().sum(), sep="\n")

## Exercise 2.2: Remove Duplicates - SOLUTION

In [None]:
def remove_duplicates(df):
    """Drop repeated orders, retaining the earliest row for each ``order_id``.

    Returns a tuple ``(deduplicated_frame, rows_dropped)``. The input frame
    is not modified and the surviving rows keep their original index labels.
    """
    # True for every row whose order_id has already appeared further up.
    repeated = df.duplicated(subset='order_id', keep='first')
    unique_orders = df[~repeated]
    return unique_orders, int(repeated.sum())

# Deduplicate on order_id and report how many rows were dropped
df, removed = remove_duplicates(df)
print(
    f"Removed {removed} duplicate rows",
    f"Clean dataset shape: {df.shape}",
    sep="\n",
)

## Exercise 2.3: Detect Outliers - SOLUTION

In [None]:
def detect_outliers_iqr(series, factor=1.5):
    """Flag the values of ``series`` lying outside its IQR fences.

    The fences sit ``factor`` interquartile ranges below the first quartile
    and above the third quartile. Returns ``(outliers, lower, upper)`` where
    ``outliers`` is the sub-Series of values strictly outside the fences.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile

    lower = first_quartile - factor * spread
    upper = third_quartile + factor * spread

    # Strict comparisons: values exactly on a fence are not flagged, and NaN
    # compares False on both sides so it is never reported as an outlier.
    beyond_fences = series.lt(lower) | series.gt(upper)
    return series[beyond_fences], lower, upper

# Print the outlier count and the IQR fence range for each monetary/volume column
for column_name in ('unit_price', 'quantity', 'total_amount'):
    flagged, low_fence, high_fence = detect_outliers_iqr(df[column_name])
    print(f"{column_name}: {len(flagged)} outliers (range: {low_fence:.2f} to {high_fence:.2f})")

In [None]:
# One box plot per column, side by side, to make the outliers visible
boxplot_columns = ['unit_price', 'quantity', 'total_amount']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for position, column_name in enumerate(boxplot_columns):
    panel = axes[position]
    sns.boxplot(data=df, y=column_name, ax=panel, color='#3b82f6')
    panel.set_title(f'{column_name} Distribution')
plt.tight_layout()
plt.show()

## Exercise 3.1: Distribution Analysis - SOLUTION

In [None]:
def plot_distributions(df):
    """Draw a 2x2 grid of histograms for price, age, quantity and satisfaction."""
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # One entry per panel: (grid position, column, bins, colour, kde, drop NaN first, title)
    panels = [
        ((0, 0), 'unit_price', 50, '#3b82f6', True, False, 'Unit Price Distribution'),
        ((0, 1), 'customer_age', 30, '#8b5cf6', True, False, 'Customer Age Distribution'),
        ((1, 0), 'quantity', 20, '#10b981', True, False, 'Quantity Distribution'),
        ((1, 1), 'satisfaction_score', 5, '#f59e0b', False, True, 'Satisfaction Score Distribution'),
    ]

    for (row, col), column, n_bins, colour, with_kde, drop_missing, title in panels:
        values = df[column].dropna() if drop_missing else df[column]
        sns.histplot(values, bins=n_bins, ax=axes[row, col], color=colour, kde=with_kde)
        axes[row, col].set_title(title)

    plt.tight_layout()
    plt.show()

plot_distributions(df)

## Exercise 3.2: Correlation Analysis - SOLUTION

In [None]:
def plot_correlation_matrix(df):
    """Render an annotated heatmap of pairwise correlations between numeric columns."""
    # Restrict to numeric dtypes first; non-numeric columns cannot be correlated.
    correlations = df.select_dtypes(include=[np.number]).corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        correlations,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,  # anchor the diverging colormap at zero correlation
        linewidths=0.5,
        square=True,
        ax=ax,
    )
    ax.set_title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

plot_correlation_matrix(df)

## Exercise 4.1: Category Analysis - SOLUTION

In [None]:
def analyze_categories(df):
    """Show revenue per product category (bars) and each category's share of orders (pie)."""
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    revenue_ax, share_ax = axes

    # Left panel: total revenue, sorted ascending so the largest bar ends up on top.
    revenue_per_category = df.groupby('product_category')['total_amount'].sum()
    revenue_per_category.sort_values().plot(kind='barh', ax=revenue_ax, color='#3b82f6')
    revenue_ax.set_title('Total Revenue by Category')
    revenue_ax.set_xlabel('Revenue ($)')

    # Right panel: share of order rows per category.
    orders_per_category = df['product_category'].value_counts()
    slice_colors = ['#3b82f6', '#8b5cf6', '#10b981', '#f59e0b', '#ef4444']
    share_ax.pie(orders_per_category, labels=orders_per_category.index,
                 autopct='%1.1f%%', colors=slice_colors)
    share_ax.set_title('Order Distribution by Category')

    plt.tight_layout()
    plt.show()

analyze_categories(df)

## Exercise 4.2: Customer Segmentation Analysis - SOLUTION

In [None]:
def analyze_segments(df):
    """Compare customer segments on average order value and satisfaction.

    Left panel: mean ``total_amount`` per ``customer_segment`` as horizontal
    bars. Right panel: box plot of ``satisfaction_score`` per segment.

    Args:
        df: Orders DataFrame with ``customer_segment``, ``total_amount`` and
            ``satisfaction_score`` columns.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Only the mean is plotted, so aggregate just that.
    mean_order_value = df.groupby('customer_segment')['total_amount'].mean()
    mean_order_value.sort_values().plot(kind='barh', ax=axes[0], color='#8b5cf6')
    axes[0].set_title('Average Order Value by Segment')
    axes[0].set_xlabel('Average Amount ($)')

    # Keep the tier ordering for the known labels, but build the final order
    # from the segments actually present: a fixed list would silently drop any
    # other label and leave an empty slot for a label that is absent.
    preferred_order = ['Basic', 'Standard', 'Premium', 'Unknown']
    present = set(df['customer_segment'].dropna().unique())
    segment_order = [name for name in preferred_order if name in present]
    segment_order += sorted(present.difference(preferred_order))

    # NOTE(review): newer seaborn releases warn when `palette` is passed
    # without `hue`; confirm the target seaborn version before changing this.
    sns.boxplot(data=df, x='customer_segment', y='satisfaction_score', ax=axes[1],
                order=segment_order, palette='viridis')
    axes[1].set_title('Satisfaction Score by Segment')
    axes[1].set_xlabel('Segment')

    plt.tight_layout()
    plt.show()

analyze_segments(df)

## Exercise 4.3: Time Series Analysis - SOLUTION

In [None]:
def analyze_time_series(df):
    """Plot monthly revenue (line) and monthly order count (bars).

    Orders are bucketed by the calendar month of ``order_date``. The input
    DataFrame is not modified: the month key is held in a local Series rather
    than being written back as a new column.

    Args:
        df: Orders DataFrame with ``order_date``, ``total_amount`` and
            ``order_id`` columns.
    """
    # Build the grouping key separately so the caller's frame does not gain a
    # helper 'order_month' column as a hidden side effect. to_datetime is a
    # no-op for datetime columns and lets string dates work as well.
    order_month = pd.to_datetime(df['order_date']).dt.to_period('M').rename('order_month')

    monthly_stats = df.groupby(order_month).agg(
        revenue=('total_amount', 'sum'),
        orders=('order_id', 'count'),
    ).reset_index()
    # Periods are converted to 'YYYY-MM' strings so matplotlib treats them as
    # categorical tick labels.
    monthly_stats['order_month'] = monthly_stats['order_month'].astype(str)

    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    axes[0].plot(monthly_stats['order_month'], monthly_stats['revenue'],
                 marker='o', color='#3b82f6', linewidth=2)
    axes[0].set_title('Monthly Revenue Trend')
    axes[0].set_ylabel('Revenue ($)')
    axes[0].tick_params(axis='x', rotation=45)

    axes[1].bar(monthly_stats['order_month'], monthly_stats['orders'], color='#8b5cf6', alpha=0.7)
    axes[1].set_title('Monthly Order Count')
    axes[1].set_ylabel('Number of Orders')
    axes[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

analyze_time_series(df)

## Exercise 4.4: Regional Analysis - SOLUTION

In [None]:
def analyze_regions(df):
    """Compare regions on total revenue and on return rate.

    Left panel: summed ``total_amount`` per region. Right panel: mean of
    ``is_returned`` per region, expressed as a percentage. Each region keeps
    the same colour in both panels.

    Args:
        df: Orders DataFrame with ``region``, ``total_amount`` and
            ``is_returned`` columns.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Fix one colour per region up front. Passing the raw palette to both
    # plots would colour bars by their sorted position, so a region could
    # change colour between the two panels.
    palette = ['#ef4444', '#f59e0b', '#10b981', '#3b82f6']
    region_names = sorted(df['region'].dropna().unique())
    region_colors = {
        name: palette[position % len(palette)]
        for position, name in enumerate(region_names)
    }

    region_revenue = df.groupby('region')['total_amount'].sum().sort_values(ascending=True)
    region_revenue.plot(kind='barh', ax=axes[0],
                        color=[region_colors[name] for name in region_revenue.index])
    axes[0].set_title('Revenue by Region')
    axes[0].set_xlabel('Total Revenue ($)')

    # is_returned is a 0/1 flag, so its mean is the return fraction.
    region_returns = (df.groupby('region')['is_returned'].mean() * 100).sort_values()
    region_returns.plot(kind='barh', ax=axes[1],
                        color=[region_colors[name] for name in region_returns.index])
    axes[1].set_title('Return Rate by Region (%)')
    axes[1].set_xlabel('Return Rate (%)')

    plt.tight_layout()
    plt.show()

analyze_regions(df)

## Exercise 5.1: Multi-dimensional Analysis - SOLUTION

In [None]:
def multi_dimensional_analysis(df):
    """Draw a 2x2 dashboard combining segment, category, region and return views."""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (scatter_ax, heatmap_ax), (returns_ax, violin_ax) = axes

    # Top-left: age against order amount, one scatter layer per segment
    segment_column = df['customer_segment']
    for segment in segment_column.unique():
        segment_rows = df[segment_column == segment]
        scatter_ax.scatter(segment_rows['customer_age'], segment_rows['total_amount'],
                           alpha=0.5, label=segment, s=20)
    scatter_ax.set_title('Age vs Order Amount by Segment')
    scatter_ax.set_xlabel('Customer Age')
    scatter_ax.set_ylabel('Total Amount')
    scatter_ax.legend()

    # Top-right: mean order value for every category/region pair
    mean_by_cat_region = df.pivot_table(index='product_category', columns='region',
                                        values='total_amount', aggfunc='mean')
    sns.heatmap(mean_by_cat_region, annot=True, fmt='.0f', cmap='YlOrRd', ax=heatmap_ax)
    heatmap_ax.set_title('Avg Order Value: Category x Region')

    # Bottom-left: share of returned orders per category, in percent
    return_rate_pct = df.groupby('product_category')['is_returned'].mean() * 100
    return_rate_pct = return_rate_pct.sort_values()
    return_rate_pct.plot(kind='barh', ax=returns_ax, color='#ef4444')
    returns_ax.set_title('Return Rate by Category (%)')

    # Bottom-right: satisfaction distribution split by return flag
    sns.violinplot(data=df, x='is_returned', y='satisfaction_score', ax=violin_ax, palette='coolwarm')
    violin_ax.set_title('Satisfaction: Returned vs Not Returned')
    violin_ax.set_xticks([0, 1])
    violin_ax.set_xticklabels(['Kept', 'Returned'])

    plt.tight_layout()
    plt.show()

multi_dimensional_analysis(df)

## Summary

In this lab, you learned how to:

1. **Generate and load** synthetic datasets with realistic quality issues
2. **Assess data quality** -- missing values, duplicates, outliers
3. **Clean data** using imputation and deduplication, and **flag outliers** with the IQR method
4. **Compute statistics** and analyze distributions
5. **Visualize data** with bar charts, histograms, box and violin plots, scatter plots, line charts, pie charts, and heatmaps
6. **Analyze trends** over time using time series grouping
7. **Segment customers** and compare across dimensions

---

*Introduction to Data Science & Engineering | AI Elevate*