# Data Analysis Notebook

This notebook demonstrates common data analysis tasks:
- Loading and exploring data
- Data transformation
- Visualization
- Statistical analysis

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Shared look for every chart in this notebook: seaborn's white grid theme
# first, then the default figure size (width, height in inches).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load and Explore Data

In [2]:
def load_dataset(path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and report its dimensions.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed DataFrame.
    """
    frame = pd.read_csv(path)
    # shape is (row count, column count)
    n_rows, n_cols = frame.shape
    print(f"Loaded {n_rows} rows and {n_cols} columns")
    return frame

def explore_data(df: pd.DataFrame) -> None:
    """Print a quick overview of the dataset to stdout.

    Three sections are printed in order: the ``DataFrame.info()`` report,
    the ``describe()`` summary statistics, and the number of missing
    values in each column.

    Args:
        df: DataFrame to inspect. It is not modified.
    """
    print("\nDataset Info:")
    # info() writes its report to stdout itself and returns None, so it must
    # not be wrapped in print() -- that would append a stray "None" line.
    df.info()
    print("\nSummary Statistics:")
    print(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())

## Data Transformation

In [3]:
class DataTransformer:
    """Chainable transformations applied to a private copy of a DataFrame.

    The frame passed to the constructor is never modified: every operation
    acts on ``self.df``, and an untouched snapshot is kept in
    ``self.original``. Each transformation method returns ``self`` so calls
    can be chained and finished with ``get_result()``.
    """

    def __init__(self, df: pd.DataFrame):
        """Store a working copy and a pristine snapshot of ``df``."""
        self.df = df.copy()
        self.original = df.copy()

    def normalize_columns(self, columns: list) -> 'DataTransformer':
        """Min-max scale the given columns to the 0-1 range.

        Args:
            columns: Names of the columns to scale. Names that are not
                present in the frame are skipped.

        Returns:
            This transformer, to allow chaining.
        """
        for col in columns:
            if col not in self.df.columns:
                continue
            min_val = self.df[col].min()
            value_range = self.df[col].max() - min_val
            shifted = self.df[col] - min_val
            if value_range == 0:
                # A constant column has no range to scale by; dividing would
                # turn every value into NaN. Map its values to 0.0 instead
                # (missing entries stay missing).
                self.df[col] = shifted.astype(float)
            else:
                self.df[col] = shifted / value_range
        return self

    def fill_missing(self, strategy: str = 'mean') -> 'DataTransformer':
        """Fill missing values using the specified strategy.

        Args:
            strategy: ``'mean'`` or ``'median'`` fill each numeric column
                with its own mean/median (non-numeric columns are left
                untouched); ``'zero'`` fills every missing value with 0.
                Any other value leaves the data unchanged.

        Returns:
            This transformer, to allow chaining.
        """
        if strategy == 'mean':
            # numeric_only=True: without it, recent pandas versions raise
            # TypeError as soon as the frame contains a non-numeric column.
            fill_values = self.df.mean(numeric_only=True)
        elif strategy == 'median':
            fill_values = self.df.median(numeric_only=True)
        elif strategy == 'zero':
            fill_values = 0
        else:
            # Unrecognized strategies have always been a silent no-op; kept
            # that way so existing callers are unaffected.
            return self
        self.df = self.df.fillna(fill_values)
        return self

    def remove_outliers(self, column: str, threshold: float = 3.0) -> 'DataTransformer':
        """Remove rows whose value in ``column`` is a z-score outlier.

        A row is kept when the absolute z-score of its value is strictly
        below ``threshold``. Rows where ``column`` is missing are dropped
        as well, because their z-score is undefined.

        Args:
            column: Name of the numeric column to test.
            threshold: Maximum absolute z-score (exclusive) for a row to
                be kept.

        Returns:
            This transformer, to allow chaining.
        """
        values = self.df[column]
        spread = values.std()
        if pd.isna(spread) or spread == 0:
            # With zero (or undefined) spread every z-score would be NaN and
            # the filter below would discard all rows. There are no outliers
            # to remove in that case, so leave the data as it is.
            return self
        z_scores = ((values - values.mean()) / spread).abs()
        # copy() detaches the filtered frame so later column assignments do
        # not trigger SettingWithCopyWarning.
        self.df = self.df[z_scores < threshold].copy()
        return self

    def get_result(self) -> pd.DataFrame:
        """Return the transformed DataFrame (the working frame itself, not a copy)."""
        return self.df

## Visualization Functions

In [4]:
def plot_distribution(df: pd.DataFrame, column: str, bins: int = 30) -> None:
    """Draw and display a histogram of one column.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to plot.
        bins: Number of histogram bins.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df[column], bins=bins, alpha=0.7, edgecolor='black')
    ax.set(
        xlabel=column,
        ylabel='Frequency',
        title=f'Distribution of {column}',
    )
    # Faint grid so the bars stay the visual focus
    ax.grid(True, alpha=0.3)
    plt.show()

def plot_correlation_matrix(df: pd.DataFrame) -> None:
    """Draw and display an annotated correlation heatmap.

    Only the numeric columns of ``df`` take part in the correlation.

    Args:
        df: DataFrame whose numeric columns are correlated pairwise.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns
    corr_table = df[numeric_names].corr()

    heatmap_options = dict(
        annot=True,           # write each coefficient inside its cell
        cmap='coolwarm',
        center=0,             # colormap midpoint sits at zero correlation
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_table, **heatmap_options)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

def plot_scatter(df: pd.DataFrame, x: str, y: str, hue: str = None) -> None:
    """Draw and display a scatter plot of ``y`` against ``x``.

    Args:
        df: DataFrame holding the data.
        x: Column plotted on the horizontal axis.
        y: Column plotted on the vertical axis.
        hue: Optional column used to color the points by group. When it is
            omitted (or empty) all points share a single color.
    """
    plt.figure(figsize=(10, 6))
    if not hue:
        # No grouping requested: plain matplotlib scatter
        plt.scatter(df[x], df[y], alpha=0.6)
    else:
        # seaborn handles the per-group coloring and the legend
        sns.scatterplot(data=df, x=x, y=y, hue=hue, alpha=0.6)
    plt.title(f'{y} vs {x}')
    plt.xlabel(x)
    plt.ylabel(y)
    plt.grid(True, alpha=0.3)
    plt.show()

## Statistical Analysis

In [5]:
def calculate_statistics(df: pd.DataFrame, column: str) -> dict:
    """Summarize one column with a set of descriptive statistics.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to summarize.

    Returns:
        A dict with the keys ``mean``, ``median``, ``std``, ``min``,
        ``max``, ``q25``, ``q75``, ``skewness`` and ``kurtosis``.
    """
    series = df[column]
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)
    return dict(
        mean=series.mean(),
        median=series.median(),
        std=series.std(),
        min=series.min(),
        max=series.max(),
        q25=lower_quartile,
        q75=upper_quartile,
        skewness=series.skew(),
        kurtosis=series.kurtosis(),
    )

def compare_groups(df: pd.DataFrame, value_col: str, group_col: str) -> pd.DataFrame:
    """Summarize ``value_col`` separately for each group in ``group_col``.

    Args:
        df: DataFrame holding the data.
        value_col: Column whose values are summarized.
        group_col: Column whose distinct values define the groups.

    Returns:
        A DataFrame indexed by group, with one column per statistic:
        count, mean, median, std, min and max.
    """
    summary_funcs = ['count', 'mean', 'median', 'std', 'min', 'max']
    grouped_values = df.groupby(group_col)[value_col]
    return grouped_values.agg(summary_funcs)

## Example Usage

In [6]:
# Example: build a reproducible synthetic dataset.
# The seed is fixed and the columns are drawn in a fixed order, so every run
# produces the same 100 rows.
np.random.seed(42)
sample_data = pd.DataFrame(dict(
    age=np.random.randint(18, 80, 100),
    income=np.random.normal(50000, 15000, 100),
    score=np.random.uniform(0, 100, 100),
    category=np.random.choice(['A', 'B', 'C'], 100),
))

# Clean the data step by step: fill gaps with column means, then rescale
# 'score' to the 0-1 range.
transformer = DataTransformer(sample_data)
transformer.fill_missing('mean')
transformer.normalize_columns(['score'])
clean_data = transformer.get_result()

# Summarize the income column, two decimals per statistic
stats = calculate_statistics(clean_data, 'income')
print("Income Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value:.2f}")