In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import requests
import folium
from math import radians, sin, cos, sqrt, atan2
from datetime import datetime, timedelta
import numpy as np
from folium.plugins import HeatMap
from scipy.spatial.distance import pdist, squareform
from folium.plugins import MarkerCluster
from IPython.display import display
import glob
import os

In [2]:
# Full trip-level export for May 2023 - April 2024; the file name is kept in a
# constant so the load path is visible in one place.
RAW_TRIPS_CSV = 'Toronto Bikeshare May2023 - Apr2024.csv'
df = pd.read_csv(RAW_TRIPS_CSV)

# Rationale for Stratified Sampling in Bikeshare Data Analysis

## Introduction
The Toronto Bikeshare dataset (May 2023 - April 2024) contains over 5.3 million records, which can be computationally intensive for analysis. Implementing a stratified sampling approach allows us to work with a more manageable dataset while maintaining the essential characteristics of the original data.

## Why Stratified Sampling?

### 1. Seasonal Variation
- Bikeshare usage varies significantly by month
- Summer months (June-September) show higher usage
- Winter months (December-February) show lower usage
- Stratification by month ensures representation of all seasonal patterns

### 2. Data Volume
- Original dataset: 5,336,042 records
- Storage and processing challenges with full dataset
- Need for efficient analysis while maintaining data integrity
- 10% sample reduces data to ~533,604 records

### 3. Monthly Distribution in Original Data
```
2023-05: 512,228 records
2023-06: 584,517 records
2023-07: 650,399 records
2023-08: 676,439 records
2023-09: 673,179 records
2023-10: 544,630 records
2023-11: 361,306 records
2023-12: 237,125 records
2024-01: 189,684 records
2024-02: 242,123 records
2024-03: 289,239 records
2024-04: 375,173 records
```

## Sampling Methodology

### 1. Proportional Stratification
- Each month sampled independently
- Sampling fraction: 10% of each month's records
- Maintains original monthly distribution patterns

### 2. Random Selection
- Random seed set for reproducibility
- Within each month stratum, random selection of records
- Prevents selection bias within months

### 3. Benefits of This Approach
- Preserves seasonal patterns
- Maintains proportional representation of each month
- Reduces computational requirements
- Enables more efficient analysis
- Ensures representation of low-usage periods

## Implementation Considerations

### 1. Sample Size Determination
- 10% chosen as it provides:
  * Sufficient data for statistical analysis
  * Manageable dataset size
  * Adequate representation of rare events
  * Balance between precision and efficiency

### 2. Stratification Variable
- Month chosen as primary stratification variable because:
  * Strong seasonal patterns in bikeshare usage
  * Different user behaviors across seasons
  * Varying weather conditions impact usage
  * Monthly operational patterns

### 3. Random Seed
- Fixed random seed (42) used for:
  * Reproducibility of results
  * Consistency in repeated analyses
  * Ability to verify findings

## Expected Outcomes
- Reduced dataset size while maintaining:
  * Seasonal patterns
  * Usage distributions
  * Key relationships between variables
  * Representative sample of user behaviors

In [3]:
def simple_stratified_sample(df, sample_fraction=0.1, random_state=42):
    """
    Draw a month-stratified random sample: the same fraction of rows per month.

    The input frame is not modified. Date parsing and the 'Month-Year' stratum
    label are applied to a shallow working copy, so only the returned sample
    carries those changes.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe containing bikeshare data. Needs a 'Date' column that
        is either datetime-typed or parseable by ``pd.to_datetime``.
    sample_fraction : float, default=0.1
        Fraction of data to sample from each month (0 <= fraction <= 1).
    random_state : int, default=42
        Random seed for reproducibility. The same seed is reused for every month.

    Returns:
    --------
    pandas.DataFrame
        Sampled dataframe with a fresh 0..n-1 index, a datetime 'Date' column
        and an added 'Month-Year' (monthly period) column. Months appear in
        the order produced by ``value_counts`` (largest month first).
    dict
        Basic sampling statistics: 'original_total', 'sampled_total',
        'overall_sampling_ratio' (percent) and 'monthly_stats' keyed by the
        month rendered as a string.

    Raises:
    -------
    ValueError
        If sample_fraction is not within [0, 1].
    """
    if not 0 <= sample_fraction <= 1:
        raise ValueError(
            f"sample_fraction must be between 0 and 1, got {sample_fraction!r}"
        )

    # Shallow copy: shares the existing column data with `df`, but columns
    # assigned below are attached to the copy only, so the caller's frame
    # keeps its original columns and dtypes.
    working = df.copy(deep=False)

    # Ensure Date is datetime
    if not pd.api.types.is_datetime64_any_dtype(working['Date']):
        working['Date'] = pd.to_datetime(working['Date'])

    # Create Month-Year column (the stratification variable)
    working['Month-Year'] = working['Date'].dt.to_period('M')
    month_labels = working['Month-Year']

    # Calculate sample sizes; value_counts skips rows whose month is missing
    monthly_counts = month_labels.value_counts()
    monthly_sample_sizes = (monthly_counts * sample_fraction).round().astype(int)

    sampling_stats = {
        'original_total': len(df),
        'monthly_stats': {}
    }

    # Collect the per-month samples and concatenate once at the end instead of
    # re-copying an ever-growing frame on every iteration.
    monthly_samples = []
    for month, sample_size in monthly_sample_sizes.items():
        sample_size = int(sample_size)
        monthly_data = working[month_labels == month]

        monthly_samples.append(
            monthly_data.sample(n=sample_size, random_state=random_state)
        )

        sampling_stats['monthly_stats'][str(month)] = {
            'original_count': len(monthly_data),
            'sampled_count': sample_size,
            'sampling_ratio': (sample_size / len(monthly_data)) * 100
        }

    if monthly_samples:
        sampled_df = pd.concat(monthly_samples, ignore_index=True)
    else:
        # No month strata at all (empty input or only missing dates):
        # return an empty frame that still has the expected columns.
        sampled_df = working.iloc[0:0].reset_index(drop=True)

    # Add final statistics; an empty input has a 0% ratio rather than 0/0
    sampling_stats['sampled_total'] = len(sampled_df)
    sampling_stats['overall_sampling_ratio'] = (
        (len(sampled_df) / len(df)) * 100 if len(df) else 0.0
    )

    return sampled_df, sampling_stats

def print_simple_summary(stats):
    """
    Write the sampling statistics to stdout as a short text report.

    The report has a totals section (original rows, sampled rows, overall
    ratio) followed by one table row per month, ordered by month label.
    """
    divider = "-" * 60

    print("\nTotal Records:")
    print("Original: {:,}".format(stats['original_total']))
    print("Sampled: {:,}".format(stats['sampled_total']))
    print("Overall sampling ratio: {:.1f}%".format(stats['overall_sampling_ratio']))

    print("\nMonthly Breakdown:")
    print(divider)
    print("Month-Year | Original Count | Sampled Count | Sampling Ratio")
    print(divider)

    per_month = stats['monthly_stats']
    for month_label in sorted(per_month):
        row = per_month[month_label]
        cells = (
            f"{month_label:9}",
            f"{row['original_count']:13,d}",
            f"{row['sampled_count']:12,d}",
            f"{row['sampling_ratio']:13.1f}%",
        )
        print(" | ".join(cells))


# The summary dict gets its own name on purpose: binding it to `stats` would
# shadow the `stats` module imported from scipy at the top of the notebook.
sampled_df, sampling_stats = simple_stratified_sample(df)
print_simple_summary(sampling_stats)


Total Records:
Original: 5,336,042
Sampled: 533,604
Overall sampling ratio: 10.0%

Monthly Breakdown:
------------------------------------------------------------
Month-Year | Original Count | Sampled Count | Sampling Ratio
------------------------------------------------------------
2023-05   |       512,228 |       51,223 |          10.0%
2023-06   |       584,517 |       58,452 |          10.0%
2023-07   |       650,399 |       65,040 |          10.0%
2023-08   |       676,439 |       67,644 |          10.0%
2023-09   |       673,179 |       67,318 |          10.0%
2023-10   |       544,630 |       54,463 |          10.0%
2023-11   |       361,306 |       36,131 |          10.0%
2023-12   |       237,125 |       23,712 |          10.0%
2024-01   |       189,684 |       18,968 |          10.0%
2024-02   |       242,123 |       24,212 |          10.0%
2024-03   |       289,239 |       28,924 |          10.0%
2024-04   |       375,173 |       37,517 |          10.0%


# Rationale for Comparing Original and Sampled Datasets

## Introduction
After creating a stratified sample, it's crucial to validate that the sample accurately represents the original dataset. This comparison ensures that any analyses performed on the sample will yield reliable insights applicable to the full dataset.

## Why Compare Datasets?

### 1. Validation of Sampling Method
- Verify that stratified sampling maintained data characteristics
- Ensure no systematic bias was introduced
- Confirm representation of key patterns and relationships

### 2. Statistical Integrity
- Verify distribution shapes
- Confirm maintenance of central tendencies
- Validate spread and variability measures

### 3. Quality Assurance
- Detect any potential sampling errors
- Identify any lost information
- Ensure reliability for further analysis

## Comparison Methodology

### 1. Descriptive Statistics
- Compare basic statistics between datasets:
  * Mean, median, standard deviation
  * Minimum and maximum values
  * Quartile values
  * Distribution shapes

### 2. Statistical Tests
- Kolmogorov-Smirnov (KS) test
  * Tests if samples come from same distribution
  * p-value > 0.05 means no statistically significant difference between the distributions was detected
- Chi-square test for categorical variables
  * Tests if categorical distributions match
  * Validates user type and temporal patterns

### 3. Outlier Analysis
- Z-score method (|z| > 3)
  * Identifies extreme values
  * Compares outlier proportions
- IQR method (1.5 * IQR)
  * Checks for distribution tails
  * Validates range of values

## Key Metrics for Comparison

### 1. Numeric Variables
- Trip Duration (min)
- Distance (km)
- Speed (km/h)
- Temperature
- Wind Speed
- Relative Humidity

### 2. Categorical Variables
- User Type
- Day of Week
- Hour of Day

### 3. Temporal Patterns
- Hourly distribution
- Daily patterns
- Monthly trends

## Success Criteria

### 1. Statistical Similarity
- p-values > 0.05 in statistical tests
- Percent differences < 1% in key metrics
- Similar outlier proportions

### 2. Distribution Matching
- Similar shapes in distributions
- Maintained relationships between variables
- Preserved temporal patterns

### 3. Practical Significance
- Differences should not affect analysis conclusions
- Maintained business-relevant patterns
- Preserved key relationships for modeling

## Importance of Comprehensive Comparison

### 1. Research Validity
- Ensures sample can be used for analysis
- Validates generalizability of findings
- Supports research conclusions

### 2. Business Impact
- Confirms reliability for decision-making
- Validates patterns important for operations
- Ensures representative insights



In [4]:
def compare_bikeshare_stats(df, sampled_df):
    """
    Compare statistical parameters between full and sampled bikeshare datasets.

    Parameters:
    -----------
    df : pandas.DataFrame
        Full bikeshare dataset
    sampled_df : pandas.DataFrame
        Sampled dataset

    Returns:
    --------
    dict
        Dictionary containing comparison statistics, with four keys:
        'numeric_stats' (describe() output for both datasets, percent
        difference of mean/std/median and a two-sample KS p-value, per
        numeric column present in both frames), 'categorical_distributions'
        (category shares for both datasets and a chi-square p-value),
        'temporal_patterns' (share of rows per 'Hour' value) and
        'statistical_tests' (left empty by this function).

    Raises:
    -------
    KeyError
        If 'User Type', 'Day of Week' or 'Hour' is missing from either frame.
    """
    # Imported locally under its own alias so the tests below do not depend on
    # the notebook-level name `stats`, which may be rebound to another object.
    from scipy import stats as scipy_stats

    # Define key numeric columns for detailed analysis
    numeric_cols = [
        'Trip Duration (min)',
        'Distance (km)',
        'Speed (km/h)',
        'Temperature',
        'Wind Speed',
        'Relative Humidity'
    ]

    # Initialize results dictionary
    results = {
        'numeric_stats': {},
        'categorical_distributions': {},
        'temporal_patterns': {},
        'statistical_tests': {}
    }

    # 1. Numeric Statistics Comparison (columns absent from either frame are skipped)
    for col in numeric_cols:
        if col in df.columns and col in sampled_df.columns:
            full_stats = df[col].describe()
            sample_stats = sampled_df[col].describe()

            # Percent difference relative to the full dataset; a statistic
            # that is exactly zero in the full data is left out (no baseline).
            pct_diff = {
                stat: ((sample_stats[stat] - full_stats[stat]) / full_stats[stat] * 100)
                for stat in ['mean', 'std', '50%']
                if full_stats[stat] != 0
            }

            ks_result = scipy_stats.ks_2samp(
                df[col].dropna(),
                sampled_df[col].dropna()
            )

            results['numeric_stats'][col] = {
                'full': full_stats.to_dict(),
                'sampled': sample_stats.to_dict(),
                'percent_difference': pct_diff,
                'ks_test_pvalue': ks_result.pvalue
            }

    # 2. Categorical Distributions
    categorical_cols = ['User Type', 'Day of Week']
    for col in categorical_cols:
        full_dist = df[col].value_counts(normalize=True)
        sample_dist = sampled_df[col].value_counts(normalize=True)

        # Category x dataset table of raw counts. Building the frame from two
        # Series aligns them on category; a category missing from one dataset
        # gets a count of 0 there.
        contingency = pd.DataFrame({
            'Full': df[col].value_counts(),
            'Sampled': sampled_df[col].value_counts()
        }).fillna(0)
        # Rows that are zero in both datasets (e.g. unused categories of a
        # categorical dtype) would give zero expected frequencies.
        contingency = contingency.loc[contingency.sum(axis=1) > 0]

        results['categorical_distributions'][col] = {
            'full': full_dist.to_dict(),
            'sampled': sample_dist.to_dict(),
            # Index 1 of the chi2_contingency result is the p-value.
            'chi_square_pvalue': scipy_stats.chi2_contingency(contingency)[1]
        }

    # 3. Temporal Patterns
    # Hour of day distribution
    full_hourly = df['Hour'].value_counts(normalize=True)
    sample_hourly = sampled_df['Hour'].value_counts(normalize=True)
    results['temporal_patterns']['hourly'] = {
        'full': full_hourly.to_dict(),
        'sampled': sample_hourly.to_dict()
    }

    return results

def print_comparison_results(results):
    """
    Render the comparison dictionary as plain-text tables on stdout.

    Three sections are written in order: numeric columns (selected describe()
    metrics, percent difference where one was computed, KS p-value),
    categorical columns (share per category in percent, chi-square p-value)
    and the hour-of-day shares for hours 0-23.
    """
    section_rule = "=" * 80
    table_rule = "-" * 50

    # --- numeric columns ---
    print("\nNUMERIC VARIABLES COMPARISON")
    print(section_rule)
    for column_name in results['numeric_stats']:
        entry = results['numeric_stats'][column_name]
        print(f"\n{column_name}:")
        print(table_rule)
        print(f"{'Metric':<12} {'Full':>12} {'Sampled':>12} {'% Diff':>10}")
        print(table_rule)
        for metric in ('mean', 'std', '50%', 'min', 'max'):
            full_value = entry['full'][metric]
            sampled_value = entry['sampled'][metric]
            differences = entry['percent_difference']
            # Only some metrics carry a percent difference; the rest show N/A.
            if metric in differences:
                diff_cell = f"{differences[metric]:>10.2f}%"
            else:
                diff_cell = f"{'N/A':>10}"
            print(f"{metric:<12} {full_value:>12.2f} {sampled_value:>12.2f} {diff_cell}")
        print(f"KS test p-value: {entry['ks_test_pvalue']:.4f}")

    # --- categorical columns ---
    print("\nCATEGORICAL VARIABLES COMPARISON")
    print(section_rule)
    for column_name in results['categorical_distributions']:
        entry = results['categorical_distributions'][column_name]
        print(f"\n{column_name}:")
        print(table_rule)
        print(f"{'Category':<15} {'Full %':>10} {'Sampled %':>10} {'Diff':>10}")
        print(table_rule)
        full_shares = entry['full']
        sampled_shares = entry['sampled']
        # Union of both key sets so a category missing on one side still gets a row.
        for category in sorted(set(full_shares) | set(sampled_shares)):
            full_pct = full_shares.get(category, 0) * 100
            sampled_pct = sampled_shares.get(category, 0) * 100
            delta = sampled_pct - full_pct
            print(f"{str(category):<15} {full_pct:>10.2f} {sampled_pct:>10.2f} {delta:>10.2f}")
        print(f"Chi-square test p-value: {entry['chi_square_pvalue']:.4f}")

    # --- hourly shares ---
    print("\nHOURLY DISTRIBUTION")
    print(section_rule)
    print(f"{'Hour':<6} {'Full %':>10} {'Sampled %':>10} {'Diff':>10}")
    print("-" * 40)
    hourly = results['temporal_patterns']['hourly']
    for hour in range(24):
        full_pct = hourly['full'].get(hour, 0) * 100
        sampled_pct = hourly['sampled'].get(hour, 0) * 100
        delta = sampled_pct - full_pct
        print(f"{hour:<6} {full_pct:>10.2f} {sampled_pct:>10.2f} {delta:>10.2f}")

# Compare the full dataset against the stratified sample (descriptive
# statistics, KS and chi-square p-values, hourly shares) and print the report.
comparison_results = compare_bikeshare_stats(df, sampled_df)
print_comparison_results(comparison_results)




NUMERIC VARIABLES COMPARISON

Trip Duration (min):
--------------------------------------------------
Metric               Full      Sampled     % Diff
--------------------------------------------------
mean                14.56        14.57       0.02%
std                 13.34        13.28      -0.40%
50%                 11.00        11.00       0.00%
min                  1.00         1.00        N/A
max                300.00       299.00        N/A
KS test p-value: 0.6495

Distance (km):
--------------------------------------------------
Metric               Full      Sampled     % Diff
--------------------------------------------------
mean                 1.97         1.97       0.02%
std                  1.41         1.41       0.20%
50%                  1.60         1.60      -0.11%
min                  0.01         0.02        N/A
max                 28.88        25.36        N/A
KS test p-value: 0.5851

Speed (km/h):
--------------------------------------------------
Metric  

In [6]:
def _zscore_outlier_summary(series):
    """Summarize the |z| > 3 outliers of one numeric column; missing values are ignored."""
    # Imported locally under its own alias so this does not depend on the
    # notebook-level name `stats`, which may be rebound to another object.
    from scipy import stats as scipy_stats

    abs_z = np.abs(np.asarray(scipy_stats.zscore(series, nan_policy='omit')))
    is_outlier = abs_z > 3  # NaN z-scores compare as False, so they are never outliers
    n_outliers = int(np.sum(is_outlier))

    # Missing values stay NaN in the z-score array; take the maximum over the
    # real scores only, otherwise a single NaN would make the maximum NaN.
    valid_z = abs_z[~np.isnan(abs_z)]

    return {
        'outliers_count': n_outliers,
        'outliers_percentage': (n_outliers / len(series)) * 100 if len(series) else 0.0,
        'max_zscore': float(valid_z.max()) if valid_z.size else float('nan'),
        'outlier_values': series[is_outlier].describe().to_dict()
    }


def _iqr_outlier_summary(series):
    """Summarize the values of one numeric column lying beyond 1.5 * IQR from the quartiles."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = series[(series < lower_bound) | (series > upper_bound)]

    return {
        'Q1': q1,
        'Q3': q3,
        'IQR': iqr,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'outliers_count': len(outliers),
        'outliers_percentage': (len(outliers) / len(series)) * 100 if len(series) else 0.0,
        'outlier_values': outliers.describe().to_dict()
    }


def analyze_outliers(df, sampled_df, columns=None):
    """
    Analyze outliers in both datasets using Z-score and IQR methods.

    Parameters:
    -----------
    df : pandas.DataFrame
        Full dataset
    sampled_df : pandas.DataFrame
        Sampled dataset
    columns : list, optional
        List of numeric columns to analyze. If None, a fixed default list is
        used: trip duration, distance, speed, temperature, wind speed and
        relative humidity.

    Returns:
    --------
    dict
        Dictionary containing outlier analysis results:
        'zscore' and 'iqr' each map 'full' / 'sampled' to a per-column
        summary (count, percentage of all rows, descriptive statistics of the
        outlying values, plus the method-specific bounds or maximum z-score);
        'summary' holds, per column, the absolute difference in outlier
        percentage between the two datasets for each method.

    Raises:
    -------
    KeyError
        If a requested column is missing from either frame.
    """
    if columns is None:
        columns = ['Trip Duration (min)', 'Distance (km)', 'Speed (km/h)',
                  'Temperature', 'Wind Speed', 'Relative Humidity']

    results = {
        'zscore': {'full': {}, 'sampled': {}},
        'iqr': {'full': {}, 'sampled': {}},
        'summary': {}
    }

    for col in columns:
        # Each dataset is measured against its own mean/std and its own quartiles.
        for label, frame in (('full', df), ('sampled', sampled_df)):
            results['zscore'][label][col] = _zscore_outlier_summary(frame[col])
            results['iqr'][label][col] = _iqr_outlier_summary(frame[col])

        # Summary comparison
        results['summary'][col] = {
            'zscore_diff': abs(results['zscore']['full'][col]['outliers_percentage'] -
                               results['zscore']['sampled'][col]['outliers_percentage']),
            'iqr_diff': abs(results['iqr']['full'][col]['outliers_percentage'] -
                            results['iqr']['sampled'][col]['outliers_percentage'])
        }

    return results

def print_outlier_analysis(results):
    """
    Write the outlier analysis to stdout, one section per analyzed column.

    Each section shows the z-score table, the IQR table and a small table of
    descriptive statistics (count, mean, min, max) for the z-score outliers,
    always with the full dataset first and the sampled dataset second.
    """
    # (row label, key inside the results dictionary)
    datasets = (('Full', 'full'), ('Sampled', 'sampled'))

    print("\nOUTLIER ANALYSIS RESULTS")
    print("=" * 100)

    for column_name in results['zscore']['full']:
        print(f"\n{column_name}:")
        print("-" * 80)

        # Z-score table
        print("\nZ-score Method (|z| > 3):")
        print(f"{'Dataset':<10} {'Outliers Count':>15} {'Percentage':>12} {'Max Z-score':>12}")
        print("-" * 55)
        for row_label, key in datasets:
            z_info = results['zscore'][key][column_name]
            count_cell = f"{z_info['outliers_count']:>15,d}"
            share_cell = f"{z_info['outliers_percentage']:>11.2f}%"
            max_cell = f"{z_info['max_zscore']:>12.2f}"
            print(f"{row_label:<10} {count_cell} {share_cell} {max_cell}")

        # IQR table
        print("\nIQR Method (1.5 * IQR):")
        print(f"{'Dataset':<10} {'Outliers Count':>15} {'Percentage':>12} "
              f"{'Lower Bound':>12} {'Upper Bound':>12}")
        print("-" * 70)
        for row_label, key in datasets:
            iqr_info = results['iqr'][key][column_name]
            count_cell = f"{iqr_info['outliers_count']:>15,d}"
            share_cell = f"{iqr_info['outliers_percentage']:>11.2f}%"
            lower_cell = f"{iqr_info['lower_bound']:>12.2f}"
            upper_cell = f"{iqr_info['upper_bound']:>12.2f}"
            print(f"{row_label:<10} {count_cell} {share_cell} {lower_cell} {upper_cell}")

        # Descriptive statistics of the z-score outliers
        print("\nZ-score Outlier Statistics:")
        print(f"{'Metric':<10} {'Full':>12} {'Sampled':>12}")
        print("-" * 35)
        for metric in ('count', 'mean', 'min', 'max'):
            full_value = results['zscore']['full'][column_name]['outlier_values'][metric]
            sampled_value = results['zscore']['sampled'][column_name]['outlier_values'][metric]
            print(f"{metric:<10} {full_value:>12.2f} {sampled_value:>12.2f}")

# Run the outlier comparison on the three trip metrics only; passing an
# explicit column list overrides the function's default list, which also
# contains the weather columns.
columns_to_analyze = ['Trip Duration (min)', 'Distance (km)', 'Speed (km/h)']
outlier_results = analyze_outliers(df, sampled_df, columns=columns_to_analyze)
print_outlier_analysis(outlier_results)


OUTLIER ANALYSIS RESULTS

Trip Duration (min):
--------------------------------------------------------------------------------

Z-score Method (|z| > 3):
Dataset     Outliers Count   Percentage  Max Z-score
-------------------------------------------------------
Full                89,963        1.69%        21.40
Sampled              8,973        1.68%        21.42

IQR Method (1.5 * IQR):
Dataset     Outliers Count   Percentage  Lower Bound  Upper Bound
----------------------------------------------------------------------
Full               292,388        5.48%        -9.50        34.50
Sampled             29,369        5.50%        -9.50        34.50

Z-score Outlier Statistics:
Metric             Full      Sampled
-----------------------------------
count          89963.00      8973.00
mean              82.05        81.76
min               55.00        55.00
max              300.00       299.00

Distance (km):
---------------------------------------------------------------------

# Statistical Analysis of Toronto Bikeshare Data Sampling

## 1. Overview of Statistical Comparison

### 1.1 Numeric Variables Analysis
The comparison of numeric variables shows exceptional consistency between the full and sampled datasets:

#### Trip Duration
- Mean difference: 0.02% (14.56 vs 14.57 minutes)
- Standard deviation difference: -0.40% (13.34 vs 13.28)
- KS test p-value: 0.6495 (no significant difference between the distributions detected)
- Range maintained: 1-300 minutes (full) vs 1-299 minutes (sampled)

#### Distance
- Mean difference: 0.02% (1.97 km in both)
- Standard deviation difference: 0.20%
- KS test p-value: 0.5851
- Maximum values differ (28.88 km vs 25.36 km): the longest trips are rare, and the single longest one was not drawn into the sample

#### Speed
- Mean difference: -0.05% (9.25 vs 9.24 km/h)
- Standard deviation difference: -0.09%
- KS test p-value: 0.7388
- Consistent range: 0.50-39.98 km/h vs 0.50-39.88 km/h

### 1.2 Categorical Variables
The sampling maintained nearly identical categorical distributions:

#### User Type Distribution
- Annual Members: 5.74% in both datasets (diff: -0.01)
- Casual Members: 94.26% in both datasets (diff: 0.01)
- Chi-square p-value: 0.8781 (no significant difference detected)

#### Day of Week Distribution
- Maximum difference: -0.07% (Tuesday)
- Most days show differences < 0.05%
- Chi-square p-value: 0.8862
- Maintained weekly patterns

### 1.3 Hourly Distribution
Temporal patterns were preserved with high accuracy:

- Peak hour (17:00): 10.58% vs 10.56% (diff: -0.02%)
- Morning peak (08:00): 6.60% vs 6.61% (diff: 0.01%)
- Maximum difference: 0.06% at 18:00
- Maintained 24-hour usage pattern

## 2. Outlier Analysis Results

### 2.1 Trip Duration Outliers

#### Z-score Method (|z| > 3)
- Full dataset: 1.69% outliers (89,963 trips)
- Sampled dataset: 1.68% outliers (8,973 trips)
- Very consistent outlier percentage
- Similar max z-scores: 21.40 vs 21.42
- Outlier characteristics:
  * Mean duration: 82.05 vs 81.76 minutes
  * Range: 55-300 vs 55-299 minutes

#### IQR Method
- Full dataset: 5.48% outliers (292,388 trips)
- Sampled dataset: 5.50% outliers (29,369 trips)
- Identical boundaries: [-9.50, 34.50] minutes
- Higher detection rate than z-score method

### 2.2 Distance Outliers

#### Z-score Method (|z| > 3)
- Full dataset: 1.88% outliers (100,560 trips)
- Sampled dataset: 1.88% outliers (10,030 trips)
- Max z-scores: 19.08 vs 16.55
- Outlier characteristics:
  * Mean distance: 7.46 vs 7.47 km
  * Range: 6.20-28.88 vs 6.21-25.36 km

#### IQR Method
- Full dataset: 4.84% outliers (258,486 trips)
- Sampled dataset: 4.85% outliers (25,890 trips)
- Nearly identical boundaries: [-1.31, 4.82] vs [-1.32, 4.83] km

### 2.3 Speed Outliers

#### Z-score Method (|z| > 3)
- Full dataset: 0.29% outliers (15,526 trips)
- Sampled dataset: 0.28% outliers (1,515 trips)
- Similar max z-scores: 9.08 vs 9.06
- Outlier characteristics:
  * Mean speed: 21.63 vs 21.54 km/h
  * Range: 19.40-39.98 vs 19.39-39.88 km/h

#### IQR Method
- Full dataset: 2.26% outliers (120,829 trips)
- Sampled dataset: 2.29% outliers (12,199 trips)
- Consistent boundaries: [1.14, 17.49] vs [1.14, 17.48] km/h

## 3. Key Findings

### 3.1 Sampling Quality
1. No statistical test detected a difference between the datasets (all p-values > 0.5)
2. Percentage differences consistently < 0.5%
3. Maintained distributions across all variable types
4. Preserved temporal patterns and user type distributions

### 3.2 Outlier Representation
1. Consistent outlier percentages across both methods
2. Maintained extreme value characteristics
3. Similar outlier boundaries and statistics
4. Proper representation of unusual trips

### 3.3 Practical Implications
1. Sample is highly representative of full dataset
2. Suitable for detailed analysis and modeling
3. Maintains data quality and characteristics
4. Captures both normal and unusual patterns

In [5]:
# Write the stratified sample to CSV; index=False leaves the row index out of the file.
sampled_df.to_csv('Toronto Bikeshare Sampled dataset.csv', index=False)