# Data Exploration: Canadian Mining Permits - Time and Confidence Analysis

This notebook explores the mining permit dataset to understand patterns in approval time and confidence levels that could help predict both estimated approval duration and likelihood of success.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add src to path
sys.path.append('../src')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Data

First, let's generate or load sample data for exploration.

In [None]:
from data.data_collection import create_sample_data, load_permit_data
from utils.config import load_config, get_data_path

# Resolve the raw-data directory from the project configuration
config = load_config('../config.yaml')
raw_data_path = get_data_path(config, 'raw')

# Load the sample CSV if it already exists; otherwise call create_sample_data
sample_file = raw_data_path / 'sample_permits.csv'
sample_available = sample_file.exists()
print("Loading existing sample data..." if sample_available else "Creating sample data...")
df = (
    load_permit_data(sample_file)
    if sample_available
    else create_sample_data(raw_data_path, n_samples=1000)
)

# Quick overview of what was loaded
for heading, detail in (("Dataset shape", df.shape), ("Columns", list(df.columns))):
    print(f"\n{heading}: {detail}")
print("\nFirst few rows:")
df.head()

## 2. Basic Statistics

In [None]:
# Dataset info
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
print("Dataset Information:")
df.info()

# Summary statistics; left as the last expression so the notebook renders
# the resulting table.
print("\nBasic Statistics:")
df.describe()

In [None]:
# Count nulls per column and report only the columns that have any
missing = df.isnull().sum()
columns_with_gaps = missing[missing > 0]
if columns_with_gaps.empty:
    print("No missing values found!")
else:
    print("Missing values:")
    print(columns_with_gaps)

## 3. Target Variables Analysis - Approval Time & Confidence

In [None]:
# Analyze approval time distribution
avg_time = df['approval_time_months'].mean()
median_time = df['approval_time_months'].median()
print(f"Average Approval Time: {avg_time:.1f} months")
print(f"Median Approval Time: {median_time:.1f} months")

# Analyze confidence distribution (printed most-frequent first)
confidence_dist = df['approval_confidence'].value_counts()
print(f"\nConfidence Distribution:")
for level, count in confidence_dist.items():
    percentage = (count / len(df)) * 100
    print(f"  {level}: {count} ({percentage:.1f}%)")

# Plot approval time and confidence distributions
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Approval time histogram with mean/median reference lines
axes[0,0].hist(df['approval_time_months'], bins=20, edgecolor='black', alpha=0.7)
axes[0,0].axvline(avg_time, color='red', linestyle='--', label=f'Mean: {avg_time:.1f}')
axes[0,0].axvline(median_time, color='orange', linestyle='--', label=f'Median: {median_time:.1f}')
axes[0,0].set_title('Distribution of Approval Times')
axes[0,0].set_xlabel('Approval Time (months)')
axes[0,0].set_ylabel('Frequency')
axes[0,0].legend()

# Confidence level bar chart.
# value_counts() orders levels by frequency, so plotting it directly would
# tie each colour to whichever level happens to be most common. Sorting by
# label gives the same level order (and therefore the same colours) as the
# grouped confidence charts built with groupby(...).unstack() in the
# province and mining-type sections.
confidence_dist.sort_index().plot(kind='bar', ax=axes[0,1],
                                  color=['lightcoral', 'lightblue', 'lightgreen'])
axes[0,1].set_title('Confidence Level Distribution')
axes[0,1].set_xlabel('Confidence Level')
axes[0,1].set_ylabel('Count')
axes[0,1].tick_params(axis='x', rotation=0)

# Box plot of approval time by confidence level
df.boxplot(column='approval_time_months', by='approval_confidence', ax=axes[1,0])
# pandas adds a figure-level "Boxplot grouped by ..." super-title when `by`
# is used; clear it so it does not sit on top of the whole 2x2 grid.
fig.suptitle('')
axes[1,0].set_title('Approval Time by Confidence Level')
axes[1,0].set_xlabel('Confidence Level')
axes[1,0].set_ylabel('Approval Time (months)')

# Time vs probability scatter plot
axes[1,1].scatter(df['approval_time_months'], df['approval_probability'], alpha=0.6)
axes[1,1].set_title('Approval Time vs Probability')
axes[1,1].set_xlabel('Approval Time (months)')
axes[1,1].set_ylabel('Approval Probability')

plt.tight_layout()
plt.show()

## 4. Geographic Analysis

In [None]:
def _most_common(values):
    """Return the most frequent non-null value in `values`, or NaN if there is none.

    Guards the `value_counts().index[0]` lookup, which raises IndexError for
    a group whose values are all missing.
    """
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan


# Approval time and confidence by province
province_stats = df.groupby('province').agg({
    'approval_time_months': ['mean', 'median', 'std'],
    'approval_probability': ['mean', 'std'],
    'approval_confidence': _most_common  # most common
}).round(2)

# Flatten the (column, statistic) MultiIndex; the order below must match the
# order of the aggregations in the dict above.
province_stats.columns = ['Avg_Time', 'Median_Time', 'Time_Std', 'Avg_Probability', 'Prob_Std', 'Most_Common_Confidence']
province_stats = province_stats.sort_values('Avg_Time')

print("Provincial Statistics:")
print(province_stats)

# Visualize provincial differences
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Average approval time by province
province_stats['Avg_Time'].plot(kind='barh', ax=axes[0,0])
axes[0,0].set_title('Average Approval Time by Province')
axes[0,0].set_xlabel('Average Time (months)')

# Approval probability by province
province_stats['Avg_Probability'].plot(kind='barh', ax=axes[0,1], color='orange')
axes[0,1].set_title('Average Approval Probability by Province')
axes[0,1].set_xlabel('Average Probability')

# Confidence distribution by province (stacked bar)
conf_by_prov = df.groupby(['province', 'approval_confidence']).size().unstack(fill_value=0)
conf_by_prov.plot(kind='bar', stacked=True, ax=axes[1,0], color=['lightcoral', 'lightblue', 'lightgreen'])
axes[1,0].set_title('Confidence Distribution by Province')
axes[1,0].set_xlabel('Province')
axes[1,0].set_ylabel('Count')
axes[1,0].tick_params(axis='x', rotation=45)

# Time variability by province
province_stats['Time_Std'].plot(kind='bar', ax=axes[1,1], color='purple')
axes[1,1].set_title('Approval Time Variability by Province')
axes[1,1].set_xlabel('Province')
axes[1,1].set_ylabel('Standard Deviation (months)')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Mining Type Analysis - Impact on Time and Confidence

In [None]:
# Analysis by mining type
mining_stats = df.groupby('mining_type').agg({
    'approval_time_months': ['mean', 'median', 'std'],
    'approval_probability': 'mean',
    'project_area': 'mean',
    'expected_employment': 'mean'
}).round(2)

# Flatten the (column, statistic) MultiIndex; order matches the dict above.
mining_stats.columns = ['Avg_Time', 'Median_Time', 'Time_Std', 'Avg_Probability', 'Avg_Area', 'Avg_Employment']

print("Mining Type Statistics:")
print(mining_stats)

# Visualize mining type differences
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Time by mining type
df.boxplot(column='approval_time_months', by='mining_type', ax=axes[0,0])
# pandas adds a figure-level "Boxplot grouped by ..." super-title when `by`
# is used; clear it so it does not sit on top of the whole 2x2 grid.
fig.suptitle('')
axes[0,0].set_title('Approval Time by Mining Type')
axes[0,0].set_xlabel('Mining Type')
axes[0,0].set_ylabel('Approval Time (months)')

# Confidence by mining type
conf_by_mining = df.groupby(['mining_type', 'approval_confidence']).size().unstack(fill_value=0)
conf_by_mining.plot(kind='bar', ax=axes[0,1], color=['lightcoral', 'lightblue', 'lightgreen'])
axes[0,1].set_title('Confidence Distribution by Mining Type')
axes[0,1].set_xlabel('Mining Type')
axes[0,1].set_ylabel('Count')
axes[0,1].tick_params(axis='x', rotation=45)

# One pass over the mining types feeds both scatter panels:
#   bottom-left  -> project size vs approval time
#   bottom-right -> employment vs approval probability
for mining_type in df['mining_type'].unique():
    subset = df[df['mining_type'] == mining_type]
    axes[1,0].scatter(subset['project_area'], subset['approval_time_months'],
                      label=mining_type, alpha=0.6)
    axes[1,1].scatter(subset['expected_employment'], subset['approval_probability'],
                      label=mining_type, alpha=0.6)

axes[1,0].set_xlabel('Project Area (hectares)')
axes[1,0].set_ylabel('Approval Time (months)')
axes[1,0].set_title('Project Size vs Approval Time by Mining Type')
axes[1,0].legend()

axes[1,1].set_xlabel('Expected Employment')
axes[1,1].set_ylabel('Approval Probability')
axes[1,1].set_title('Employment vs Approval Probability by Mining Type')
axes[1,1].legend()

plt.tight_layout()
plt.show()

## 6. Environmental Factors Impact on Approval Process

In [None]:
# Analyze environmental factors impact
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Environmental assessment score vs approval time
axes[0,0].scatter(df['environmental_assessment_score'], df['approval_time_months'], alpha=0.6)
axes[0,0].set_xlabel('Environmental Assessment Score')
axes[0,0].set_ylabel('Approval Time (months)')
axes[0,0].set_title('Environmental Score vs Approval Time')

# Distance to protected area vs approval time
axes[0,1].scatter(df['distance_to_protected_area'], df['approval_time_months'], alpha=0.6, color='orange')
axes[0,1].set_xlabel('Distance to Protected Area (km)')
axes[0,1].set_ylabel('Approval Time (months)')
axes[0,1].set_title('Distance to Protected Area vs Time')

# Distance to water vs approval probability
axes[0,2].scatter(df['distance_to_water'], df['approval_probability'], alpha=0.6, color='green')
axes[0,2].set_xlabel('Distance to Water (km)')
axes[0,2].set_ylabel('Approval Probability')
axes[0,2].set_title('Distance to Water vs Probability')

# Public opposition vs approval time
axes[1,0].scatter(df['public_opposition_percentage'], df['approval_time_months'], alpha=0.6, color='red')
axes[1,0].set_xlabel('Public Opposition (%)')
axes[1,0].set_ylabel('Approval Time (months)')
axes[1,0].set_title('Public Opposition vs Approval Time')

# Environmental score distribution by confidence (overlaid histograms)
for conf_level in df['approval_confidence'].unique():
    subset = df[df['approval_confidence'] == conf_level]
    axes[1,1].hist(subset['environmental_assessment_score'], alpha=0.6, label=conf_level, bins=15)
axes[1,1].set_xlabel('Environmental Assessment Score')
axes[1,1].set_ylabel('Frequency')
axes[1,1].set_title('Environmental Score by Confidence Level')
axes[1,1].legend()

# Indigenous land proximity impact.
# right=False makes the bins [0, 5), [5, 20) and [20, inf): the intervals then
# agree with the "<5km" label, and a distance of exactly 0 km is binned as
# "Very Close" instead of becoming NaN (the default right-closed bins
# exclude the lowest edge).
df['indigenous_proximity'] = pd.cut(df['distance_to_indigenous_land'],
                                   bins=[0, 5, 20, float('inf')],
                                   labels=['Very Close (<5km)', 'Close (5-20km)', 'Far (>20km)'],
                                   right=False)
# observed=False keeps a bar for every proximity category even when one is
# empty, and states the choice explicitly instead of relying on the default
# for categorical groupers (which newer pandas versions warn about).
indigenous_stats = df.groupby('indigenous_proximity', observed=False)['approval_time_months'].mean()
indigenous_stats.plot(kind='bar', ax=axes[1,2], color='purple')
axes[1,2].set_title('Avg Approval Time by Indigenous Land Proximity')
axes[1,2].set_ylabel('Approval Time (months)')
axes[1,2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. Company Factors

In [None]:
# Company size impact: mean of the 'approved' flag per company size
company_approval = df.groupby('company_size')['approved'].mean().sort_values(ascending=False)

# Overall approval rate, drawn below as a reference line. It is computed here
# because no earlier cell defines it, so the axhline call would otherwise
# raise NameError on a fresh kernel.
approval_rate = df['approved'].mean()

fig, ax = plt.subplots(1, 2, figsize=(14, 5))

company_approval.plot(kind='bar', ax=ax[0])
ax[0].set_title('Approval Rate by Company Size')
ax[0].set_ylabel('Approval Rate')
ax[0].axhline(approval_rate, color='red', linestyle='--', alpha=0.5)

# Compliance history vs approval
df.boxplot(column='company_compliance_history', by='approved', ax=ax[1])
# pandas adds a figure-level "Boxplot grouped by ..." super-title when `by`
# is used; clear it so it does not sit above both panels.
fig.suptitle('')
ax[1].set_title('Company Compliance History by Approval Status')
ax[1].set_xlabel('Approved (0=No, 1=Yes)')
ax[1].set_ylabel('Compliance History Score')
# Relabel the two box positions; assumes 'approved' has exactly the two
# values 0 and 1 -- TODO confirm against the dataset.
plt.sca(ax[1])
plt.xticks([1, 2], ['Rejected', 'Approved'])

plt.tight_layout()
plt.show()

## 8. Correlation Analysis

In [None]:
# Restrict to numeric columns and compute their pairwise correlations
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
correlation_matrix = df.loc[:, numerical_cols].corr()

# Annotated heatmap of the full correlation matrix
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Correlation of every numeric feature with the 'approved' column, strongest first
approval_corr = correlation_matrix['approved'].sort_values(ascending=False)
print("\nFeatures most correlated with approval:", approval_corr, sep="\n")

## 9. Project Scale Analysis

In [None]:
# Scatter plots for project characteristics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One pass per approval status feeds both scatter panels:
#   left   -> project area vs estimated duration
#   middle -> expected employment vs project area
for approved in [0, 1]:
    data = df[df['approved'] == approved]
    axes[0].scatter(data['project_area'], data['estimated_duration'],
                    alpha=0.5, label=f"Approved={approved}")
    axes[1].scatter(data['expected_employment'], data['project_area'],
                    alpha=0.5, label=f"Approved={approved}")

axes[0].set_xlabel('Project Area (hectares)')
axes[0].set_ylabel('Estimated Duration (years)')
axes[0].set_title('Project Size vs Duration')
axes[0].legend()

axes[1].set_xlabel('Expected Employment')
axes[1].set_ylabel('Project Area (hectares)')
axes[1].set_title('Employment vs Project Area')
axes[1].legend()

# Public opposition vs approval
df.boxplot(column='public_opposition_percentage', by='approved', ax=axes[2])
# pandas adds a figure-level "Boxplot grouped by ..." super-title when `by`
# is used; clear it so it does not sit above all three panels.
fig.suptitle('')
axes[2].set_xlabel('Approved (0=No, 1=Yes)')
axes[2].set_ylabel('Public Opposition (%)')
axes[2].set_title('Public Opposition by Approval Status')
# Relabel the two box positions; assumes 'approved' has exactly the two
# values 0 and 1 -- TODO confirm against the dataset.
plt.sca(axes[2])
plt.xticks([1, 2], ['Rejected', 'Approved'])

plt.tight_layout()
plt.show()

## 10. Key Insights and Recommendations

Based on the analysis above, we can provide the following insights for predicting approval time and confidence:

In [None]:
# Summary insights: a plain-text recap built from `df` and the
# `province_stats` table computed in the geographic analysis cell.
approval_months = df['approval_time_months']
env_score = df['environmental_assessment_score']

print("KEY INSIGHTS FOR MINING PERMIT PREDICTION:")
print("=" * 50)

print("\n1. APPROVAL TIME FACTORS:")
print(f"   • Average approval time: {approval_months.mean():.1f} months")
print(f"   • Range: {approval_months.min():.1f} - {approval_months.max():.1f} months")
for line in (
    "   • Longer times typically associated with:",
    "     - Projects close to protected areas",
    "     - High public opposition",
    "     - Lower environmental assessment scores",
    "     - Indigenous land proximity",
):
    print(line)

print("\n2. CONFIDENCE LEVEL DISTRIBUTION:")
conf_pct = df['approval_confidence'].value_counts(normalize=True) * 100
for level in ('High', 'Medium', 'Low'):
    # Skip levels that do not occur in the data
    if level not in conf_pct:
        continue
    print(f"   • {level} confidence: {conf_pct[level]:.1f}%")

print("\n3. PROVINCIAL VARIATIONS:")
avg_time_by_province = province_stats['Avg_Time']
fastest_province = avg_time_by_province.idxmin()
slowest_province = avg_time_by_province.idxmax()
print(f"   • Fastest approvals: {fastest_province} ({avg_time_by_province.loc[fastest_province]:.1f} months)")
print(f"   • Slowest approvals: {slowest_province} ({avg_time_by_province.loc[slowest_province]:.1f} months)")

print("\n4. ENVIRONMENTAL IMPACT:")
# Mean approval time for projects at the high (>= 8) and low (<= 4) ends of
# the environmental assessment score
high_env_avg = approval_months[env_score >= 8].mean()
low_env_avg = approval_months[env_score <= 4].mean()
print(f"   • High env. score (8+): {high_env_avg:.1f} months average")
print(f"   • Low env. score (≤4): {low_env_avg:.1f} months average")

print("\n5. RECOMMENDATIONS FOR ML MODEL:")
for line in (
    "   • Use regression for time prediction (continuous target)",
    "   • Use multi-class classification for confidence (High/Medium/Low)",
    "   • Key features: environmental score, distances, public opposition, compliance history",
    "   • Consider provincial and mining type interactions",
    "   • Feature engineering: proximity categories, compliance scores",
):
    print(line)

## Next Steps

1. **Feature Engineering**: Create additional features based on insights
2. **Data Preprocessing**: Clean and prepare data for modeling
3. **Model Development**: Train and evaluate multiple models
4. **Model Interpretation**: Use SHAP values to explain predictions