# Data Exploration: Canadian Mining Permits

This notebook explores the mining permit dataset to understand patterns and relationships that could help predict approval likelihood.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add src to the import search path so later cells can import modules from it.
# The path is relative, so it resolves against the kernel's working directory
# (presumably the notebook's own folder -- confirm if the kernel starts elsewhere).
sys.path.append('../src')

# Notebook-wide plot defaults: seaborn's 'whitegrid' style and a (12, 6)
# default figure size for figures that do not pass their own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Data

First, load the sample permit data, creating it if the sample file does not exist yet.

In [None]:
from data.data_collection import create_sample_data, load_permit_data
from utils.config import load_config, get_data_path

# Load the configuration and look up the 'raw' data path from it
config = load_config('../config.yaml')
raw_data_path = get_data_path(config, 'raw')
sample_file = raw_data_path / 'sample_permits.csv'

# Reuse the sample CSV when it is already on disk; otherwise create it
if sample_file.exists():
    print("Loading existing sample data...")
    df = load_permit_data(sample_file)
else:
    print("Creating sample data...")
    df = create_sample_data(raw_data_path, n_samples=1000)

print(f"\nDataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()

## 2. Basic Statistics

In [None]:
# Dataset info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after the report.
print("Dataset Information:")
df.info()

# Summary statistics; left as the last expression so the notebook renders the
# resulting table with its rich display.
print("\nBasic Statistics:")
df.describe()

In [None]:
# Count nulls per column and keep only the columns that actually have gaps
missing = df.isnull().sum()
missing_cols = missing[missing > 0]

if missing_cols.empty:
    print("No missing values found!")
else:
    print("Missing values:")
    print(missing_cols)

## 3. Target Variable Analysis

In [None]:
# Approval rate (mean of the 0/1 'approved' flag)
approval_rate = df['approved'].mean()
print(f"Overall Approval Rate: {approval_rate:.2%}")

# Class counts ordered by class value rather than by frequency.
# value_counts() on its own sorts by count, so hard-coded
# ['Rejected', 'Approved'] labels would be swapped whenever approvals
# outnumber rejections. Labels are derived from the index values instead,
# which also keeps label and tick counts in sync if only one class is present.
# NOTE(review): assumes 'approved' is 0/1 (or boolean), as the axis label implies.
approval_counts = df['approved'].value_counts().sort_index()
approval_labels = ['Approved' if flag else 'Rejected' for flag in approval_counts.index]

# Plot approval distribution
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
approval_counts.plot(kind='bar', ax=ax[0])
ax[0].set_title('Permit Approval Distribution')
ax[0].set_xlabel('Approved (1=Yes, 0=No)')
ax[0].set_ylabel('Count')
ax[0].set_xticklabels(approval_labels, rotation=0)

# Pie chart (same ordering and labels as the count plot)
approval_counts.plot(kind='pie', ax=ax[1], autopct='%1.1f%%', labels=approval_labels)
ax[1].set_title('Permit Approval Proportion')
ax[1].set_ylabel('')  # drop the default y-label pandas adds to pie charts

plt.tight_layout()
plt.show()

## 4. Geographic Analysis

In [None]:
# Per-province application count and approval rate, best-performing first
province_stats = (
    df.groupby('province')['approved']
    .agg(['count', 'mean'])
    .round(3)
    .rename(columns={'count': 'Total Applications', 'mean': 'Approval Rate'})
    .sort_values('Approval Rate', ascending=False)
)

print("Approval Rates by Province:")
print(province_stats)

# Bar chart of the rates with the overall average as a dashed reference line
fig, ax = plt.subplots(figsize=(14, 6))
province_stats['Approval Rate'].plot(kind='bar', ax=ax)
ax.set_title('Approval Rates by Province', fontsize=14, fontweight='bold')
ax.set_xlabel('Province')
ax.set_ylabel('Approval Rate')
ax.axhline(
    approval_rate,
    color='red',
    linestyle='--',
    label=f'Overall Average ({approval_rate:.2%})',
)
ax.legend()

# Slant the province names so long labels do not overlap
for tick_label in ax.get_xticklabels():
    tick_label.set(rotation=45, ha='right')

plt.tight_layout()
plt.show()

## 5. Mining Type and Mineral Analysis

In [None]:
# Mean approval per category, highest first, for mining type and mineral type
mining_type_approval = df.groupby('mining_type')['approved'].mean().sort_values(ascending=False)
mineral_type_approval = df.groupby('mineral_type')['approved'].mean().sort_values(ascending=False)

fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# One bar panel per grouping, each with the same styling
panels = [
    (ax[0], mining_type_approval, 'Approval Rate by Mining Type'),
    (ax[1], mineral_type_approval, 'Approval Rate by Mineral Type'),
]
for panel, rates, panel_title in panels:
    rates.plot(kind='bar', ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('Approval Rate')
    # Dashed red reference line at the overall approval rate
    panel.axhline(approval_rate, color='red', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

## 6. Environmental Factors

In [None]:
# Box plots of each environmental feature, split by approval outcome
environmental_cols = [
    'distance_to_water',
    'distance_to_protected_area',
    'distance_to_indigenous_land',
    'environmental_assessment_score',
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for panel, col in zip(axes, environmental_cols):
    df.boxplot(column=col, by='approved', ax=panel)
    panel.set_title(f'{col} by Approval Status')
    panel.set_xlabel('Approved (0=No, 1=Yes)')
    panel.set_ylabel(col)
    # Boxes sit at positions 1 and 2; give them readable names
    panel.set_xticks([1, 2])
    panel.set_xticklabels(['Rejected', 'Approved'])

# Remove the default figure-level title
fig.suptitle('')
plt.tight_layout()
plt.show()

## 7. Company Factors

In [None]:
# Company size impact: mean approval per size class, highest first
company_approval = df.groupby('company_size')['approved'].mean().sort_values(ascending=False)

fig, ax = plt.subplots(1, 2, figsize=(14, 5))

company_approval.plot(kind='bar', ax=ax[0])
ax[0].set_title('Approval Rate by Company Size')
ax[0].set_ylabel('Approval Rate')
# Dashed red reference line at the overall approval rate
ax[0].axhline(approval_rate, color='red', linestyle='--', alpha=0.5)

# Compliance history vs approval
df.boxplot(column='company_compliance_history', by='approved', ax=ax[1])
ax[1].set_title('Company Compliance History by Approval Status')
ax[1].set_xlabel('Approved (0=No, 1=Yes)')
ax[1].set_ylabel('Compliance History Score')
# Boxes sit at positions 1 and 2; give them readable names
ax[1].set_xticks([1, 2])
ax[1].set_xticklabels(['Rejected', 'Approved'])

# boxplot(by=...) sets a default figure-level title; clear it here so it does
# not sit above the unrelated bar chart panel.
fig.suptitle('')

plt.tight_layout()
plt.show()

## 8. Correlation Analysis

In [None]:
# Pairwise correlations across every numeric column
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap with the colour scale centred on zero
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Numerical Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# The 'approved' column of the matrix, sorted from most positive to most negative
approval_corr = correlation_matrix['approved'].sort_values(ascending=False)
print("\nFeatures most correlated with approval:")
print(approval_corr)

## 9. Project Scale Analysis

In [None]:
# Project characteristics, split by approval outcome
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (axis, x column, y column, x label, y label, title) for each scatter panel.
# Both panels colour the points by approval outcome.
scatter_panels = [
    (axes[0], 'project_area', 'estimated_duration',
     'Project Area (hectares)', 'Estimated Duration (years)', 'Project Size vs Duration'),
    (axes[1], 'expected_employment', 'project_area',
     'Expected Employment', 'Project Area (hectares)', 'Employment vs Project Area'),
]
for panel, x_col, y_col, x_label, y_label, panel_title in scatter_panels:
    # One scatter series per outcome so each gets its own colour and legend entry
    for approved in [0, 1]:
        data = df[df['approved'] == approved]
        panel.scatter(data[x_col], data[y_col], alpha=0.5, label=f"Approved={approved}")
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()

# Public opposition vs approval
df.boxplot(column='public_opposition_percentage', by='approved', ax=axes[2])
axes[2].set_xlabel('Approved (0=No, 1=Yes)')
axes[2].set_ylabel('Public Opposition (%)')
axes[2].set_title('Public Opposition by Approval Status')
# Boxes sit at positions 1 and 2; give them readable names
axes[2].set_xticks([1, 2])
axes[2].set_xticklabels(['Rejected', 'Approved'])

# boxplot(by=...) sets a default figure-level title; clear it here so it does
# not sit above the two scatter panels.
fig.suptitle('')

plt.tight_layout()
plt.show()

## 10. Key Insights Summary

The cell below prints the key insights from the exploratory analysis that could inform model development:

In [None]:
# Summary of the findings computed in the earlier cells. Relies on
# approval_rate, province_stats (sorted by approval rate, highest first) and
# approval_corr already being defined.
print("KEY INSIGHTS FROM EXPLORATION:\n")
print("1. APPROVAL RATES:")
print(f"   - Overall approval rate: {approval_rate:.2%}")
print(f"   - Highest approval rate province: {province_stats.index[0]} ({province_stats.iloc[0]['Approval Rate']:.2%})")
print(f"   - Lowest approval rate province: {province_stats.index[-1]} ({province_stats.iloc[-1]['Approval Rate']:.2%})")

print("\n2. IMPORTANT FEATURES:")
# Rank by absolute correlation so strong negative relationships are listed
# too. approval_corr is sorted by signed value, so taking its head alone could
# only ever surface the most positive features. The signed value is still
# what gets printed.
feature_corr = approval_corr[approval_corr.index != 'approved']
strongest_features = feature_corr.abs().sort_values(ascending=False).index[:5]
top_corr = feature_corr.loc[strongest_features]
for feature, corr in top_corr.items():
    print(f"   - {feature}: {corr:.3f} correlation with approval")

# Fixed recommendation text; it is not derived from the computed statistics.
print("\n3. RECOMMENDATIONS FOR MODELING:")
print("   - Consider province-specific features or interactions")
print("   - Environmental factors show clear patterns")
print("   - Company compliance history is important")
print("   - Public sentiment metrics are valuable")
print("   - Distance to protected areas/water bodies are key features")

## Next Steps

1. **Feature Engineering**: Create additional features based on insights
2. **Data Preprocessing**: Clean and prepare data for modeling
3. **Model Development**: Train and evaluate multiple models
4. **Model Interpretation**: Use SHAP values to explain predictions