## 1. Data Loading and Initial Exploration

First, we'll load the dataset and examine its basic structure and properties.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from datetime import datetime
import plotly.express as px
import geopandas as gpd
from shapely.geometry import Point, LineString
import math

# Set visualization settings
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)

# Load the earthquake dataset
earthquake_df = pd.read_csv('UI\\earthquake_data.csv')

# Load fault line data
fault_gdf = gpd.read_file('UI\\tr_faults_imp.geojson')
print(f"Number of fault lines: {len(fault_gdf)}")
print(f"Available properties: {fault_gdf.columns.tolist()}")

# Display first few rows to understand the structure
earthquake_df.head()

In [None]:
# Overview of the raw table: size, column dtypes, gaps and summary statistics
row_count = len(earthquake_df)
print(f"Dataset shape: {earthquake_df.shape}")
print(f"Number of earthquakes: {row_count}")

# Column dtypes as they were inferred when the CSV was read
print("\nData types:", earthquake_df.dtypes, sep="\n")

# Number of missing entries per column
print("\nMissing values:", earthquake_df.isna().sum(), sep="\n")

# Summary statistics; left as the last expression so the notebook renders it
print("\nBasic statistics:")
earthquake_df.describe()

## 2. Exploratory Data Analysis (EDA)

### 2.1 Temporal Feature Creation

Converting date information to structured temporal features for analysis.

In [None]:
# Convert the Date column to datetime.
# The raw values are kept in `raw_dates` so that every fallback format is
# tried against the original text. Re-parsing the already converted column
# cannot recover anything, because rows that failed are NaT by then.
raw_dates = earthquake_df['Date'].copy()
parsed_dates = pd.to_datetime(raw_dates, format="%d/%m/%Y %H:%M:%S", errors='coerce')

# Check if any dates couldn't be parsed
null_dates = parsed_dates.isnull().sum()
print(f"Number of dates that couldn't be parsed: {null_dates}")

# If we have null dates, try alternate formats on the rows that are still missing
if null_dates > 0:
    print("Trying alternative date formats...")
    # Fallback 1: same layout with dashes instead of slashes
    unparsed = parsed_dates.isnull()
    parsed_dates = parsed_dates.fillna(
        pd.to_datetime(raw_dates[unparsed], format="%d-%m-%Y %H:%M:%S", errors='coerce')
    )
    # Fallback 2: let pandas infer the layout, reading the day first
    unparsed = parsed_dates.isnull()
    if unparsed.any():
        parsed_dates = parsed_dates.fillna(
            pd.to_datetime(raw_dates[unparsed], dayfirst=True, errors='coerce')
        )

    print(f"Remaining null dates after fixes: {parsed_dates.isnull().sum()}")

earthquake_df['Date'] = parsed_dates

# Create additional time-based features
earthquake_df['Year'] = earthquake_df['Date'].dt.year
earthquake_df['Month'] = earthquake_df['Date'].dt.month
earthquake_df['Day'] = earthquake_df['Date'].dt.day
earthquake_df['DayOfWeek'] = earthquake_df['Date'].dt.dayofweek

# Meteorological seasons by month number. A lookup table (instead of an
# if/else chain ending in 'Fall') leaves Season missing when the date is
# missing, rather than silently labelling those rows as 'Fall'.
season_by_month = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}
earthquake_df['Season'] = earthquake_df['Month'].map(season_by_month)

# Display the updated dataframe
earthquake_df.head()

### 2.2 Geographic Visualization

Creating maps to visualize the earthquake distribution across Turkey.

In [None]:
# Report the coordinate extent of the raw data before any filtering
all_lon = earthquake_df['Longitude']
all_lat = earthquake_df['Latitude']
print("Coordinate ranges before cleaning:")
print(f"Longitude: {all_lon.min()} to {all_lon.max()}")
print(f"Latitude: {all_lat.min()} to {all_lat.max()}")

# Keep only events inside a bounding box around Turkey:
# longitude 25-45 E and latitude 35-43 N (both ends inclusive). The box is a
# little wider than the country's rough extent of 26-45 E / 36-42 N.
valid_coords = all_lon.between(25, 45) & all_lat.between(35, 43)

# Work on an independent copy so later edits do not touch earthquake_df
clean_df = earthquake_df[valid_coords].copy()
outliers_removed = len(earthquake_df) - len(clean_df)
print(f"Removed {outliers_removed} records with coordinates outside Turkey's boundaries")

kept_lon = clean_df['Longitude']
kept_lat = clean_df['Latitude']
print("Coordinate ranges after cleaning:")
print(f"Longitude: {kept_lon.min()} to {kept_lon.max()}")
print(f"Latitude: {kept_lat.min()} to {kept_lat.max()}")

# Base map centered on Turkey
turkey_map = folium.Map(location=[38.5, 35.5], zoom_start=6)

# Draw the heatmap from at most 2000 randomly chosen events to keep it light
sample_df = clean_df.sample(min(2000, len(clean_df)))

# HeatMap expects [latitude, longitude] pairs
heat_data = sample_df[['Latitude', 'Longitude']].values.tolist()
heat_layer = HeatMap(
    heat_data,
    radius=8,
    gradient={'0.4': 'blue', '0.6': 'cyan', '0.8': 'yellow', '1.0': 'red'},
)
heat_layer.add_to(turkey_map)

# One red circle per strong earthquake (magnitude > 6), sized by magnitude
strong_quakes = clean_df[clean_df['Magnitude'] > 6]
for _, quake in strong_quakes.iterrows():
    marker = folium.CircleMarker(
        location=[quake['Latitude'], quake['Longitude']],
        radius=quake['Magnitude'] * 1.5,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
        popup=f"Magnitude: {quake['Magnitude']}<br>Date: {quake['Date']}<br>Location: {quake['Location']}",
    )
    marker.add_to(turkey_map)

# Add fault lines to the map
def add_faults_to_map(map_obj, fault_gdf, importance_threshold=0):
    """Draw fault lines on a folium map as a styled GeoJSON layer.

    Args:
        map_obj: Map (or other folium container) the layer is added to.
        fault_gdf: Fault table; must provide the 'importance' and
            'FAULT_NAME' columns used for styling and the tooltip.
        importance_threshold: When greater than 0, only faults whose
            'importance' is at least this value are drawn; otherwise all
            faults are drawn.

    Returns:
        The same ``map_obj``, with the 'Fault Lines' layer added.
    """
    def _line_style(feature):
        # Red for importance >= 4, orange for >= 3, yellow below that;
        # line weight grows with importance.
        level = feature['properties']['importance']
        if level >= 4:
            stroke = '#FF0000'
        elif level >= 3:
            stroke = '#FFA500'
        else:
            stroke = '#FFFF00'
        return {'color': stroke, 'weight': level * 0.5, 'opacity': 0.7}

    # Optional filtering by importance
    selected_faults = fault_gdf
    if importance_threshold > 0:
        selected_faults = fault_gdf[fault_gdf['importance'] >= importance_threshold]

    fault_layer = folium.GeoJson(
        selected_faults,
        name='Fault Lines',
        style_function=_line_style,
        tooltip=folium.GeoJsonTooltip(fields=['FAULT_NAME', 'importance']),
    )
    fault_layer.add_to(map_obj)

    return map_obj

# Overlay the fault lines, keeping only faults with importance >= 3
turkey_map = add_faults_to_map(turkey_map, fault_gdf, importance_threshold=3)

# Add the 'cartodbpositron' tile layer on top of the map's default tiles
folium.TileLayer('cartodbpositron').add_to(turkey_map)

# Write the map to a standalone HTML file that can be opened in a browser
turkey_map.save('earthquake_map.html')

# Optional inline display in the notebook (left disabled)
# from IPython.display import display
# display(turkey_map)

### 2.3 Temporal Analysis

Analyzing patterns in earthquake frequency over time (yearly, monthly, seasonal).

In [None]:
# Use the cleaned dataframe for temporal analysis
# Yearly earthquake frequency
yearly_counts = clean_df.groupby('Year').size()

plt.figure(figsize=(14, 6))
yearly_counts.plot(kind='bar')
plt.title('Yearly Earthquake Frequency')
plt.xlabel('Year')
plt.ylabel('Number of Earthquakes')
plt.tight_layout()
plt.show()

# Seasonal patterns
seasonal_counts = clean_df.groupby('Season').size()

plt.figure(figsize=(10, 6))
seasonal_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('Seasonal Distribution of Earthquakes')
plt.ylabel('')
plt.tight_layout()
plt.show()

# Monthly patterns
# Reindex to all 12 months so the bar positions always match the 12 labels
# set below; without this, a month with no events would shift every later
# label onto the wrong bar.
monthly_counts = clean_df.groupby('Month').size().reindex(range(1, 13), fill_value=0)

plt.figure(figsize=(14, 6))
monthly_counts.plot(kind='bar')
plt.title('Monthly Earthquake Frequency')
plt.xlabel('Month')
plt.ylabel('Number of Earthquakes')
plt.xticks(range(12), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.tight_layout()
plt.show()

### 2.4 Magnitude and Depth Analysis

Examining the distribution of earthquake magnitudes and depths, and their relationship.

In [None]:
# Histograms (30 bins, with KDE) of magnitude and depth; the dashed red line
# marks the mean of each variable.
histogram_specs = (
    ('Magnitude', 'Distribution of Earthquake Magnitudes', 'Magnitude'),
    ('Depth', 'Distribution of Earthquake Depths', 'Depth (km)'),
)
for hist_column, hist_title, hist_xlabel in histogram_specs:
    column_mean = clean_df[hist_column].mean()
    plt.figure(figsize=(12, 6))
    sns.histplot(clean_df[hist_column], bins=30, kde=True)
    plt.title(hist_title)
    plt.xlabel(hist_xlabel)
    plt.ylabel('Frequency')
    plt.axvline(column_mean, color='red', linestyle='--', label=f'Mean: {column_mean:.2f}')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Scatter of magnitude against depth to inspect their relationship
plt.figure(figsize=(12, 8))
sns.scatterplot(data=clean_df, x='Depth', y='Magnitude', alpha=0.6)
plt.title('Relationship Between Earthquake Depth and Magnitude')
plt.xlabel('Depth (km)')
plt.ylabel('Magnitude')
plt.tight_layout()
plt.show()

### 2.5 Correlation Analysis

Exploring correlations between numerical features.

In [None]:
# Pairwise correlations between all numeric columns of the cleaned data
numerical_cols = clean_df.select_dtypes(include=[np.number]).columns
correlation_matrix = clean_df.loc[:, numerical_cols].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, linewidths=0.5)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

### 2.6 Additional Visualizations

Further exploration of earthquake patterns through geographic, temporal, and magnitude-based visualizations.

In [None]:
# Epicentre scatter: colour encodes magnitude, marker area is magnitude squared
magnitudes = clean_df['Magnitude']

plt.figure(figsize=(14, 10))
scatter = plt.scatter(
    clean_df['Longitude'],
    clean_df['Latitude'],
    c=magnitudes,
    s=magnitudes ** 2,
    cmap='YlOrRd',
    alpha=0.7,
)
plt.colorbar(scatter, label='Magnitude')
plt.title('Geographic Distribution of Earthquakes by Magnitude')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# One magnitude box per year, to compare yearly spread
plt.figure(figsize=(16, 8))
magnitude_ax = sns.boxplot(data=clean_df, x='Year', y='Magnitude')
magnitude_ax.set_title('Magnitude Distribution Over Years')
magnitude_ax.set_xlabel('Year')
magnitude_ax.set_ylabel('Magnitude')
# Year labels are rotated so they do not overlap
plt.xticks(rotation=90)
magnitude_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# One depth box per year, to compare yearly spread
plt.figure(figsize=(16, 8))
depth_ax = sns.boxplot(data=clean_df, x='Year', y='Depth')
depth_ax.set_title('Depth Distribution Over Years')
depth_ax.set_xlabel('Year')
depth_ax.set_ylabel('Depth (km)')
# Year labels are rotated so they do not overlap
plt.xticks(rotation=90)
depth_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Interactive 3D scatter (longitude, latitude, depth) built with Plotly
import plotly.express as px

# Plot at most 3000 randomly chosen events
quake_sample = clean_df.sample(min(3000, len(clean_df)))

fig = px.scatter_3d(
    quake_sample,
    x='Longitude',
    y='Latitude',
    z='Depth',
    color='Magnitude',
    size='Magnitude',
    color_continuous_scale='Viridis',
    title='3D Visualization of Earthquakes',
)

# Axis titles; the depth axis is reversed so deeper events are drawn lower
scene_layout = dict(
    xaxis_title='Longitude',
    yaxis_title='Latitude',
    zaxis_title='Depth (km)',
    zaxis=dict(autorange="reversed"),
)
fig.update_layout(scene=scene_layout)

# Save the interactive plot as a standalone HTML file
fig.write_html('earthquake_3d.html')
# fig.show()  # Display in notebook if supported

In [None]:
# Magnitude frequency plot: histogram with a line through the bin counts
plt.figure(figsize=(12, 6))
counts, bins, _ = plt.hist(clean_df['Magnitude'], bins=30, alpha=0.7)
# `bins` holds the 31 bin edges. Each count is plotted at the centre of its
# bin so the line sits on top of the bars instead of on their left edges.
bin_centers = (bins[:-1] + bins[1:]) / 2
plt.plot(bin_centers, counts, '-o', color='darkred')
plt.title('Frequency Distribution of Earthquake Magnitudes')
plt.xlabel('Magnitude')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Compare magnitude distributions across the ten most active regions.
# The region is taken to be the text after the last comma of 'Location';
# adjust the extraction if the data uses a different layout.
if 'Location' in clean_df.columns:
    # Last comma-separated piece of the location, without surrounding spaces
    clean_df['Region'] = clean_df['Location'].str.rsplit(',', n=1).str[-1].str.strip()

    # The ten regions with the most recorded earthquakes
    top_regions = clean_df['Region'].value_counts().head(10).index
    in_top_regions = clean_df['Region'].isin(top_regions)

    # One magnitude box per top region
    plt.figure(figsize=(14, 8))
    sns.boxplot(data=clean_df[in_top_regions], x='Region', y='Magnitude')
    plt.title('Magnitude Distribution by Top 10 Regions')
    plt.xlabel('Region')
    plt.ylabel('Magnitude')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Heatmap of earthquake frequency by month and year
if len(clean_df) > 0:
    # Count events per (year, month). Columns are reindexed to all 12 months
    # so that column i always is month i + 1, even if a month never occurs.
    heatmap_data = pd.pivot_table(
        clean_df,
        values='Magnitude',
        index=clean_df['Date'].dt.year,
        columns=clean_df['Date'].dt.month,
        aggfunc='count'
    ).reindex(columns=range(1, 13))

    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    plt.figure(figsize=(14, 10))
    sns.heatmap(heatmap_data, cmap='YlOrRd', annot=False)
    plt.title('Earthquake Frequency by Month and Year')
    plt.xlabel('Month')
    plt.ylabel('Year')
    # Heatmap cell i spans [i, i + 1], so labels go at the cell centres
    # i + 0.5; ticks at 1..12 would sit on the cell borders.
    plt.xticks(np.arange(12) + 0.5, month_labels)
    plt.tight_layout()
    plt.show()

### 2.7 Fault Line Analysis
Examining the relationship between earthquakes and fault lines.

In [None]:
# Calculate distances to fault lines
def calc_fault_distance(row, fault_gdf):
    """Find the fault closest to one earthquake and describe it.

    Args:
        row: Record with 'Longitude' and 'Latitude' entries (degrees).
        fault_gdf: Fault table with a geometry column and, optionally,
            'FAULT_NAME' and 'importance' columns.

    Returns:
        pd.Series with 'distance_to_fault' (approximate km),
        'nearest_fault_name' ('Unknown' if the column is absent) and
        'nearest_fault_importance' (0 if the column is absent).

    Raises:
        ValueError: If ``fault_gdf`` has no rows.
    """
    point = Point(row['Longitude'], row['Latitude'])

    # Track the nearest fault by POSITION, so the .iloc lookup below is
    # correct even when fault_gdf's index is not 0..n-1 (e.g. after
    # filtering). Iterating the geometry column also avoids building a full
    # row object per fault, as iterrows() does.
    closest_pos, closest_dist = min(
        ((pos, point.distance(geom)) for pos, geom in enumerate(fault_gdf.geometry)),
        key=lambda pair: pair[1],
    )

    # Planar distance is in degrees; convert with ~111 km per degree.
    # This is a rough approximation: a degree of longitude is shorter than
    # 111 km away from the equator.
    dist_km = closest_dist * 111

    # Get fault properties
    closest_fault = fault_gdf.iloc[closest_pos]

    return pd.Series({
        'distance_to_fault': dist_km,
        'nearest_fault_name': closest_fault.get('FAULT_NAME', 'Unknown'),
        'nearest_fault_importance': closest_fault.get('importance', 0)
    })

# Compute fault distances for a random sample of at most 1000 events only;
# the full calculation is done later during feature engineering.
sample_size = min(1000, len(clean_df))
sampled_quakes = clean_df.sample(sample_size)
fault_distance_sample = sampled_quakes.apply(calc_fault_distance, axis=1, args=(fault_gdf,))

# Magnitude against distance to the nearest fault, coloured by the
# importance of that fault
plt.figure(figsize=(12, 8))
plt.scatter(
    fault_distance_sample['distance_to_fault'],
    sampled_quakes['Magnitude'],
    c=fault_distance_sample['nearest_fault_importance'],
    cmap='viridis',
    alpha=0.6,
)
plt.colorbar(label='Fault Importance')
plt.xlabel('Distance to Nearest Fault (km)')
plt.ylabel('Magnitude')
plt.title('Relationship Between Earthquake Magnitude and Distance to Fault')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Data Preprocessing

Handling missing values, outliers, and preparing data for modeling.

In [None]:
# Data Preprocessing Section
print("Starting data preprocessing...")

# Count missing values per column; these counts drive the imputation below
missing_values = clean_df.isnull().sum()
print(f"Missing values in each column:\n{missing_values}")

# Handle missing values
# For numerical columns: fill with median.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on clean_df[col]: that chained form works on an
# intermediate object and is not guaranteed to update clean_df.
numerical_cols = ['Longitude', 'Latitude', 'Depth', 'Magnitude']
for col in numerical_cols:
    if missing_values[col] > 0:
        median_value = clean_df[col].median()
        clean_df[col] = clean_df[col].fillna(median_value)
        print(f"Filled {missing_values[col]} missing values in {col} with median: {median_value}")

# For categorical columns: fill with mode.
# Every column that is neither one of the numerical columns above nor a
# date-derived column is treated as categorical here.
categorical_cols = [col for col in clean_df.columns if col not in numerical_cols 
                   and col not in ['Date', 'Year', 'Month', 'Day', 'YearMonth']]
for col in categorical_cols:
    if col in missing_values and missing_values[col] > 0:
        modes = clean_df[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to impute with
            print(f"Skipping {col}: no non-missing values to take a mode from")
            continue
        mode_value = modes.iloc[0]
        clean_df[col] = clean_df[col].fillna(mode_value)
        print(f"Filled {missing_values[col]} missing values in {col} with mode: {mode_value}")

# Handle outliers using IQR method for depth and magnitude
def handle_outliers(df, column):
    """Cap the values of ``column`` at the 1.5 * IQR fences.

    Values below Q1 - 1.5 * IQR are raised to that fence and values above
    Q3 + 1.5 * IQR are lowered to it; nothing is removed. The number of
    affected rows is printed.

    Args:
        df: DataFrame to process. It is modified in place.
        column: Name of the numeric column to cap.

    Returns:
        The same ``df`` object, with ``column`` capped.
    """
    first_quartile, third_quartile = (df[column].quantile(q) for q in (0.25, 0.75))
    fence_span = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - fence_span
    upper_bound = third_quartile + fence_span

    values = df[column]
    too_low = values < lower_bound
    too_high = values > upper_bound
    print(f"Found {len(df[too_low | too_high])} outliers in {column}")

    # Cap rather than drop: each outlier is replaced by the fence it crossed
    df[column] = np.where(too_low, lower_bound, np.where(too_high, upper_bound, values))

    return df

# Cap extreme Depth values at the IQR fences
clean_df = handle_outliers(clean_df, 'Depth')

# Magnitude is not capped, because large values are the interesting ones;
# implausibly large values (> 8.5) are only listed for manual inspection.
magnitude_outliers = clean_df.loc[clean_df['Magnitude'] > 8.5]
print(f"Extremely high magnitudes (>8.5): {len(magnitude_outliers)}")
if not magnitude_outliers.empty:
    print(magnitude_outliers[['Date', 'Magnitude', 'Location']])

# Report the coordinate extent of the cleaned data
lon_values = clean_df['Longitude']
lat_values = clean_df['Latitude']
print("\nCoordinate ranges:")
print(f"Longitude: {lon_values.min()} to {lon_values.max()}")
print(f"Latitude: {lat_values.min()} to {lat_values.max()}")

# Sanity check against the same Turkey bounding box used when clean_df was
# built (25-45 E, 35-43 N, inclusive), so this is expected to report 0.
inside_box = lon_values.between(25, 45) & lat_values.between(35, 43)
turkey_coords = clean_df[inside_box]
outside_turkey = len(clean_df) - len(turkey_coords)
print(f"Records potentially outside Turkey region: {outside_turkey}")

# Standardize selected numerical features for modeling
from sklearn.preprocessing import StandardScaler

# model_df is an independent copy; clean_df keeps the unscaled values
model_df = clean_df.copy()

# Only the coordinates and depth are scaled
features_to_scale = ['Longitude', 'Latitude', 'Depth']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(model_df[features_to_scale])
model_df[features_to_scale] = scaled_values

print("\nData preprocessing completed!")
model_df.head()

## 4. Feature Engineering

Creating new features to improve model performance, including fault-related features.

In [None]:
# Feature Engineering
print("Starting feature engineering...")

# Create time-based features
model_df['DayOfYear'] = model_df['Date'].dt.dayofyear
model_df['WeekOfYear'] = model_df['Date'].dt.isocalendar().week
# DayOfWeek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend
model_df['IsWeekend'] = (model_df['DayOfWeek'] >= 5).astype(int)

# Encode seasonal information using cyclical encoding, so that the end of a
# cycle (December / day 365) lands next to its start (January / day 1)
model_df['MonthSin'] = np.sin(2 * np.pi * model_df['Month']/12)
model_df['MonthCos'] = np.cos(2 * np.pi * model_df['Month']/12)
model_df['DayOfYearSin'] = np.sin(2 * np.pi * model_df['DayOfYear']/365)
model_df['DayOfYearCos'] = np.cos(2 * np.pi * model_df['DayOfYear']/365)

# Create regional activity features
# Group by regions and calculate historical earthquake counts
# First, create a 10 x 10 spatial grid over the observed coordinate range
lon_grid = pd.cut(clean_df['Longitude'], bins=10)
lat_grid = pd.cut(clean_df['Latitude'], bins=10)
# The cell label must carry clean_df's own index. A Series built without it
# gets a fresh 0..n-1 index, and the column assignment (which aligns on
# index labels) would then attach cells to the wrong rows, or NaN, whenever
# rows were filtered out of clean_df earlier.
clean_df['Grid'] = pd.Series(
    list(zip(lon_grid, lat_grid)), index=clean_df.index
).astype(str)

# For each earthquake, count previous earthquakes in the same grid cell
# (rows are put in chronological order first)
clean_df = clean_df.sort_values('Date')
clean_df['PrevQuakesInGrid'] = clean_df.groupby('Grid').cumcount()

# Coordinates of the chronologically preceding earthquake
clean_df['PrevLon'] = clean_df['Longitude'].shift(1)
clean_df['PrevLat'] = clean_df['Latitude'].shift(1)

# Haversine formula to calculate distance in km
from math import radians, sin, cos, sqrt, asin

def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in km between two points.

    Args:
        lon1, lat1: Longitude and latitude of the first point, in degrees.
        lon2, lat2: Longitude and latitude of the second point, in degrees.

    Returns:
        Distance in kilometres on a sphere of radius 6371 km.
    """
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # For (near-)antipodal points rounding can push sqrt(a) a hair above 1,
    # which would make asin raise a math domain error; clamp to 1.
    c = 2 * asin(min(1.0, sqrt(a)))
    r = 6371  # Radius of earth in km
    return c * r

# Apply haversine to calculate distance from previous earthquake
# (the first row has no predecessor and stays NaN)
clean_df['DistFromPrev'] = clean_df.apply(
    lambda x: haversine(x['Longitude'], x['Latitude'], x['PrevLon'], x['PrevLat']) 
    if not pd.isna(x['PrevLon']) else np.nan, axis=1)

# Add distance features to model_df (column assignment aligns on the index).
# Missing values are filled by assigning the result back: calling
# fillna(..., inplace=True) on model_df[col] works on an intermediate object
# and is not guaranteed to update model_df.
model_df['PrevQuakesInGrid'] = clean_df['PrevQuakesInGrid']
model_df['DistFromPrev'] = clean_df['DistFromPrev']
model_df['DistFromPrev'] = model_df['DistFromPrev'].fillna(model_df['DistFromPrev'].median())

# Create feature for time since last earthquake (in days)
clean_df['PrevDate'] = clean_df['Date'].shift(1)
clean_df['DaysSinceLastQuake'] = (clean_df['Date'] - clean_df['PrevDate']).dt.total_seconds() / (24 * 3600)
model_df['DaysSinceLastQuake'] = clean_df['DaysSinceLastQuake']
model_df['DaysSinceLastQuake'] = model_df['DaysSinceLastQuake'].fillna(model_df['DaysSinceLastQuake'].median())

# Add historical magnitude information (magnitude of the preceding event)
clean_df['PrevMagnitude'] = clean_df['Magnitude'].shift(1)
model_df['PrevMagnitude'] = clean_df['PrevMagnitude']
model_df['PrevMagnitude'] = model_df['PrevMagnitude'].fillna(model_df['PrevMagnitude'].median())

# Create interaction features (computed on model_df's scaled columns)
model_df['DepthByLat'] = model_df['Depth'] * model_df['Latitude']
model_df['DepthByLon'] = model_df['Depth'] * model_df['Longitude']

# Add fault-related features - calculate for all data points
print("Calculating fault-related features...")
fault_features = clean_df.apply(lambda row: calc_fault_distance(row, fault_gdf), axis=1)
clean_df = pd.concat([clean_df, fault_features], axis=1)
model_df = pd.concat([model_df, fault_features], axis=1)

# Calculate fault density in a radius
def calc_fault_density(lat, lon, fault_gdf, radius=50):
    """Summarise the faults lying within ``radius`` km of a point.

    The search area is a circular buffer around the point, built in degrees
    using roughly 111 km per degree.

    Args:
        lat: Latitude of the point, in degrees.
        lon: Longitude of the point, in degrees.
        fault_gdf: Fault table with a 'geometry' column.
        radius: Search radius in km.

    Returns:
        pd.Series with 'fault_count_50km' (faults touching the area),
        'fault_length_50km' (their length inside the area, approximate km)
        and 'fault_density' (that length divided by the circle's area;
        0 when ``radius`` is not positive). The key names contain '50km'
        whatever ``radius`` is.
    """
    km_per_degree = 111
    search_area = Point(lon, lat).buffer(radius / km_per_degree)

    # Faults that touch the search area
    nearby = [geom for geom in fault_gdf['geometry'] if search_area.intersects(geom)]

    # Only the part of each fault inside the area counts towards the length
    length_km = sum(
        search_area.intersection(geom).length * km_per_degree for geom in nearby
    )

    if radius > 0:
        density = length_km / (math.pi * radius**2)
    else:
        density = 0

    return pd.Series({
        'fault_count_50km': len(nearby),
        'fault_length_50km': length_km,
        'fault_density': density
    })

# Fault density is computed once per point of a coarse 10 x 10 lattice and
# later looked up per earthquake, instead of being computed per earthquake.
print("Calculating fault density (this may take a while)...")

# Lattice over the Turkey bounding box (25-45 E, 35-43 N)
lon_range = np.linspace(25, 45, 10)
lat_range = np.linspace(35, 43, 10)
grid_points = [(lon, lat) for lon in lon_range for lat in lat_range]

# Density summary at every lattice point, tagged with its coordinates
grid_densities = []
for lon, lat in grid_points:
    point_density = calc_fault_density(lat, lon, fault_gdf)
    point_density['lon'], point_density['lat'] = lon, lat
    grid_densities.append(point_density)

# One row per lattice point
grid_df = pd.DataFrame(grid_densities)

# For each earthquake, find nearest grid point and assign its density
def assign_grid_density(row, grid_df):
    """Return the fault-density values of the grid point nearest to ``row``.

    Args:
        row: Record with 'Longitude' and 'Latitude' entries (degrees).
        grid_df: Table with one row per grid point and the columns 'lon',
            'lat', 'fault_count_50km', 'fault_length_50km', 'fault_density'.

    Returns:
        pd.Series with 'fault_count_50km', 'fault_length_50km' and
        'fault_density' copied from the nearest grid point (great-circle
        distance; the first point wins ties).

    Raises:
        ValueError: If ``grid_df`` has no rows.
    """
    grid_lons = grid_df['lon'].to_numpy()
    grid_lats = grid_df['lat'].to_numpy()

    # Search by POSITION so the .iloc lookup below is valid for any index
    # on grid_df, not only the default 0..n-1.
    nearest_pos = min(
        range(len(grid_df)),
        key=lambda pos: haversine(
            row['Longitude'], row['Latitude'], grid_lons[pos], grid_lats[pos]
        ),
    )

    # Fetch the winning row once instead of once per returned field
    nearest = grid_df.iloc[nearest_pos]
    return pd.Series({
        'fault_count_50km': nearest['fault_count_50km'],
        'fault_length_50km': nearest['fault_length_50km'],
        'fault_density': nearest['fault_density']
    })

# Give every earthquake the density values of its nearest grid point and
# append them to both the clean and the modeling tables
density_features = clean_df.apply(assign_grid_density, axis=1, args=(grid_df,))
clean_df = pd.concat([clean_df, density_features], axis=1)
model_df = pd.concat([model_df, density_features], axis=1)

# Magnitude damped by distance to the nearest fault; the +1 keeps the
# divisor away from zero for non-negative distances.
# NOTE(review): this column is computed from Magnitude itself, so it must
# not be used as a predictor when Magnitude is the prediction target.
fault_proximity = model_df['distance_to_fault'] + 1
model_df['magnitude_fault_interaction'] = model_df['Magnitude'].div(fault_proximity)

print("Feature engineering completed!")
model_df.head()

## 5. Model Selection and Training

Training and comparing multiple regression models to predict earthquake magnitude.

In [None]:
# Import necessary libraries for modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Model Selection and Training
print("Setting up model training...")

# Define features and target
target = 'Magnitude'
# Remove non-feature columns: identifiers, text columns and helper columns
drops = ['Date', 'Location', 'EventID', 'TimeName', 'TypeName', 
         'MagnitudeName', 'Grid', 'PrevLon', 'PrevLat', 'PrevDate',
         'nearest_fault_name']  # Remove string columns

# 'magnitude_fault_interaction' is computed as Magnitude / (distance + 1).
# Using it to predict Magnitude would hand the model the answer (target
# leakage) and inflate every score, so it is excluded from the features.
drops.append('magnitude_fault_interaction')

# Check if these optional columns exist and add them to drops if they do
optional_drops = ['YearMonth']
for col in optional_drops:
    if col in model_df.columns:
        drops.append(col)

# First, create a preliminary feature list
preliminary_features = [col for col in model_df.columns if col != target and col not in drops]

# Drop every remaining non-numeric column. Testing for "numeric" rather than
# for dtype == 'object' also catches category, string and datetime columns.
for col in preliminary_features:
    if not pd.api.types.is_numeric_dtype(model_df[col]):
        print(f"Removing non-numeric column: {col}")
        drops.append(col)

# Final feature list with only numeric columns
features = [col for col in model_df.columns if col != target and col not in drops]

print(f"Selected features: {features}")

# Feature matrix and target. X is an explicit copy so that filling NaNs
# below modifies X itself and never a view of model_df.
X = model_df[features].copy()
y = model_df[target]

print("Columns with NaN values:")
for col in X.columns:
    nan_count = X[col].isna().sum()
    if nan_count > 0:
        print(f"- {col}: {nan_count} NaNs")

# Fill missing values appropriately for each column
for col in X.columns:
    if X[col].isna().sum() > 0:
        # For numeric columns, use median
        X[col] = X[col].fillna(X[col].median())

# Also check target variable
if y.isna().sum() > 0:
    print(f"Target has {y.isna().sum()} NaN values, filling with median")
    y = y.fillna(y.median())

# Verify all NaNs are fixed
print(f"Remaining NaN values in X: {X.isna().sum().sum()}")
print(f"Remaining NaN values in y: {y.isna().sum()}")

# Hold out 20% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Set up models to try
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=100, random_state=42)
}

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in
            place.
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.

    Returns:
        Tuple ``(mae, rmse, r2)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    return (
        mean_absolute_error(y_test, predicted),
        # RMSE is the square root of the mean squared error
        np.sqrt(mean_squared_error(y_test, predicted)),
        r2_score(y_test, predicted),
    )

# Compare all candidate models: a single train/test evaluation plus 5-fold
# cross-validation for a more robust estimate
results = {}
cv_results = {}

for name, model in models.items():
    print(f"Training {name}...")
    mae, rmse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    
    # 5-fold cross-validation for RMSE, on the TRAINING split only. The
    # best model is chosen from these scores, so the held-out test rows must
    # not take part in them; otherwise the final test score is biased.
    cv_scores = -cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    
    results[name] = {'MAE': mae, 'RMSE': rmse, 'R²': r2}
    cv_results[name] = {'Mean RMSE': cv_scores.mean(), 'Std RMSE': cv_scores.std()}
    
    print(f"{name} - MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}, CV RMSE: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Convert results to DataFrames (one row per model) for better visualization
results_df = pd.DataFrame(results).T
cv_results_df = pd.DataFrame(cv_results).T

print("\nTest Results:")
print(results_df.sort_values('RMSE'))

print("\nCross-Validation Results:")
print(cv_results_df.sort_values('Mean RMSE'))

# Visualize model performance
plt.figure(figsize=(12, 6))
results_df['RMSE'].sort_values().plot(kind='bar')
plt.title('RMSE by Model')
plt.ylabel('RMSE (lower is better)')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Select the best performing model based on CV results (lowest mean RMSE)
best_model_name = cv_results_df.sort_values('Mean RMSE').index[0]
print(f"\nBest model based on cross-validation: {best_model_name}")

## 6. Hyperparameter Optimization

Fine-tuning the best performing model to maximize prediction accuracy.

In [None]:
# Tune the model that won the cross-validated comparison
print(f"Optimizing hyperparameters for {best_model_name}...")

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space per model name. Models without an entry here (the linear
# ones) are used with their default settings.
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'colsample_bytree': [0.7, 0.8, 0.9],
    },
    'LightGBM': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'num_leaves': [31, 50, 70],
    },
}

if best_model_name in param_grids:
    param_grid = param_grids[best_model_name]

    # Randomized search: 20 sampled settings, each scored by 5-fold CV RMSE
    search_options = dict(
        param_distributions=param_grid,
        n_iter=20,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        random_state=42,
        verbose=1,
    )
    random_search = RandomizedSearchCV(models[best_model_name], **search_options)

    # Only the training split takes part in the search
    random_search.fit(X_train, y_train)

    # The score is a negated RMSE, so flip the sign for reporting
    print(f"Best parameters: {random_search.best_params_}")
    print(f"Best RMSE: {-random_search.best_score_:.4f}")

    # Estimator with the best parameters found
    best_model = random_search.best_estimator_
else:
    print(f"No parameter grid defined for {best_model_name}. Using default model.")
    best_model = models[best_model_name]

# Final evaluation of the chosen model on the held-out test split
y_pred = best_model.predict(X_test)
final_mae = mean_absolute_error(y_test, y_pred)
final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_test, y_pred)

print("\nFinal model performance:")
print(f"MAE: {final_mae:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")

## 7. Model Evaluation and Interpretation

Assessing model performance, analyzing prediction errors, and identifying the most important features.

In [None]:
# Diagnostics for the final model, followed by saving model, scaler and data
print("Evaluating final model...")

# Predicted against actual magnitude; the dashed diagonal marks perfect
# predictions
actual_min, actual_max = y_test.min(), y_test.max()
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([actual_min, actual_max], [actual_min, actual_max], 'r--')
plt.xlabel('Actual Magnitude')
plt.ylabel('Predicted Magnitude')
plt.title('Actual vs Predicted Earthquake Magnitude')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Residuals (actual minus predicted) against the prediction
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Magnitude')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Histogram of the residuals
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Feature importances exist only on models exposing `feature_importances_`
if hasattr(best_model, 'feature_importances_'):
    # One row per feature, most important first
    importance_table = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_model.feature_importances_,
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    # Bar chart of the 15 most important features
    plt.figure(figsize=(12, 8))
    sns.barplot(data=feature_importance.head(15), x='Importance', y='Feature')
    plt.title('Top 15 Feature Importances')
    plt.tight_layout()
    plt.show()

    print("Top 10 most important features:")
    print(feature_importance.head(10))

# Persist the fitted model
import joblib
joblib.dump(best_model, 'earthquake_magnitude_model.pkl')
print("Model saved as 'earthquake_magnitude_model.pkl'")

# Persist the scaler so the same scaling can be applied later
joblib.dump(scaler, 'earthquake_scaler.pkl')
print("Scaler saved as 'earthquake_scaler.pkl'")

# clean_df keeps the unscaled coordinates; export it (without the index
# column) for the unsupervised analysis
clean_df.to_csv('clean_earthquake_data.csv', index=False)
print("Clean data with original coordinates saved as 'clean_earthquake_data.csv'")

## 8. Conclusion and Next Steps

The model predicts earthquake magnitudes in Turkey with reasonable accuracy. The processed data has been saved for further unsupervised learning analysis and UI development.

Next steps:
1. Develop clustering models in unsupervised.ipynb
2. Create an interactive UI application
3. Document findings in the project README