# Turkish Earthquake Clustering Analysis

This notebook implements unsupervised learning techniques to identify earthquake patterns and risk zones using our processed AFAD earthquake dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap, MarkerCluster
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px
import plotly.graph_objects as go
import joblib
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Set visualization settings
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)

# Load the clean earthquake dataset with original coordinates
earthquake_df = pd.read_csv('clean_earthquake_data.csv')

# Verify coordinate range
print(f"Dataset shape: {earthquake_df.shape}")
print(f"Coordinate ranges:")
print(f"Longitude: {earthquake_df['Longitude'].min():.2f} to {earthquake_df['Longitude'].max():.2f}")
print(f"Latitude: {earthquake_df['Latitude'].min():.2f} to {earthquake_df['Latitude'].max():.2f}")

earthquake_df.head()

## 1. Data Preparation for Clustering

Selecting and preparing relevant features for clustering algorithms.

In [None]:
# Columns used as clustering inputs: epicentre position, depth and magnitude
clustering_features = ['Longitude', 'Latitude', 'Depth', 'Magnitude']

# Independent copy, so imputation below never touches earthquake_df
cluster_data = earthquake_df[clustering_features].copy()

# Report the number of missing values per feature before imputing
print("Missing values in clustering features:")
print(cluster_data.isnull().sum())

# Replace missing entries with the median of their own column
cluster_data = cluster_data.fillna(cluster_data.median())

# Standardize each feature (zero mean, unit variance) so that no single
# feature dominates the distance computations used by the clusterers
scaler = StandardScaler()
scaled_data = scaler.fit(cluster_data).transform(cluster_data)
scaled_df = pd.DataFrame(scaled_data, columns=clustering_features)

# Summary statistics of the scaled features are the cell output
print("Data prepared for clustering:")
scaled_df.describe()

## 2. K-Means Clustering

Using K-Means to identify distinct earthquake zones based on location and characteristics.

In [None]:
# Scan candidate cluster counts and record, for each k, the K-Means inertia
# (elbow method) and the mean silhouette score
k_range = range(2, 12)
inertia = []
silhouette_scores = []

for k in k_range:
    candidate = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
    candidate_silhouette = silhouette_score(scaled_data, candidate.labels_)

    inertia.append(candidate.inertia_)
    silhouette_scores.append(candidate_silhouette)

    print(f"K={k}, Inertia={candidate.inertia_:.2f}, Silhouette Score={candidate_silhouette:.3f}")

# Side-by-side diagnostics: inertia on the left, silhouette on the right
diag_fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(12, 5))

elbow_ax.plot(k_range, inertia, 'o-')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal k')
elbow_ax.grid(True, alpha=0.3)

silhouette_ax.plot(k_range, silhouette_scores, 'o-')
silhouette_ax.set_xlabel('Number of Clusters (k)')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Silhouette Analysis for Optimal k')
silhouette_ax.grid(True, alpha=0.3)

diag_fig.tight_layout()
plt.show()

# Cluster count chosen by inspecting the two plots above
optimal_k = 5  # You should adjust this based on the plots

# Final K-Means fit; the labels are stored on the main dataframe
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(scaled_data)
earthquake_df['KMeans_Cluster'] = cluster_labels

# Bar chart of how many earthquakes fall into each cluster (ordered by label)
cluster_counts = earthquake_df['KMeans_Cluster'].value_counts().sort_index()
counts_fig, counts_ax = plt.subplots(figsize=(10, 6))
cluster_counts.plot(kind='bar', ax=counts_ax)
counts_ax.set_xlabel('Cluster')
counts_ax.set_ylabel('Number of Earthquakes')
counts_ax.set_title('Distribution of Earthquakes Across K-Means Clusters')
counts_ax.grid(True, alpha=0.3)
plt.show()

## 3. Visualizing K-Means Clusters

Exploring the geographical and characteristic distribution of the identified clusters.

In [None]:
# Per-cluster profile: mean position, depth and magnitude plus member count
cluster_analysis = earthquake_df.groupby('KMeans_Cluster').agg({
    'Longitude': 'mean',
    'Latitude': 'mean',
    'Depth': 'mean',
    'Magnitude': 'mean',
    'KMeans_Cluster': 'count'
}).rename(columns={'KMeans_Cluster': 'Count'})

print("Cluster characteristics:")
print(cluster_analysis)

# --- Folium map of a sample of events, coloured by K-Means cluster ---------
kmeans_map = folium.Map(location=[38.5, 35.5], zoom_start=6)

# Discrete marker colours; cluster ids beyond the list length wrap around
cluster_colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'darkblue', 'cadetblue']

# Seeded sample so that the saved map is the same on every run
kmeans_map_sample = earthquake_df.sample(min(5000, len(earthquake_df)), random_state=42)

for idx, row in kmeans_map_sample.iterrows():
    # Cast to int: a row Series may hold the label as a float
    cluster_id = int(row['KMeans_Cluster'])
    color = cluster_colors[cluster_id % len(cluster_colors)]

    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],  # Latitude first, Longitude second
        radius=3 + (row['Magnitude'] - 4)/2,  # Size based on magnitude
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=f"Cluster: {cluster_id}<br>Magnitude: {row['Magnitude']}"
    ).add_to(kmeans_map)

# Mark the mean position of each cluster with a larger, black-edged marker
for cluster_id, group in earthquake_df.groupby('KMeans_Cluster'):
    center_lat = group['Latitude'].mean()
    center_lon = group['Longitude'].mean()
    color = cluster_colors[int(cluster_id) % len(cluster_colors)]

    folium.CircleMarker(
        location=[center_lat, center_lon],  # Latitude first, Longitude second
        radius=8,
        color='black',
        fill=True,
        fill_color=color,
        fill_opacity=0.9,
        popup=f"Cluster Center {cluster_id}"
    ).add_to(kmeans_map)

# Add an alternative base layer
folium.TileLayer('cartodbpositron').add_to(kmeans_map)

# Save the map
kmeans_map.save('kmeans_clusters_map.html')
print("K-means cluster map saved as 'kmeans_clusters_map.html'")

# --- Plotly versions --------------------------------------------------------
# Cluster ids are categories, not quantities. Passing the integer column with a
# continuous colour scale makes Plotly draw a colour bar with blended colours,
# so a string copy of the label is used together with a discrete palette.
# The copy lives in a temporary frame; earthquake_df itself is not modified.
kmeans_label_col = 'KMeans_Cluster_Label'
kmeans_plot_df = earthquake_df.assign(
    **{kmeans_label_col: earthquake_df['KMeans_Cluster'].astype(str)}
)
# Legend order follows the numeric order of the cluster ids
kmeans_label_order = [str(c) for c in sorted(earthquake_df['KMeans_Cluster'].unique())]

fig_kmeans = px.scatter_mapbox(
    kmeans_plot_df.sample(min(3000, len(kmeans_plot_df)), random_state=42),
    lat='Latitude',
    lon='Longitude',
    color=kmeans_label_col,
    size='Magnitude',
    color_discrete_sequence=px.colors.qualitative.Bold,
    category_orders={kmeans_label_col: kmeans_label_order},
    labels={kmeans_label_col: 'Cluster'},
    size_max=15,
    zoom=5,
    center={"lat": 38.5, "lon": 35.5},  # Same centre as the folium map
    mapbox_style="open-street-map",
    title='K-Means Clusters of Turkish Earthquakes',
    hover_data=['Depth', 'Magnitude', 'KMeans_Cluster']
)
fig_kmeans.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig_kmeans.write_html('kmeans_clusters_map_plotly.html')

# 3D scatter (longitude, latitude, depth) of every event, coloured by cluster
fig = px.scatter_3d(
    kmeans_plot_df,
    x='Longitude',
    y='Latitude',
    z='Depth',
    color=kmeans_label_col,
    size='Magnitude',
    color_discrete_sequence=px.colors.qualitative.G10,
    category_orders={kmeans_label_col: kmeans_label_order},
    labels={kmeans_label_col: 'Cluster'},
    title='3D Visualization of Earthquake Clusters'
)
# Axis labels, with the depth axis reversed so deeper events plot lower
fig.update_layout(scene=dict(
    xaxis_title='Longitude',
    yaxis_title='Latitude',
    zaxis_title='Depth (km)',
    zaxis=dict(autorange="reversed")
))
fig.write_html('kmeans_clusters_3d.html')

# --- Pairplot of the clustering features, coloured by cluster ---------------
# sns.pairplot creates its own figure, so no plt.figure() call precedes it
# (that would only leave an empty figure in the output). tight_layout is not
# re-applied afterwards: the grid has already laid itself out around its
# legend and a second pass can push the panels underneath it.
sns.pairplot(earthquake_df[clustering_features + ['KMeans_Cluster']],
             hue='KMeans_Cluster', palette='viridis')
plt.suptitle('Pairwise Feature Relationships by Cluster', y=1.02)
plt.show()

## 4. DBSCAN Clustering

Using DBSCAN to identify dense clusters and potential outliers in earthquake data.

In [None]:
# DBSCAN needs an eps radius and a min_samples threshold; a k-distance graph
# is drawn first to guide the choice of eps
from sklearn.neighbors import NearestNeighbors

# Query the 20 nearest neighbours of every point in the scaled feature space
# (the query points are the same points the index was fitted on)
neighbors = NearestNeighbors(n_neighbors=20)
neighbors_fit = neighbors.fit(scaled_data)
distances, indices = neighbors_fit.kneighbors(scaled_data)

# Keep the last returned neighbour distance (column 19) and sort it ascending
distances = np.sort(distances[:, 19])  # 20th neighbor

# k-distance graph: the "elbow" of this curve suggests a value for eps
kdist_fig, kdist_ax = plt.subplots(figsize=(12, 6))
kdist_ax.plot(distances)
kdist_ax.set_xlabel('Points sorted by distance')
kdist_ax.set_ylabel('20th Nearest Neighbor Distance')
kdist_ax.set_title('K-Distance Graph for DBSCAN Epsilon Selection')
kdist_ax.grid(True, alpha=0.3)
kdist_fig.tight_layout()
plt.show()

# Hyper-parameters picked by reading the plot above
epsilon = 0.5  # Adjust based on the plot
min_samples = 20  # Minimum neighbors for a core point

# Fit DBSCAN and store the labels (-1 marks noise) on the main dataframe
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples)
dbscan_labels = dbscan.fit_predict(scaled_data)
earthquake_df['DBSCAN_Cluster'] = dbscan_labels

# Noise points carry the label -1 and are not counted as a cluster
is_noise = dbscan_labels == -1
n_noise = int(is_noise.sum())
n_clusters = len(set(dbscan_labels[~is_noise]))

print(f"DBSCAN found {n_clusters} clusters and {n_noise} noise points.")
print(f"Percentage of noise points: {n_noise / len(dbscan_labels) * 100:.2f}%")

# Bar chart of events per DBSCAN label (noise included, ordered by label)
cluster_counts = earthquake_df['DBSCAN_Cluster'].value_counts().sort_index()
dbscan_counts_fig, dbscan_counts_ax = plt.subplots(figsize=(12, 6))
cluster_counts.plot(kind='bar', ax=dbscan_counts_ax)
dbscan_counts_ax.set_xlabel('Cluster')
dbscan_counts_ax.set_ylabel('Number of Earthquakes')
dbscan_counts_ax.set_title('Distribution of Earthquakes Across DBSCAN Clusters')
dbscan_counts_ax.grid(True, alpha=0.3)
dbscan_counts_fig.tight_layout()
plt.show()

## 5. Visualizing DBSCAN Clusters

Mapping the DBSCAN clusters to identify high-density earthquake zones and outliers.

In [None]:
# --- Folium map of a sample of events, coloured by DBSCAN cluster -----------
dbscan_map = folium.Map(location=[38.5, 35.5], zoom_start=6)

# Number of clusters excluding the noise label (-1)
num_clusters = len(set(earthquake_df['DBSCAN_Cluster'])) - (1 if -1 in earthquake_df['DBSCAN_Cluster'].values else 0)
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in
# Matplotlib 3.9; the second argument resamples the map to that many colours
colormap = plt.get_cmap('tab20', max(num_clusters + 1, 2))


def get_cluster_color(cluster_id):
    """Return the hex colour used for a DBSCAN label.

    Parameters
    ----------
    cluster_id : int
        DBSCAN label; -1 denotes a noise point.

    Returns
    -------
    str
        '#000000' for noise, otherwise the colormap entry selected by
        ``cluster_id % max(num_clusters, 1)`` converted to a hex string.
    """
    if cluster_id == -1:  # Noise points are drawn in black
        return '#000000'
    rgba = colormap(cluster_id % max(num_clusters, 1))
    return mcolors.rgb2hex(rgba)


# Seeded sample (for rendering performance) so the saved map is reproducible
dbscan_map_sample = earthquake_df.sample(min(5000, len(earthquake_df)), random_state=42)

for idx, row in dbscan_map_sample.iterrows():
    cluster_id = int(row['DBSCAN_Cluster'])
    color = get_cluster_color(cluster_id)

    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],  # Latitude first, then Longitude
        radius=3 + (row['Magnitude'] - 4)/2,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7 if cluster_id != -1 else 0.3,  # Noise points are more transparent
        popup=f"Cluster: {cluster_id}<br>Magnitude: {row['Magnitude']}"
    ).add_to(dbscan_map)

# Add an alternative base layer
folium.TileLayer('cartodbpositron').add_to(dbscan_map)

# Save the map
dbscan_map.save('dbscan_clusters_map.html')
print("DBSCAN cluster map saved as 'dbscan_clusters_map.html'")

# --- Plotly versions --------------------------------------------------------
# DBSCAN labels are categories. With the integer column and a continuous colour
# scale Plotly would draw a colour bar with blended colours, so a string copy
# of the label is used with a discrete palette. The copy lives in a temporary
# frame; earthquake_df itself is not modified.
dbscan_label_col = 'DBSCAN_Cluster_Label'
dbscan_plot_df = earthquake_df.assign(
    **{dbscan_label_col: earthquake_df['DBSCAN_Cluster'].astype(str)}
)
# Legend order follows the numeric order of the labels (noise, -1, first)
dbscan_label_order = [str(c) for c in sorted(earthquake_df['DBSCAN_Cluster'].unique())]

fig_dbscan = px.scatter_mapbox(
    dbscan_plot_df.sample(min(3000, len(dbscan_plot_df)), random_state=42),
    lat='Latitude',
    lon='Longitude',
    color=dbscan_label_col,
    size='Magnitude',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    category_orders={dbscan_label_col: dbscan_label_order},
    labels={dbscan_label_col: 'Cluster'},
    size_max=15,
    zoom=5,
    center={"lat": 38.5, "lon": 35.5},  # Same centre as the folium map
    mapbox_style="open-street-map",
    title='DBSCAN Clusters of Turkish Earthquakes',
    hover_data=['Depth', 'Magnitude', 'DBSCAN_Cluster']
)
fig_dbscan.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig_dbscan.write_html('dbscan_clusters_map_plotly.html')

# Density heatmap of all events, weighted by magnitude
fig_density = px.density_mapbox(
    earthquake_df,
    lat='Latitude',
    lon='Longitude',
    z='Magnitude',
    radius=10,
    zoom=5,
    center={"lat": 38.5, "lon": 35.5},
    mapbox_style="open-street-map",
    title='Earthquake Density Heatmap'
)
fig_density.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig_density.write_html('earthquake_density_map.html')

# Per-cluster profile (noise excluded): mean position, depth, magnitude, count
dbscan_analysis = earthquake_df[earthquake_df['DBSCAN_Cluster'] != -1].groupby('DBSCAN_Cluster').agg({
    'Longitude': 'mean',
    'Latitude': 'mean',
    'Depth': 'mean',
    'Magnitude': 'mean',
    'DBSCAN_Cluster': 'count'
}).rename(columns={'DBSCAN_Cluster': 'Count'})

print("DBSCAN cluster characteristics (excluding noise):")
print(dbscan_analysis)

# Summary statistics of the points DBSCAN labelled as noise
noise_points = earthquake_df[earthquake_df['DBSCAN_Cluster'] == -1]
print("\nNoise point characteristics:")
print(noise_points[['Longitude', 'Latitude', 'Depth', 'Magnitude']].describe())

# 3D scatter (longitude, latitude, depth) of every event, coloured by label
fig = px.scatter_3d(
    dbscan_plot_df,
    x='Longitude',
    y='Latitude',
    z='Depth',
    color=dbscan_label_col,
    size='Magnitude',
    color_discrete_sequence=px.colors.qualitative.G10,
    category_orders={dbscan_label_col: dbscan_label_order},
    labels={dbscan_label_col: 'Cluster'},
    title='3D Visualization of DBSCAN Earthquake Clusters'
)
# Axis labels, with the depth axis reversed so deeper events plot lower
fig.update_layout(scene=dict(
    xaxis_title='Longitude',
    yaxis_title='Latitude',
    zaxis_title='Depth (km)',
    zaxis=dict(autorange="reversed")
))
fig.write_html('dbscan_clusters_3d.html')

## 6. Dimensionality Reduction with PCA

Using PCA to visualize high-dimensional earthquake data in 2D space.

In [None]:
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# pca_result rows are in the same order as earthquake_df rows, so the cluster
# labels and magnitudes are attached positionally (.to_numpy()) rather than by
# index alignment, which would insert NaN if earthquake_df had a non-default index
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
pca_df['KMeans_Cluster'] = earthquake_df['KMeans_Cluster'].to_numpy()
pca_df['DBSCAN_Cluster'] = earthquake_df['DBSCAN_Cluster'].to_numpy()
pca_df['Magnitude'] = earthquake_df['Magnitude'].to_numpy()

# Share of the total variance captured by each component
print("Explained variance ratio:")
print(pca.explained_variance_ratio_)
print(f"Total explained variance: {sum(pca.explained_variance_ratio_) * 100:.2f}%")

# Two stacked panels: the same projection coloured by K-Means and by DBSCAN
# labels, with marker size following magnitude
pca_fig, (kmeans_ax, dbscan_ax) = plt.subplots(2, 1, figsize=(12, 10))

sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='KMeans_Cluster', palette='viridis',
                size='Magnitude', sizes=(20, 200), alpha=0.6, ax=kmeans_ax)
kmeans_ax.set_title('PCA of Earthquake Data with K-Means Clusters')
kmeans_ax.grid(True, alpha=0.3)

sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='DBSCAN_Cluster', palette='viridis',
                size='Magnitude', sizes=(20, 200), alpha=0.6, ax=dbscan_ax)
dbscan_ax.set_title('PCA of Earthquake Data with DBSCAN Clusters')
dbscan_ax.grid(True, alpha=0.3)

pca_fig.tight_layout()
plt.show()

# Loadings: weight of each original feature in each principal component
loadings = pd.DataFrame(
    pca.components_.T,
    columns=['PC1', 'PC2'],
    index=clustering_features
)

print("\nPCA Feature Loadings:")
print(loadings)

# Grouped bars (PC1 next to PC2 for every feature). Drawing two plt.bar series
# at the same x positions would stack them on top of each other and hide the
# shorter bar whenever both loadings share a sign.
loadings_fig, loadings_ax = plt.subplots(figsize=(10, 8))
loadings.plot(kind='bar', ax=loadings_ax, alpha=0.7, rot=0)
loadings_ax.set_xlabel('Features')
loadings_ax.set_ylabel('Loading Strength')
loadings_ax.set_title('PCA Feature Loadings')
loadings_ax.legend()
loadings_ax.grid(True, alpha=0.3)
loadings_fig.tight_layout()
plt.show()

## 7. Risk Zone Identification

Using clustering results to identify high-risk earthquake zones.

In [None]:
# Risk score = equal-weight average of two components, each on a 0-10 scale:
#   1. Magnitude score of the event
#   2. Density score of the K-Means cluster the event belongs to
# No temporal / historical-frequency component is computed in this cell.

# Magnitude 4 maps to 0 and magnitude 8 to 10; values outside are clipped
earthquake_df['Magnitude_Score'] = ((earthquake_df['Magnitude'] - 4) * 2.5).clip(0, 10)

# Bounding-box extent (in coordinate units) of every K-Means cluster
kmeans_spatial_range = earthquake_df.groupby('KMeans_Cluster').agg({
    'Longitude': lambda x: x.max() - x.min(),
    'Latitude': lambda x: x.max() - x.min()
})

# A small offset keeps the box area non-zero for clusters whose events share
# one longitude or latitude, avoiding a division by zero below
kmeans_spatial_range = kmeans_spatial_range + 0.001

# Density = number of events divided by the bounding-box area of the cluster
kmeans_density = earthquake_df.groupby('KMeans_Cluster').size() / kmeans_spatial_range.prod(axis=1)

# Normalize so that the densest cluster scores 10
max_density = kmeans_density.max()
if max_density > 0:  # Also False when max_density is NaN (empty input)
    normalized_density = (kmeans_density / max_density) * 10
else:
    normalized_density = kmeans_density * 0

# Give every event the density score of its cluster
density_map = dict(zip(normalized_density.index, normalized_density.values))
earthquake_df['Density_Score'] = earthquake_df['KMeans_Cluster'].map(density_map)

# Final risk score: equal-weight average of the two components
earthquake_df['Risk_Score'] = (
    earthquake_df['Magnitude_Score'] * 0.5 +
    earthquake_df['Density_Score'] * 0.5
)

# Bin the score into zones. include_lowest=True puts a score of exactly 0 in
# 'Low'; without it that value falls outside the first bin and becomes NaN.
risk_zone_labels = ['Low', 'Medium', 'High']
earthquake_df['Risk_Zone'] = pd.cut(
    earthquake_df['Risk_Score'],
    bins=[0, 3, 6, 10],
    labels=risk_zone_labels,
    include_lowest=True
)

# value_counts() orders by frequency; reindexing to Low/Medium/High makes the
# positional bar colours below match the zone each bar represents
risk_distribution = (
    earthquake_df['Risk_Zone']
    .value_counts()
    .reindex(risk_zone_labels, fill_value=0)
)
print("Risk Zone Distribution:")
print(risk_distribution)

plt.figure(figsize=(10, 6))
risk_distribution.plot(kind='bar', color=['green', 'yellow', 'red'])
plt.title('Distribution of Earthquake Risk Zones')
plt.xlabel('Risk Zone')
plt.ylabel('Number of Earthquakes')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# --- Folium risk map ---------------------------------------------------------
risk_map = folium.Map(location=[38.5, 35.5], zoom_start=6)

# Marker colour per risk zone
risk_colors = {
    'Low': 'green',
    'Medium': 'orange',
    'High': 'red'
}

# One toggleable marker cluster layer per risk zone
marker_clusters = {
    'Low': MarkerCluster(name='Low Risk').add_to(risk_map),
    'Medium': MarkerCluster(name='Medium Risk').add_to(risk_map),
    'High': MarkerCluster(name='High Risk').add_to(risk_map)
}

# Rows without a zone (e.g. a missing magnitude gives a NaN score) cannot be
# coloured and would raise a KeyError in the lookups below, so they are left
# out. The sample is seeded so the saved map is reproducible.
zoned_events = earthquake_df.dropna(subset=['Risk_Zone'])
sampled_data = zoned_events.sample(min(5000, len(zoned_events)), random_state=42)

# Add markers coloured by risk zone to the matching layer
for idx, row in sampled_data.iterrows():
    risk_zone = row['Risk_Zone']
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],  # Latitude first, then Longitude
        radius=3 + (row['Magnitude'] - 4)/2,
        color=risk_colors[risk_zone],
        fill=True,
        fill_color=risk_colors[risk_zone],
        fill_opacity=0.7,
        popup=f"Risk: {risk_zone}<br>Score: {row['Risk_Score']:.2f}<br>Magnitude: {row['Magnitude']}"
    ).add_to(marker_clusters[risk_zone])

# Add an alternative base layer
folium.TileLayer('cartodbpositron').add_to(risk_map)

# Layer control lets the viewer toggle the layers added above
folium.LayerControl().add_to(risk_map)

# Save risk map to HTML file
risk_map.save('earthquake_risk_map.html')
print("Risk zone map saved as 'earthquake_risk_map.html'")

## 8. Conclusion and Findings

Summary of unsupervised learning insights and risk zone identification results.

In [None]:
# Persist the dataframe, including the cluster labels and risk columns
earthquake_df.to_csv('earthquake_clusters.csv', index=False)

# Magnitude, depth and risk-score statistics per (cluster, risk zone) pair.
# Risk_Zone is categorical, so observed=True is passed explicitly: only pairs
# that actually occur are listed, the result does not depend on the pandas
# version's default, and no FutureWarning about that default is emitted.
cluster_risk_summary = earthquake_df.groupby(
    ['KMeans_Cluster', 'Risk_Zone'], observed=True
).agg({
    'Magnitude': ['mean', 'min', 'max', 'count'],
    'Depth': ['mean', 'min', 'max'],
    'Risk_Score': ['mean', 'min', 'max']
})

print("Cluster and Risk Zone Summary:")
print(cluster_risk_summary)

# Final figure: every event at its longitude/latitude, coloured by risk score
risk_fig, risk_ax = plt.subplots(figsize=(12, 8))
scatter = risk_ax.scatter(
    earthquake_df['Longitude'],  # X-axis is Longitude
    earthquake_df['Latitude'],   # Y-axis is Latitude
    c=earthquake_df['Risk_Score'],
    cmap='RdYlGn_r',  # Reversed map: red for high scores, green for low
    alpha=0.7,
    s=20
)
risk_fig.colorbar(scatter, ax=risk_ax, label='Risk Score')
risk_ax.set_title('Earthquake Risk Zones in Turkey')
risk_ax.set_xlabel('Longitude')
risk_ax.set_ylabel('Latitude')
risk_ax.grid(True, alpha=0.3)
risk_fig.tight_layout()
# Save before plt.show(), which may release the figure
risk_fig.savefig('earthquake_risk_zones.png', dpi=300)
plt.show()

print("Unsupervised learning analysis completed!")