In [None]:
# Author: Pierre Jeanne
# Project Name: Earthquakes, 1965-2016
# Date Created: 02 April 2021
# from: https://www.kaggle.com/usgs/earthquake-database

## INTRODUCTION
This dataset includes a record of the date, time, location, depth, magnitude, and source of every earthquake with a reported magnitude 5.5 or higher since 1965.

In [None]:
import numpy as np
import pandas as pd
# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import seaborn as sns; sns.set()
# stat on data
from scipy import stats
from scipy.stats import norm, skew

from statsmodels.tsa.arima.model import ARIMA
import cartopy.feature as cfeature

In [None]:
# Load the raw earthquake catalogue from the Kaggle input directory
# and preview the first rows.
df = pd.read_csv(r"../input/earthquake-database/database.csv")
df.head()

In [None]:
# Report (rows, columns) of the raw frame under a small text header.
print('Shape of the file', '-'*30, df.shape, sep='\n')

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>1- Data Cleaning</center></h3>

## 1.1: Verify if NaN values:

In [None]:
def msv1(data, thresh=20, color='black', edgecolor='black', width=15, height=3):
    """Plot the percentage of missing values per column as a bar chart.

    SOURCE: https://www.kaggle.com/amiiiney/price-prediction-regularization-stacking

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected for missing values.
    thresh : float, default 20
        Percentage at which the red horizontal reference line is drawn.
    color, edgecolor
        Forwarded to the pandas bar plot.
    width, height : float
        Figure size in inches.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(width, height))

    # Share of missing values per column, in percent, largest first.
    percentage = data.isnull().mean() * 100
    percentage.sort_values(ascending=False).plot.bar(color=color, edgecolor=edgecolor)

    plt.axhline(y=thresh, color='r', linestyle='-')
    plt.title('Missing values percentage per column', fontsize=20, weight='bold')

    # Both annotations start a little right of the middle bar
    # (one bar per column, so the x axis runs from 0 to n_columns - 1).
    text_x = len(percentage) / 1.7
    plt.text(text_x, thresh + 12.5, f'Columns with more than {thresh}% missing values',
             fontsize=12, color='crimson', ha='left', va='top')
    # The "%" sign was missing here, which made the label read as a count.
    plt.text(text_x, thresh - 5, f'Columns with less than {thresh}% missing values',
             fontsize=12, color='green', ha='left', va='top')

    plt.xlabel('Columns', size=15, weight='bold')
    plt.ylabel('Missing values percentage')

    plt.show()

In [None]:
# Missing-value share per column, with the reference line drawn at 30 %.
msv1(df, 30, color=sns.color_palette('Reds',15))

In [None]:
# drop all the columns with more than 30% of missing values
# (thresh is the minimum number of non-NaN values a column needs in order to be kept)
df = df.dropna(thresh=len(df)*0.70, axis=1)

## 1.2: replace nan values

In [None]:
# Columns that still contain at least one NaN, and the sub-frame restricted to them
has_nan = df.isnull().any()
col_with_NaN_value = df.columns[has_nan]
df_nan = df.loc[:, col_with_NaN_value]

# Number of NaN values in each of those columns
print('Number of NaN values', '-'*30, sep='\n')
df_nan.isna().sum()

In [None]:
# drop the rows with a missing 'Magnitude Type' (3 rows when this was written)
df = df.dropna(subset=['Magnitude Type'])

# impute the missing 'Root Mean Square' values with the column mean
rms_mean = df['Root Mean Square'].mean()
df['Root Mean Square'] = df['Root Mean Square'].fillna(value=rms_mean)

# verify operations: printed explicitly, because a bare expression is only
# displayed by Jupyter when it is the last statement of the cell
print(df.isnull().sum())

# reset index; the previous labels are kept in a new 'index' column
df = df.reset_index()

## 1.3: Verify dtypes

In [None]:
# Data type pandas inferred for each remaining column.
df.dtypes

### 1.4: Parsing datetime

In [None]:
# exploring the length of the Date strings: any length other than the
# dominant one points at entries stored in a different format
lengths = df["Date"].str.len()
lengths.value_counts()

The dataframe contains 3 rows with malformed dates (24 characters instead of 10). Let's have a look at these 3 rows.

In [None]:
# Having a look at the suspicious rows: those whose Date string is 24 characters long.
# Their index *labels* are collected (not their positions), because they are fed to
# the label-based .loc below; this stays correct even if the index is not 0..n-1.
wrongdates = lengths.index[lengths == 24].to_numpy()
print("Row index with wrong dates:", wrongdates)
df.loc[wrongdates]

In [None]:
# Manual corrections for the rows whose Date/Time fields are malformed:
# row label -> (date as MM/DD/YYYY, time as HH:MM:SS)
date_time_fixes = {
    3378: ("02/23/1975", "02:58:41"),
    7510: ("04/28/1985", "02:53:41"),
    20647: ("03/13/2011", "02:23:34"),
}

# The labels are hard-coded, so check them before writing: assigning through .loc to
# a label that does not exist would silently append a new row instead of fixing one.
missing_labels = [label for label in date_time_fixes if label not in df.index]
assert not missing_labels, f"rows to fix are not in df: {missing_labels}"

# Every 24-character date still present must have a correction
# (on a re-run there are none left, so this check passes trivially).
unexpected = df.index[df["Date"].str.len() == 24].difference(list(date_time_fixes))
assert unexpected.empty, f"malformed dates without a manual fix: {list(unexpected)}"

for label, (fixed_date, fixed_time) in date_time_fixes.items():
    df.loc[label, "Date"] = fixed_date
    df.loc[label, "Time"] = fixed_time

In [None]:
# exploring the length of the Time strings after the manual fixes:
# a single length is expected if all entries now share one format
lengths = df["Time"].str.len()
lengths.value_counts()

In [None]:
# Build one timestamp column out of the separate Date and Time strings
date_and_time = df['Date'].str.cat(df['Time'], sep=' ')
df['datetime'] = pd.to_datetime(date_and_time, format='%m/%d/%Y %H:%M:%S')

# Date and Time are now redundant; 'index' is the leftover of the earlier reset_index()
df = df.drop(columns=['Date', 'Time', 'index'])

### 1.5: Verify Value consistency

In [None]:
# Print the distinct values of every categorical column, one column per paragraph.
# Each pair is (text printed in front, column name).
labelled_columns = [
    ('Type: ', 'Type'),
    ('Magnitude Type', 'Magnitude Type'),
    ('Source', 'Source'),
    ('Location Source', 'Location Source'),
    ('Magnitude Source', 'Magnitude Source'),
    ('Status', 'Status'),
]
for shown_label, column_name in labelled_columns:
    print(shown_label, df[column_name].unique(), "\n")

In [None]:
fig = plt.figure(figsize=(15,10))
fig.subplots_adjust(hspace=0.4,wspace=0.3)

# One pie per categorical column: (column, panel title, legend title)
pie_panels = [
    ('Type', 'Type distribution', 'Type'),
    ('Magnitude Type', 'Magnitude Type distribution', 'Mg type'),
]

for position, (column_name, panel_title, legend_title) in enumerate(pie_panels, start=1):
    panel = fig.add_subplot(2, 2, position)
    # The shares already sum to 1, hence normalize=False in the pie call.
    shares = df[column_name].value_counts(normalize=True)
    shares.plot.pie(ax=panel, legend=False, shadow=True, autopct='%1.1f%%',
                    pctdistance=0.75, radius=1.05, wedgeprops={'linewidth': 0.1},
                    textprops={'fontsize': 12}, normalize=False)
    panel.set_title(panel_title, fontsize=16, weight='bold')
    panel.legend(fancybox=True, shadow=True, title=legend_title, fontsize=11,
                 loc='upper right', bbox_to_anchor=(1.7, 1))
    panel.set_ylabel("")

plt.show()

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>2- Magnitude Conversion and  Earthquake locations</center></h3>

In [None]:
# Keep only the rows whose Type is 'Earthquake'.
# An explicit copy is taken so that the columns added in later cells are written to an
# independent frame rather than to a slice of the previous one (avoids SettingWithCopyWarning).
df = df[df['Type']=='Earthquake'].copy()

### Homogeneisation of the seismic magnitude
Seismic magnitude scales (Ml, Ms, Md etc.) are used to describe the overall strength or "size" of an earthquake. Magnitudes are usually determined from measurements of an earthquake's seismic waves as recorded on a seismogram. Magnitude scales vary on what aspect of the seismic waves are measured and how they are measured. Different magnitude scales are necessary because of differences in earthquakes, the information available, and the purposes for which the magnitudes are used.

Moment magnitude (Mw) is considered the authoritative magnitude scale for ranking earthquakes by size. It is more directly related to the energy of an earthquake than other scales, and does not saturate—that is, it does not underestimate magnitudes as other scales do in certain conditions. It has become the standard scale used by seismological authorities . Subtypes of the moment magnitude scale (Mww , etc.) reflect different ways of estimating the seismic moment.

Over the years, empirical relations were developed to link the various seismic magnitude scales to Mw. These relations are not perfect and are usually site-specific. Nevertheless, we will try to convert all the seismic magnitudes into moment magnitude.

In [None]:
### ----------  Equations from the literature to convert seismic magnitudes to moment magnitude --------------
#--------- Thatcher & Hanks (1973): seismic moment Mo (in dyne.cm) from local magnitude
# log10(Mo) = 1.5*ML + 16.0
#--------- Hanks & Kanamori (1979): moment magnitude from Mo (in dyne.cm)
# Mw = (2/3)*log10(Mo) - 10.7
#--------- Sitaram and Bora (2007)
# Md = 1.5*Ml -0.17
# NOTE(review): the MD branch below uses Mw = 0.93*Md + 0.35, which is not the
# relation quoted above -- confirm which source these coefficients come from.
#--------- (Scordilis, 2006):
# -- for 3.0 <= MS <= 6.1
# Mw = 0.67 MS + 2.07
# -- for 6.2 <= MS <= 8.2
# Mw = 0.99 MS + 0.08
# -- for 3.5 <= mb <= 6.2
# Mw = 0.85 mb + 1.03

mag = df['Magnitude']
mag_type = df['Magnitude Type']

condlist = [mag_type == 'ML',
            mag_type == 'MD',
            (mag_type == 'MS') & (mag <= 6.1),
            (mag_type == 'MS') & (mag > 6.1),
            (mag_type == 'MB') & (mag <= 6.2)]

# ML -> Mw. Both relations must use the same unit for Mo. The previous version
# combined the N.m form of the first one (+9.0) with the dyne.cm constant of the
# second one (-10.7), which reduces to Mw = ML - 4.7, i.e. far too low.
# log10(Mo) is used directly instead of computing 10**x only to take log10 again.
log10_moment_dyne_cm = 1.5 * mag + 16.0

choicelist = [(2/3) * log10_moment_dyne_cm - 10.7,
              0.93 * mag + 0.35,
              0.67 * mag + 2.07,
              0.99 * mag + 0.08,
              0.85 * mag + 1.03]

# Rows matching none of the conditions keep their reported magnitude.
df['Moment magnitude'] = np.select(condlist, choicelist, default=mag)

### comparasison magnitude vs seismic moment

In [None]:
import cartopy.crs as ccrs
import cartopy

In [None]:
plt.figure(figsize=(10,10))

# World map of the epicentres; marker size and colour both encode the reported magnitude
plate_carree = ccrs.PlateCarree()
ax1 = plt.axes(projection=plate_carree)
magnitudes = df['Magnitude']
sc = ax1.scatter(df['Longitude'], df['Latitude'], s=magnitudes, c=magnitudes,
                 transform=plate_carree, label=magnitudes)

# Legend built from the colour levels of the scatter
legend_handles, legend_labels = sc.legend_elements()
legend1 = ax1.legend(legend_handles, legend_labels, loc="upper right",
                     title="magnitudes", bbox_to_anchor=[1.2,1])
ax1.add_artist(legend1)

# Background image and coastlines
ax1.stock_img()
ax1.coastlines()

# Full-globe extent with labelled grid lines
ax1.set_global()
ax1.gridlines(draw_labels=True, dms=False, x_inline=False, y_inline=False)

ax1.set_title('Earthquake locations 1965-2016')

plt.show()

In [None]:
plt.figure(figsize=(10,10))

# Same map as before, drawn from the converted moment magnitude
# (marker size is twice the value, colour is the value itself)
map_projection = ccrs.PlateCarree()
ax2 = plt.axes(projection=map_projection)
moment_magnitudes = df['Moment magnitude']
sc = ax2.scatter(df['Longitude'], df['Latitude'], s=moment_magnitudes*2, c=moment_magnitudes,
                 transform=map_projection, label=moment_magnitudes)

# Legend built from the colour levels of the scatter
colour_handles, colour_labels = sc.legend_elements()
legend2 = ax2.legend(colour_handles, colour_labels, loc="upper right",
                     title="Moment magnitude", bbox_to_anchor=[1.2,1])
ax2.add_artist(legend2)

# Background image and coastlines
ax2.stock_img()
ax2.coastlines()

# Full-globe extent with labelled grid lines
ax2.set_global()
ax2.gridlines(draw_labels=True, dms=False, x_inline=False, y_inline=False)

ax2.set_title('Earthquake locations 1965-2016')

plt.show()

Some of the calculated Moment magnitudes are way too low. It doesn't work... I will keep working with the different earthquake magnitude

In [None]:
# Remove the bookkeeping columns and the experimental 'Moment magnitude' column
columns_to_drop = ['Type', 'ID', 'Source', 'Location Source',
                   'Magnitude Source', 'Status', 'Moment magnitude']
df = df.drop(columns=columns_to_drop)

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>3- Earthquake clustering and feature engineering</center></h3>

The goal here is to classify the earthquakes into 4 groups:

- **Extension**: oceanic ridge
- **Subduction**: the oceanic lithosphere dives beneath the continental lithosphere
- **Convergence**: two continental lithospheres collide
- **Transform**: tectonic plates slide sideways past each other

The strategy is to create many small clusters (25), look at their locations, assign each of them to a tectonic setting, and make manual corrections where needed.

In [None]:
## ------------------- 1 - Create 25 clusters  -----------------
# Import KMeans
from sklearn.cluster import KMeans

# Cluster on the epicentre coordinates only
X = df[['Latitude','Longitude']].values

# Create a KMeans instance with 25 clusters: model
# random_state and n_init are both pinned because the cluster -> tectonic setting
# table used afterwards refers to specific label numbers. n_init=10 was the
# scikit-learn default when this notebook was written; newer releases default to 'auto'.
# NOTE(review): labels may still differ between scikit-learn versions -- check the
# cluster map before trusting the table.
model = KMeans(n_clusters=25, random_state=2, n_init=10)

# Fit the model and label every point in one pass
cluster = model.fit_predict(X)

# add results to df
df['cluster'] = pd.Series(cluster, index=df.index)

In [None]:
## ------------------- 2- plot the 25 cluster on a map  -----------------
plt.figure(figsize=(10,10))

# Epicentres coloured by their k-means label
plate_carree = ccrs.PlateCarree()
ax = plt.axes(projection=plate_carree)
cluster_labels = df['cluster']
sc = ax.scatter(df['Longitude'], df['Latitude'], c=cluster_labels,
                transform=plate_carree, label=cluster_labels)

# Background, grid and title
ax.stock_img()
ax.coastlines()
ax.gridlines(draw_labels=True, dms=False, x_inline=False, y_inline=False)
ax.set_title('Earthquake clustering')

ax.set_global()

In [None]:
## -------------------  3- assign each cluster to the relevant geological setting
# cluster label -> (tectonic setting, region)
# The spellings are kept exactly as they are, because later cells select regions by these strings.
cluster_settings = {
    0: ('Subduction', 'Pacific plate West'),
    1: ('Subduction', 'Philipine sea plate'),
    2: ('Subduction', 'Carribbean plate'),
    3: ('Convergence', 'Europe'),
    4: ('Subduction', 'Pacific plate West'),
    5: ('Subduction', 'Pacific plate North'),
    6: ('Extension', 'Antartic plate'),
    7: ('Convergence', 'Indian plate'),
    8: ('Subduction', 'Pacific plate North'),
    9: ('Extension', 'Antartic plate'),
    10: ('Subduction', 'Philipine sea plate'),
    11: ('Subduction', 'Nazca plate'),
    12: ('Extension', 'Antartic plate'),
    13: ('Transform', 'Pacific plate East'),
    14: ('Extension', 'Antartic plate'),
    15: ('Subduction', 'Pacific plate West'),
    16: ('Extension', 'Ridge Atlantic'),
    17: ('Extension', 'Antartic plate'),
    18: ('Subduction', 'Australian plate'),
    19: ('Extension', 'Pacific plate East'),
    20: ('Subduction', 'Philipine sea plate'),
    21: ('Convergence', 'Western Asia'),
    22: ('Extension', 'Ridge Atlantic'),
    23: ('Subduction', 'Cosco plate'),
    24: ('Subduction', 'Philipine sea plate'),
}

# Every label produced by the clustering must have an entry, otherwise rows would
# silently end up with NaN in both columns.
unmapped_clusters = set(df['cluster'].unique()) - set(cluster_settings)
assert not unmapped_clusters, f"clusters without a tectonic setting: {sorted(unmapped_clusters)}"

df['Tecto-setting'] = df['cluster'].map({label: pair[0] for label, pair in cluster_settings.items()})
df['region'] = df['cluster'].map({label: pair[1] for label, pair in cluster_settings.items()})

# drop cluster
df = df.drop(['cluster'], axis=1)

## ---------------- 4- fix manually few mistakes
# Rectangles (lon_min, lon_max, lat_min, lat_max, tectonic setting, region).
# Longitude is tested as lon_min <= lon < lon_max, latitude as lat_min <= lat <= lat_max.
# They are applied in order, so where two rectangles overlap the later one wins.
manual_fixes = [
    (0, 150, 70, 90, 'Extension', 'Rigde Artic'),
    (0, 43, -5, 30, 'Extension', 'Arabian plate'),
    (43, 70, -5, 19, 'Extension', 'Arabian plate'),
    (-120, -100, -5, 13, 'Extension', 'Pacific plate East'),
    (-100, -90, -5, 5, 'Extension', 'Pacific plate East'),
    (-90, -80, -70, -30, 'Extension', 'Antartic plate'),
    (-100, -68, -15, 12, 'Subduction', 'Nazca plate'),
    (-20, 0, -53, 23, 'Extension', 'Ridge Atlantic'),
    (156, 180, -56, 38, 'Subduction', 'Pacific plate West'),
    (20, 50, -32, 0, 'Extension', 'Africain rift'),
]

for lon_min, lon_max, lat_min, lat_max, setting, region_name in manual_fixes:
    in_box = ((df['Longitude'] >= lon_min) & (df['Longitude'] < lon_max)
              & (df['Latitude'] >= lat_min) & (df['Latitude'] <= lat_max))
    df.loc[in_box, ['Tecto-setting', 'region']] = setting, region_name

In [None]:
# One sub-frame per tectonic setting, reused by the ECDF plots further down
tectonic_setting = df['Tecto-setting']
df_convergence = df.loc[tectonic_setting.eq('Convergence')]
df_subduction = df.loc[tectonic_setting.eq('Subduction')]
df_extension = df.loc[tectonic_setting.eq('Extension')]
df_transform = df.loc[tectonic_setting.eq('Transform')]

In [None]:
# Map of the epicentres coloured by tectonic setting
plt.figure(figsize=(10,10))

plate_carree = ccrs.PlateCarree()
ax = plt.axes(projection=plate_carree)
# seaborn draws on the axes that were just created
sc = sns.scatterplot(data=df, x='Longitude', y='Latitude', hue='Tecto-setting',
                     transform=plate_carree, ax=ax)

ax.legend(loc="upper right", title=" Tectonic setting",bbox_to_anchor=[1.3,1])

# Background and grid
ax.stock_img()
ax.coastlines()
ax.gridlines(draw_labels=True, dms=False, x_inline=False, y_inline=False)

ax.set_global()

In [None]:
# Map of the epicentres coloured by region
plt.figure(figsize=(10,10))

map_projection = ccrs.PlateCarree()
ax = plt.axes(projection=map_projection)
# seaborn draws on the axes that were just created
sc = sns.scatterplot(data=df, x='Longitude', y='Latitude', hue='region',
                     transform=map_projection, ax=ax)

ax.legend(loc="upper right", title=" Regions",bbox_to_anchor=[1.3,1])

# Background and grid
ax.stock_img()
ax.coastlines()
ax.gridlines(draw_labels=True, dms=False, x_inline=False, y_inline=False)

ax.set_global()

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>4- EDA</center></h3>

In [None]:
fig = plt.figure(figsize=(5,5))

# Share of events per tectonic setting (already sums to 1, hence normalize=False)
setting_shares = df['Tecto-setting'].value_counts(normalize=True)
setting_pie_options = dict(legend=False, shadow=True, autopct='%1.1f%%',
                           pctdistance=0.75, radius=1.05, wedgeprops={'linewidth': 0.1},
                           textprops={'fontsize': 12}, normalize=False)
ax = setting_shares.plot.pie(**setting_pie_options)

ax.set_title('Earthquake vs tectonic setting', fontsize=16, weight='bold')
ax.legend(fancybox=True, shadow=True, title='Tectonic setting', fontsize=11,
          loc='upper right', bbox_to_anchor=(1.7, 1))
ax.set_ylabel("")
plt.show()

In [None]:
fig = plt.figure(figsize=(15,4))

# One box plot per quantity, side by side, grouped by tectonic setting
for position, quantity in enumerate(["Magnitude", "Root Mean Square", "Depth"], start=1):
    panel = fig.add_subplot(1, 3, position)
    sns.boxplot(x="Tecto-setting", y=quantity, data=df, ax=panel)

plt.show()


In [None]:
def ecdf(data):
    """Return the empirical CDF of a one-dimensional sample as ``(x, y)``.

    ``x`` holds the observations in ascending order and ``y`` the matching
    cumulative fractions ``1/n, 2/n, ..., 1`` where ``n`` is ``len(data)``.
    """
    sorted_values = np.sort(data)
    sample_size = len(data)
    # The k-th smallest observation gets the fraction k / n
    cumulative_fraction = np.arange(1, sample_size + 1) / sample_size
    return sorted_values, cumulative_fraction

In [None]:
# Magnitudes per tectonic setting
mags_convergence = df_convergence['Magnitude']
mags_subduction = df_subduction['Magnitude']
mags_extension = df_extension['Magnitude']
mags_transform = df_transform['Magnitude']

# Figure layout and overall title
fig = plt.figure(figsize=(10,8))
fig.subplots_adjust(hspace=0.4,wspace=0.3)
fig.suptitle('Empirical Cumulative Distribution Function', fontsize=18)

# One panel per setting: (magnitudes, panel title, marker colour; None keeps the default colour)
ecdf_panels = [
    (mags_extension, 'Extension regime', None),
    (mags_subduction, 'Subduction regime', 'orange'),
    (mags_convergence, 'Convergence zone', 'green'),
    (mags_transform, 'Transform zone', 'red'),
]

for position, (mags, panel_title, marker_colour) in enumerate(ecdf_panels, start=1):
    fig.add_subplot(2, 2, position)
    marker_style = {'marker': '.', 'linestyle': 'none'}
    if marker_colour is not None:
        marker_style['color'] = marker_colour
    plt.plot(*ecdf(mags), **marker_style)
    plt.xlabel('magnitude')
    plt.ylabel('ECDF')
    # Largest magnitude of the group, written inside the panel
    plt.text(6.5, 0.2, 'max magnitude {}'.format(mags.max()),fontsize=12)
    plt.title(panel_title)

plt.show()

### 4.2- earthquake properites vs region

In [None]:
fig = plt.figure(figsize=(5,5))

# Share of events per region (already sums to 1, hence normalize=False)
region_shares = df['region'].value_counts(normalize=True)
region_pie_options = dict(legend=False, shadow=True, autopct='%1.1f%%',
                          pctdistance=0.75, radius=1.05, wedgeprops={'linewidth': 0.1},
                          textprops={'fontsize': 12}, normalize=False)
ax = region_shares.plot.pie(**region_pie_options)

ax.set_title('Earthquake per region', fontsize=16, weight='bold')
ax.legend(fancybox=True, shadow=True, title='Region', fontsize=11,
          loc='upper right', bbox_to_anchor=(1.9, 1))
ax.set_ylabel("")
plt.show()

In [None]:
fig = plt.figure(figsize=(15,4))

# Regions ordered by the median of each quantity (used as the box order)
grouped_mag = df.loc[:, ['region', 'Magnitude']].groupby(['region']).median().sort_values(by='Magnitude')
grouped_RMS = df.loc[:, ['region', 'Root Mean Square']].groupby(['region']).median().sort_values(by='Root Mean Square')
grouped_depth = df.loc[:, ['region', 'Depth']].groupby(['region']).median().sort_values(by='Depth')

# One box plot per quantity, side by side
region_panels = [("Magnitude", grouped_mag), ("Root Mean Square", grouped_RMS), ("Depth", grouped_depth)]
for position, (quantity, medians) in enumerate(region_panels, start=1):
    panel = fig.add_subplot(1, 3, position)
    sns.boxplot(x="region", y=quantity, data=df, order=medians.index, ax=panel)
    # Region names are long: rotate the tick labels of the panel just drawn
    plt.xticks(rotation=90)

plt.show()


### 4.3- Time and distance between two consecutive earthquakes per region 

In [None]:
def calcul_distance_and_time_between_quake_per_region(region, data=None):
    """Distance and elapsed time between consecutive earthquakes of one region.

    Parameters
    ----------
    region : str
        Value of the ``region`` column to select.
    data : pandas.DataFrame, optional
        Frame with the columns ``region``, ``Latitude``, ``Longitude``, ``Depth``
        and ``datetime``. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        The events of ``region`` in chronological order, without the first one
        (it has no predecessor), with two extra columns:

        * ``distance``: straight-line approximation between an event and the
          previous one, ``sqrt(surface_distance**2 + depth_difference**2)``,
          where the surface distance is the great-circle distance in km.
          Assumes ``Depth`` is expressed in km -- TODO confirm.
        * ``delta``: whole days elapsed since the previous event.
    """
    earth_radius_km = 6371.0

    source = df if data is None else data
    # Copy, so the new columns are not written into a slice of the source frame;
    # sort, because "consecutive" only makes sense in chronological order.
    df_region = source[source['region'] == region].copy()
    df_region = df_region.sort_values('datetime', kind='stable')

    # Great-circle (haversine) distance between each event and the previous one.
    # Signed coordinates are used on purpose: taking abs() would collapse two points
    # on either side of the equator or the prime meridian onto each other. The
    # formula is periodic in longitude, so crossing the +/-180 meridian is handled.
    lat = np.radians(df_region['Latitude'])
    lon = np.radians(df_region['Longitude'])
    haversine = (np.sin(lat.diff() / 2) ** 2
                 + np.cos(lat) * np.cos(lat.shift()) * np.sin(lon.diff() / 2) ** 2)
    # clip guards against rounding slightly above 1 before the arcsin
    surface_km = 2 * earth_radius_km * np.arcsin(np.sqrt(haversine.clip(upper=1.0)))

    df_region['distance'] = (surface_km ** 2 + df_region['Depth'].diff() ** 2) ** 0.5
    # time between two consecutive earthquakes, in days
    df_region['delta'] = df_region['datetime'].diff().dt.days

    # the first row has no previous event
    return df_region.iloc[1:]

# One frame per region, each carrying the 'distance' and 'delta' columns.
# The region strings must match the values written to df['region'] exactly
# (including their spelling); the resulting variables are used by later cells.
df_Philipine = calcul_distance_and_time_between_quake_per_region('Philipine sea plate')
df_Pacific_west = calcul_distance_and_time_between_quake_per_region('Pacific plate West')
df_Antartic = calcul_distance_and_time_between_quake_per_region('Antartic plate')
df_Indian = calcul_distance_and_time_between_quake_per_region('Indian plate')
df_Australian = calcul_distance_and_time_between_quake_per_region('Australian plate')
df_Pacific_north = calcul_distance_and_time_between_quake_per_region('Pacific plate North')
df_west_asia = calcul_distance_and_time_between_quake_per_region('Western Asia')
df_Ridge_Atlantic = calcul_distance_and_time_between_quake_per_region('Ridge Atlantic')
df_Nazca = calcul_distance_and_time_between_quake_per_region('Nazca plate')
df_Cosco = calcul_distance_and_time_between_quake_per_region('Cosco plate')
df_Europe = calcul_distance_and_time_between_quake_per_region('Europe')
df_Pacific_east = calcul_distance_and_time_between_quake_per_region('Pacific plate East')
df_Arabian = calcul_distance_and_time_between_quake_per_region('Arabian plate')
df_Carribbean = calcul_distance_and_time_between_quake_per_region('Carribbean plate')
df_Africain_rift = calcul_distance_and_time_between_quake_per_region('Africain rift')
df_Rigde_Artic = calcul_distance_and_time_between_quake_per_region('Rigde Artic')

In [None]:
# SeabornFig2grid: https://stackoverflow.com/questions/35042255/how-to-plot-multiple-seaborn-jointplot-in-subplot/47664533#47664533


class SeabornFig2Grid():
    """Move the axes of a seaborn grid object into one slot of another figure.

    Seaborn's FacetGrid, PairGrid and JointGrid each come with their own figure.
    This helper detaches their axes and re-attaches them to ``fig`` inside the
    area described by ``subplot_spec``, then closes the seaborn figure.

    Everything happens in the constructor; the instance only needs to be kept
    around for the resize callback it registers.
    """

    def __init__(self, seaborngrid, fig,  subplot_spec):
        """Store the arguments, move the axes and finalize.

        seaborngrid  -- the seaborn grid whose axes are moved
        fig          -- the destination matplotlib figure
        subplot_spec -- the slot of ``fig`` (e.g. ``gs[0]``) that receives the axes

        A grid that is neither a FacetGrid, a PairGrid nor a JointGrid is not
        moved, but ``_finalize`` still runs (and closes its figure).
        """
        self.fig = fig
        self.sg = seaborngrid
        self.subplot = subplot_spec
        if isinstance(self.sg, sns.axisgrid.FacetGrid) or \
            isinstance(self.sg, sns.axisgrid.PairGrid):
            self._movegrid()
        elif isinstance(self.sg, sns.axisgrid.JointGrid):
            self._movejointgrid()
        self._finalize()

    def _movegrid(self):
        """Move a PairGrid or FacetGrid: one sub-slot per axes, same n x m layout."""
        self._resize()
        n = self.sg.axes.shape[0]
        m = self.sg.axes.shape[1]
        self.subgrid = gridspec.GridSpecFromSubplotSpec(n,m, subplot_spec=self.subplot)
        for i in range(n):
            for j in range(m):
                self._moveaxes(self.sg.axes[i,j], self.subgrid[i,j])

    def _movejointgrid(self):
        """Move a JointGrid: joint axes plus the two marginal axes."""
        # r = rounded height ratio between the joint axes and the top marginal axes,
        # measured before the seaborn figure is resized.
        h= self.sg.ax_joint.get_position().height
        h2= self.sg.ax_marg_x.get_position().height
        r = int(np.round(h/h2))
        self._resize()
        # (r+1) x (r+1) sub-grid: the first row and the last column hold the marginals,
        # the remaining r x r cells hold the joint axes.
        self.subgrid = gridspec.GridSpecFromSubplotSpec(r+1,r+1, subplot_spec=self.subplot)

        self._moveaxes(self.sg.ax_joint, self.subgrid[1:, :-1])
        self._moveaxes(self.sg.ax_marg_x, self.subgrid[0, :-1])
        self._moveaxes(self.sg.ax_marg_y, self.subgrid[1:, -1])

    def _moveaxes(self, ax, gs):
        """Detach ``ax`` from its figure and attach it to ``self.fig`` at position ``gs``."""
        #https://stackoverflow.com/a/46906599/4124317
        ax.remove()
        ax.figure=self.fig
        # NOTE(review): this writes to ``fig.axes`` and to the private ``_subplotspec``
        # attribute; whether both are still needed/effective depends on the installed
        # matplotlib version -- confirm before upgrading.
        self.fig.axes.append(ax)
        self.fig.add_axes(ax)
        ax._subplotspec = gs
        ax.set_position(gs.get_position(self.fig))
        ax.set_subplotspec(gs)

    def _finalize(self):
        """Close the seaborn figure, register the resize callback and redraw ``self.fig``."""
        plt.close(self.sg.fig)
        self.fig.canvas.mpl_connect("resize_event", self._resize)
        self.fig.canvas.draw()

    def _resize(self, evt=None):
        """Give the seaborn figure the same size (in inches) as the destination figure.

        ``evt`` is the event object passed when called as a resize callback; it is not used.
        """
        self.sg.fig.set_size_inches(self.fig.get_size_inches())

In [None]:
def create_graph_Time_vs_distance(df,title):
    """Show two joint plots side by side for one region.

    Left: magnitude against the time since the previous earthquake.
    Right: distance to the previous earthquake against that same time.

    ``df`` must provide the columns ``delta``, ``Magnitude`` and ``distance``;
    ``title`` is written above the figure, followed by a colon.
    """
    time_label = 'Time between 2 consecutive earthquake'

    def _joint_panel(y_column, y_label, kde_fill_colour, scatter_options):
        # Scatter, filled KDE, KDE contour lines, then marginal histograms with KDE.
        grid = sns.JointGrid(data=df, x="delta", y=y_column, ratio= 3, marginal_ticks=True)
        grid.plot_joint(sns.scatterplot, s=50, alpha=.7, **scatter_options)
        grid.plot_joint(sns.kdeplot, color=kde_fill_colour, fill=True, alpha=.8)
        grid.plot_joint(sns.kdeplot, color="k", fill=False)
        grid.plot_marginals(sns.histplot, kde=True)
        grid.set_axis_labels(time_label, y_label, fontsize=16)
        return grid

    magnitude_grid = _joint_panel("Magnitude", 'Magnitude', "r", {})
    distance_grid = _joint_panel("distance", 'Distance', "g", {'color': 'k'})

    # Re-home both seaborn grids into a single 1 x 2 figure
    figure = plt.figure(figsize=(15,5))
    layout = gridspec.GridSpec(1, 2)

    # Keep both wrappers referenced until the figure has been shown:
    # they own the resize callbacks registered on the figure.
    left_panel = SeabornFig2Grid(magnitude_grid, figure, layout[0])
    right_panel = SeabornFig2Grid(distance_grid, figure, layout[1])

    layout.tight_layout(figure)

    figure.suptitle('{}:'.format(title),fontsize=18,y=1.1)

    plt.show()

In [None]:
# One pair of joint plots per region: (frame, title shown above the figure)
region_frames = [
    (df_Philipine, 'Philipine sea plate'),
    (df_Pacific_west, 'Pacific plate West'),
    (df_Antartic, 'Antartic plate'),
    (df_Indian, 'Indian plate'),
    (df_Australian, 'Australian plate'),
    (df_Pacific_north, 'Pacific plate North'),
    (df_west_asia, 'Western Asia'),
    (df_Ridge_Atlantic, 'Ridge Atlantic'),
    (df_Nazca, 'Nazca plate'),
    (df_Cosco, 'Cosco plate'),
    (df_Europe, 'Europe'),
    (df_Pacific_east, 'Pacific plate East'),
    (df_Arabian, 'Arabian plate'),
    (df_Carribbean, 'Carribbean plate'),
    (df_Africain_rift, 'Africain rift'),
    (df_Rigde_Artic, 'Rigde Artic'),
]
for region_frame, region_title in region_frames:
    create_graph_Time_vs_distance(region_frame, region_title)

The graphs on the left show that :
- In general, an earthquake is followed by another earthquake within the following 1 or 2 months, and this is especially true if the first earthquake is of large magnitude.

The graphs on the right show :
- In general, the second earthquake occurs shortly after, and close to, the first one. However, the second earthquake may also happen anywhere along the plate boundaries, and so be located far from the first earthquake; when the time between the two earthquakes increases, the second earthquake tends to be closer to the previous one (see Indian plate, Nazca plate, Cosco plate, Europe).


## <a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>5: Machine learning: the Nazca plate</center></h3>

## 5.1: data preparation
We will try to predict the seismic energy released every month along the Nazca boundary plate and the monthly number of seismic events. 

The seismic energy released during each seismic event is calculated from the moment magnitude (Mw), so we can drop the other type of magnitude.

### 5.1.1: feature engineering


In [None]:
# Drop every event reported on a scale other than moment magnitude.
# Besides MS and MB, the catalogue also uses ML and MD (both are handled in the
# magnitude-conversion cell earlier on), so they are excluded here as well.
# NOTE(review): any other non-Mw code present in 'Magnitude Type' should be added to this list.
non_moment_types = ['MS', 'MB', 'ML', 'MD']
# .copy() makes the result an independent frame, so the columns added in the
# next cell are not written into a slice of df_Nazca.
df_Nazca_Mw = df_Nazca.loc[~df_Nazca['Magnitude Type'].isin(non_moment_types)].copy()

print('Number of seismic events dropped: {}'.format((df_Nazca.shape[0]-df_Nazca_Mw.shape[0])))

In [None]:
# Seismic moment of each event, used here as the energy released: 10**(1.5*M + 16.1)
# 'count' is 1 for every event, so that summing it per month gives the number of events.
# assign() returns a new frame instead of writing into df_Nazca_Mw through
# .loc[:, col] = ..., which raised SettingWithCopyWarning when the frame was a slice.
df_Nazca_Mw = df_Nazca_Mw.assign(
    Energy=lambda events: 10**((3/2)*events['Magnitude']+16.1),
    **{'count': 1},
)

df_Nazca_Mw = df_Nazca_Mw.set_index('datetime')

### 5.1.2: create a time Series with a constant time intervals (here month).

In [None]:
# Keep the two quantities of interest and sum them per calendar month.
df_Nazca_Mw = df_Nazca_Mw[['Energy','count']]
df_Nazca_Mw = df_Nazca_Mw.resample('M').sum()
# Natural log of the monthly energy; months whose summed energy is 0 give -inf here.
df_Nazca_Mw['log_Energy'] = np.log(df_Nazca_Mw['Energy'])

In [None]:
from numpy import inf
# Months with zero energy have log_Energy == -inf: set those to 0.
# A single .loc assignment is used instead of df[col][mask] = 0: chained indexing
# may write to a temporary copy and leave the frame unchanged.
df_Nazca_Mw.loc[df_Nazca_Mw['log_Energy'] == -inf, 'log_Energy'] = 0

In [None]:
# Two stacked panels sharing the time axis: log-energy on top, event count below
fig, axes = plt.subplots(2,1, figsize=(12, 10), sharex=True)
ax1, ax2 = axes.flatten()

fig.suptitle('Monthly evolutions of seismic energy released  and number of earthquake along the Nazca plate',size=18)

# (axes, column, y label, colour shared by the line, the label and the y ticks)
monthly_panels = [
    (ax1, "log_Energy", 'Monthly amount of seismic energy released', 'darkblue'),
    (ax2, 'count', 'Number of earthquake per month', 'green'),
]
for panel, column_name, y_label, colour in monthly_panels:
    panel.plot(df_Nazca_Mw.index, df_Nazca_Mw[column_name], color=colour)
    panel.set_ylabel(y_label, color=colour, fontsize=15)
    panel.tick_params(axis='both', which='major', labelsize=14)
    panel.tick_params('y', colors=colour)

plt.show()


We can see that there are no data from ~1972 to ~1984, so I won't use the data before that gap (the next cell keeps the months from 1982-08-01 onwards).

In [None]:
# Keep the months from 1982-08-01 onwards
recent_months = df_Nazca_Mw.index >= '1982-08-01'
df_Nazca_Mw = df_Nazca_Mw[recent_months]

# Same two stacked panels as before, on the shortened series
fig, axes = plt.subplots(2,1, figsize=(12, 10), sharex=True)
ax1, ax2 = axes.flatten()

fig.suptitle('Monthly evolutions of seismic energy released  and number of earthquake along the Nazca plate',size=18)

# (axes, column, y label, colour shared by the line, the label and the y ticks)
series_panels = (
    (ax1, "log_Energy", 'Monthly amount of seismic energy released', 'darkblue'),
    (ax2, 'count', 'Number of earthquake per month', 'green'),
)
for axis, series_name, axis_label, series_colour in series_panels:
    axis.plot(df_Nazca_Mw.index, df_Nazca_Mw[series_name], color=series_colour)
    axis.set_ylabel(axis_label, color=series_colour, fontsize=15)
    axis.tick_params(axis='both', which='major', labelsize=14)
    axis.tick_params('y', colors=series_colour)

plt.show()


## 5.2: Check the  Stationarity of a Time Series
We can assume the series to be stationary if it has constant statistical properties over time, ie. the following:
- constant mean
- constant variance
- an autocovariance that does not depend on time.

### 5.2.1: Check the  Stationarity by Plotting Rolling Statistics: 
We can plot the moving average and moving variance to see if they varies with time. 

In [None]:
# 12-month rolling windows over both monthly series
energy_window = df_Nazca_Mw['log_Energy'].rolling(window=12)
count_window = df_Nazca_Mw['count'].rolling(window=12)

# Rolling mean and standard deviation of the monthly log-energy
rolmean_energy = energy_window.mean()
rolstd_energy = energy_window.std()

# Rolling mean and standard deviation of the monthly number of earthquakes
rolmean_number = count_window.mean()
rolstd_number = count_window.std()

In [None]:
# Plot rolling statistics: each series with its 12-month rolling mean and std
fig, axes = plt.subplots(1,2, figsize=(15,6))
ax1, ax2 = axes.flatten()

plt.suptitle('Rolling Mean & Standard Deviation',size=18)

# Left panel: monthly log-energy (blue theme)
ax1.plot(df_Nazca_Mw['log_Energy'],'b-',label = 'TS: energy')
ax1.plot(rolmean_energy,'r-',label = 'Rolling Mean')
ax1.plot(rolstd_energy,'k-',label = 'Rolling std')
ax1.set_ylabel('monthly seismic energy released', color='darkblue',fontsize=15)
ax1.tick_params(axis='both', which='major', labelsize=14)
ax1.tick_params('y', colors='darkblue')
ax1.legend(loc='best')

# Right panel: monthly number of earthquakes (green theme)
ax2.plot(df_Nazca_Mw['count'],'g-',label = 'TS: number quake')
ax2.plot(rolmean_number,'r-',label = 'Rolling Mean')
ax2.plot(rolstd_number,'k-',label = 'Rolling std')
ax2.set_ylabel('monthly number earthquake', color='green',fontsize=15)
ax2.tick_params(axis='both', which='major',color='green', labelsize=14)
# The y ticks follow the green label of this panel (they were 'darkblue',
# copied over from the left panel).
ax2.tick_params('y', colors='green')
ax2.legend(loc='best')

plt.show()

In [None]:
# Quick stationarity check: compare mean and variance of the first and second half
X = df_Nazca_Mw['log_Energy']
split = len(X) // 2
X1 = X.iloc[:split]
X2 = X.iloc[split:]

mean1 = X1.mean()
mean2 = X2.mean()
var1 = X1.var()
var2 = X2.var()

print('mean1={:f}, mean2={:f}'.format(mean1, mean2))
print('variance1={:f}, variance2={:f}'.format(var1, var2))

We can observe that **the means** and **the standard deviations** seem to be more or less **constant**, so these two time series appear to be **stationary**.

However, to be sure we can perform The Dickey-Fuller test.

### 5.2.2: Check the  Stationarity with the Dickey-Fuller test:
Here the null hypothesis is that the time series is non-stationary. The test results consist of a test statistic and critical values for different confidence levels. If the test statistic is less than the critical value, we can reject the null hypothesis and say that the series is stationary.

In [None]:
# Augmented Dickey-Fuller test on the monthly log-energy series
from statsmodels.tsa.stattools import adfuller

print('result of The Dickey-Fuller test:')
dftest = adfuller(df_Nazca_Mw['log_Energy'],autolag = 'AIC')

# First four results, followed by one entry per critical value
summary_labels = ['test Statistic','p-value','#Lags Used','Number of Observations Used']
adf_summary = dict(zip(summary_labels, dftest[0:4]))
adf_summary.update({'Critical Value (%s)'%level: critical
                    for level, critical in dftest[4].items()})
dfoutput = pd.Series(adf_summary)

print(dfoutput)

Here:
- the p-value is less than the significance level (0.05), so we can reject the null hypothesis and infer that the time series is indeed stationary.

- Moreover, the test statistic (~-19) is less than the Critical Values (from -2 to -3) so indeed the series is stationary.

In [None]:
# Perform the Dickey-Fuller test on the monthly number of earthquakes

print('result of The Dickey-Fuller test:')
dftest = adfuller(df_Nazca_Mw['count'], autolag='AIC')

# The first four entries of the result are labelled below; the fifth entry is a
# dict of critical values keyed by significance level.
summary_labels = ['test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
critical_values = pd.Series(
    {'Critical Value (%s)' % level: threshold for level, threshold in dftest[4].items()}
)
dfoutput = pd.concat([pd.Series(dftest[0:4], index=summary_labels), critical_values])

print(dfoutput)

Here:
- the p-value is less than the significance level (0.05), so we can reject the null hypothesis and infer that the time series is indeed stationary.

- Moreover, the test statistic (~-15) is less than the Critical Values (from -2 to -3) so indeed the series is stationary.

## 5.3: Split the data

In [None]:
# Split data chronologically: everything up to the split date is used for
# training, everything after it (up to the end date) for testing.
train_start, split_date, test_end = '1982-08-01', '2014-11-01', '2016-12-01'
y_train = df_Nazca_Mw.loc[train_start:split_date]
y_test = df_Nazca_Mw.loc[split_date:test_end]

# Label-based slices include both end points, so the observation at the split
# date would otherwise appear in BOTH sets. Keep in the test set only the rows
# strictly after the last training observation.
if len(y_train) > 0:
    y_test = y_test[y_test.index > y_train.index.max()]

# percentage training / testing
n_total = len(y_train) + len(y_test)
Percentage_training = (len(y_train)*100)/n_total
Percentage_testing = (len(y_test)*100)/n_total
print('The data is splited in {:.2f}% training and {:.2f}% testing.'.format(Percentage_training,Percentage_testing))

In [None]:
# Show the training and testing portions of the log-energy series on one axis
fig, ax = plt.subplots(figsize=(10, 5))

ax.set_title('Time Series Split', size=16)

ax.plot(y_train['log_Energy'], 'b-', label='training')
ax.plot(y_test['log_Energy'], 'r-', label='testing')

ax.set_ylabel('monthly seismic energy released (log)', fontsize=15)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.tick_params('y', colors='darkblue')
ax.legend(loc='best')

plt.show()

## 5.4: Forecasting a Time Series with ARIMA
ARIMA stands for Auto-Regressive Integrated Moving Averages. The ARIMA forecasting for a stationary time series is nothing but a linear (like a linear regression) equation. The predictors depend on the parameters (p,d,q) of the ARIMA model:

- 1: **Number of AR (Auto-Regressive) terms (p)**: AR terms are just lags of dependent variable. For instance if p is 5, the predictors for x(t) will be x(t-1)….x(t-5).

- 2: **Number of MA (Moving Average) terms (q)**: MA terms are lagged forecast errors in prediction equation. For instance if q is 5, the predictors for x(t) will be e(t-1)….e(t-5) where e(i) is the difference between the moving average at ith instant and actual value.

- 3: **Number of Differences (d)**: This is the number of nonseasonal differences applied to the series. One can either pass an already-differenced series and put d=0, or pass the original series and put d=1; both will generate the same results. Here, both series are already stationary, so no differencing is needed and we use d=0.

To determine the value of ‘p’ and ‘q’, we use two plots ACF and PACF:

- **Autocorrelation Function (ACF)**: It is a measure of the correlation between the TS with a lagged version of itself. For instance at lag 5, ACF would compare series at time instant ‘t1’…’t2’ with series at instant ‘t1-5’…’t2-5’ (t1-5 and t2 being end points).
- **Partial Autocorrelation Function (PACF)**: This measures the correlation between the TS with a lagged version of itself but after eliminating the variations already explained by the intervening comparisons. Eg at lag 5, it will check the correlation but remove the effects already explained by lags 1 to 4.

In [None]:
# Dickey-Fuller test on the test-set log-energy series; only the p-value
# (second entry of the result) is reported.
adf = adfuller(y_test['log_Energy'])
# The message previously referred to "microsoft", a leftover from another notebook.
print("p-value of the Dickey-Fuller test on the testing log_Energy series: {}".format(float(adf[1])))

In [None]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
import statsmodels.api as sm

# ACF (top) and PACF (bottom) of the training log-energy series, up to lag 40
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
energy_train = y_train['log_Energy']
plot_acf(energy_train, lags=40, ax=ax1)
# trailing ';' keeps the returned Figure from being rendered a second time
plot_pacf(energy_train, lags=40, ax=ax2);

This time series has a constant mean, a constant variance and zero auto-correlation at all lags: it is white noise, so it cannot be predicted.

In [None]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
import statsmodels.api as sm

# ACF (top) and PACF (bottom) of the training earthquake-count series, up to lag 20
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
for draw_correlogram, axis in ((plot_acf, ax1), (plot_pacf, ax2)):
    draw_correlogram(y_train['count'], lags=20, ax=axis, color='green')

In this plot, the two dotted lines on either side of 0 are the confidence intervals. These can be used to determine the ‘p’ and ‘q’ values as:

p – The lag value where the PACF chart crosses the upper confidence interval for the first time. Here, p=1.

q – The lag value where the ACF chart crosses the upper confidence interval for the first time. here, q=1

### 5.4: ARIMA model for the number of earthquakes

In [None]:
# Fit an ARIMA(p=1, d=0, q=1) model on the training earthquake counts
model = ARIMA(y_train['count'], order=(1,0,1))
results_ARIMA = model.fit()

print(results_ARIMA.summary())

# Overlay the in-sample fitted values on the observed series; the title reports
# the residual sum of squares between the two.
observed_count = y_train['count']
fitted_count = results_ARIMA.fittedvalues
rss = sum((fitted_count - observed_count)**2)

fig = plt.figure(figsize=(12,8))
plt.plot(observed_count)
plt.plot(fitted_count, color='green')
plt.title('RSS: %.4f' % rss)
print('Plotting ARIMA model')

In [None]:
# Plot residual errors
residuals = pd.DataFrame(results_ARIMA.resid)
fig, ax = plt.subplots(1,2,figsize=(15,6))
residuals.plot(title="Residuals on count", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()

In [None]:
# Forecast the monthly number of earthquakes after the training period and
# compare it with the held-out testing data.
# NOTE(review): the 122-step horizon is kept from the original cell; confirm it
# matches the intended forecast window.
fcast = results_ARIMA.get_forecast(steps=122).summary_frame()
plt.figure(figsize=(10,5))

plt.title('Forecast',size=16)

plt.plot(y_train['count'],'b-',label = 'training')
plt.plot(y_test['count'],'r-',label = 'testing')
plt.plot(fcast['mean'],'g',label = 'prediction')
# shaded band: confidence interval of the forecast mean
plt.fill_between(fcast.index, fcast['mean_ci_lower'], fcast['mean_ci_upper'], color='k', alpha=0.1)
# the plotted quantity is the earthquake count, not the seismic energy
plt.ylabel('monthly number earthquake', fontsize=15)
plt.tick_params(axis='both', which='major', labelsize=14)
plt.tick_params('y', colors='darkblue')
plt.legend(loc='best')
plt.show()



The data doesn't have strong seasonality, so the model finds it difficult to predict the future: it simply takes the average of the previous values and uses it as the prediction. Therefore, we get a straight line.