# Oklahoma Earthquakes and Saltwater Injection Wells
## Earthquakes in Oklahoma region and Oil and Gas fluid byproduct data.

#### in progress...

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime as dt

# map creation
import cartopy.crs as ccrs
import cartopy
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER

# data visualization 
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns

# stat on data
from scipy import stats
from scipy.stats import norm, skew

# feature reduction
from sklearn.decomposition import PCA

#--- data clustering
from sklearn import cluster

#---- Machine learning
# data preparation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# hyperparameter tuning
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

# Import necessary modules for neural network
import keras
from keras.layers import Dense, BatchNormalization
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
# Model evaluation
import math
from sklearn import metrics
from statsmodels.graphics.api import abline_plot

## <a id="1"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>1: SEISMIC DATA</center></h3>


## 1.1: Load the seismic data

In [None]:
# Read the earthquake catalogue (CSV) into a DataFrame
catalogue_path = r'../input/oklahoma-earthquakes-and-saltwater-injection-wells/okQuakes.csv'
catalogue = pd.read_csv(catalogue_path)
# Show the first rows to check that the file was parsed as expected
catalogue.head(3)

## 1.2: Plot earthquakes locations

In [None]:
# Map of every event in the catalogue, coloured and sized by magnitude
plt.figure(figsize=(10, 10))

# Plate-carree axes; the extent is [lon_min, lon_max, lat_min, lat_max]
ax1 = plt.axes(projection=ccrs.PlateCarree())
ax1.set_extent([-105, -93, 29, 41], crs=ccrs.PlateCarree())

# Background layers (ocean at 10m resolution, then land, state borders, rivers)
ax1.add_feature(cfeature.OCEAN.with_scale('10m'))
for layer in (cfeature.LAND, cfeature.STATES, cfeature.RIVERS):
    ax1.add_feature(layer)
ax1.coastlines()

# Dashed grid with labels on the bottom and left sides only
label_style = {'size': 13, 'color': 'gray', 'weight': 'bold'}
gl = ax1.gridlines(
    crs=ccrs.PlateCarree(),
    draw_labels=True,
    linewidth=1,
    color='darkgray',
    alpha=0.5,
    linestyle='--',
)
gl.top_labels = False
gl.right_labels = False
gl.xlocator = mticker.FixedLocator([-102, -99, -96])
gl.ylocator = mticker.FixedLocator([33, 36, 39])
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
gl.xlabel_style = dict(label_style)
gl.ylabel_style = dict(label_style)

# Event locations; drawn on the current axes, i.e. the map created above
ax1 = sns.scatterplot(
    x=catalogue['longitude'],
    y=catalogue['latitude'],
    hue=catalogue['mag'],
    size=catalogue['mag'],
)

# set title
ax1.set_title('seismic events in catalogue', size=15)

plt.show()

In [None]:
# Keep only the events inside the window around Oklahoma:
# longitude in [-102, -94.5] and latitude in [33, 39], bounds included
in_lon_range = catalogue['longitude'].between(-102, -94.5)
in_lat_range = catalogue['latitude'].between(33, 39)
catalogue_ok = catalogue[in_lon_range & in_lat_range]
catalogue_ok.head(2)

In [None]:
# Map of the events retained in catalogue_ok, coloured and sized by magnitude
plt.figure(figsize=(10, 10))

# Plate-carree axes; the extent is [lon_min, lon_max, lat_min, lat_max]
ax1 = plt.axes(projection=ccrs.PlateCarree())
ax1.set_extent([-105, -93, 29, 41], crs=ccrs.PlateCarree())

# Background layers (ocean at 10m resolution, then land, state borders, rivers)
ax1.add_feature(cfeature.OCEAN.with_scale('10m'))
for layer in (cfeature.LAND, cfeature.STATES, cfeature.RIVERS):
    ax1.add_feature(layer)
ax1.coastlines()

# Dashed grid with labels on the bottom and left sides only
label_style = {'size': 13, 'color': 'gray', 'weight': 'bold'}
gl = ax1.gridlines(
    crs=ccrs.PlateCarree(),
    draw_labels=True,
    linewidth=1,
    color='darkgray',
    alpha=0.5,
    linestyle='--',
)
gl.top_labels = False
gl.right_labels = False
gl.xlocator = mticker.FixedLocator([-102, -99, -96])
gl.ylocator = mticker.FixedLocator([33, 36, 39])
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
gl.xlabel_style = dict(label_style)
gl.ylabel_style = dict(label_style)

# Selected earthquakes; drawn on the current axes, i.e. the map created above
ax1 = sns.scatterplot(
    x=catalogue_ok['longitude'],
    y=catalogue_ok['latitude'],
    hue=catalogue_ok['mag'],
    size=catalogue_ok['mag'],
)

# set title
ax1.set_title('Selected seismic events', size=15)

plt.show()

### 1.3: Clean the dataframe

#### 1.3.1: set date as index

In [None]:
# Parse the 'time' column into timestamps and use it as the index.
# catalogue_ok is a filtered selection of the full catalogue, so assigning a
# column on it in place triggers pandas' SettingWithCopyWarning; `assign`
# builds a new frame instead and leaves the source catalogue untouched.
catalogue_ok = (
    catalogue_ok
    .assign(time=pd.to_datetime(catalogue_ok['time']))
    .set_index('time')
)
catalogue_ok.tail(2)

#### 1.3.2: Verify and replace missing values

In [None]:
# function to visualize missing values
def get_percentage_nan_values(data, thresh=20, color='black', edgecolor='black', width=15, height=3):
    """
    Plot the percentage of missing values of each column as a bar chart.

    Columns are sorted from the most to the least incomplete and a red
    horizontal line is drawn at ``thresh`` percent.
    SOURCE: https://www.kaggle.com/amiiiney/price-prediction-regularization-stacking

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are inspected.
    thresh : float, default 20
        Height (in percent) of the reference line; also quoted in the labels.
    color, edgecolor :
        Passed to the bar plot as ``color`` and ``edgecolor``.
    width, height : float
        Figure size passed to ``plt.figure(figsize=(width, height))``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(width, height))

    # Share of missing values per column, in percent
    percentage = data.isnull().mean() * 100
    percentage.sort_values(ascending=False).plot.bar(color=color, edgecolor=edgecolor)
    plt.axhline(y=thresh, color='r', linestyle='-')
    plt.title('Missing values percentage per column', fontsize=20, weight='bold')

    # One bar per column: place both labels a little right of the middle bar
    text_x = len(percentage) / 1.7
    plt.text(text_x, thresh + 12.5, f'Columns with more than {thresh}% missing values',
             fontsize=12, color='crimson', ha='left', va='top')
    plt.text(text_x, thresh - 5, f'Columns with less than {thresh}% missing values',
             fontsize=12, color='green', ha='left', va='top')

    plt.xlabel('Columns', size=15, weight='bold')
    plt.ylabel('Missing values percentage')
    plt.yticks(weight='bold')

    plt.show()

In [None]:
# Bar chart of the missing-value percentage per column, reference line at 20 %
reds = sns.color_palette('Reds', 15)
get_percentage_nan_values(catalogue_ok, thresh=20, color=reds)

In [None]:
# Keep only the columns holding at least 80 % of non-missing values,
# i.e. drop the columns with more than 20 % of missing values
min_non_null = len(catalogue_ok) * 0.8
catalogue_ok = catalogue_ok.dropna(axis=1, thresh=min_non_null)

In [None]:
# Number of missing values left in each remaining column
catalogue_ok.isnull().sum()

In [None]:
# Replace missing values by the mean of their column.
# numeric_only=True restricts the means to the numeric columns: since
# pandas 2.0, DataFrame.mean() raises a TypeError when the frame also holds
# text columns (e.g. 'magType'). Text columns are left untouched here.
catalogue_ok = catalogue_ok.fillna(catalogue_ok.mean(numeric_only=True))

In [None]:
# Count the events per 'magType' category to find the dominant one
sns.countplot(data=catalogue_ok, x='magType')
plt.show()

In [None]:
# Fill the missing 'magType' entries with the dominant category: 'ml'
dominant_mag_type = 'ml'
catalogue_ok['magType'] = catalogue_ok['magType'].fillna(dominant_mag_type)

## 1.4: Statistical analyses
### 1.4.1: Evolution over time

In [None]:
fig = plt.figure(figsize=(10, 5))

# Magnitude of each selected earthquake against time, coloured by depth
ax1 = sns.scatterplot(x=catalogue_ok.index, y=catalogue_ok['mag'], hue=catalogue_ok['depth'])

# Vertical marker at 2008-01-01. axvline's ymin/ymax are fractions of the axes
# height (0 to 1), not data values, so the defaults are used to span the
# full height of the plot.
ax1.axvline(dt(2008, 1, 1), color="black", linestyle="--")

# set title
plt.title('number of seismic event from 1975 to 2018', size=15)
plt.show()

We observed a strong increase in seismicity after ~2008.
We also observed one earthquake at a depth of 56 km

### 1.4.2: Plot the ECDF of the Earthquake magnitudes

In [None]:
def ecdf(data):
    """Return the ECDF coordinates of a one-dimensional set of measurements.

    The first returned array holds the sorted measurements; the second holds
    the cumulative fractions 1/n, 2/n, ..., 1 where n is ``len(data)``.
    """
    sorted_values = np.sort(data)
    n_points = len(data)
    cumulative_fraction = np.arange(1, n_points + 1) / n_points
    return sorted_values, cumulative_fraction

In [None]:
# Split the catalogue at the start of 2008 (cut-off date included in "after")
cutoff = '2008-01-01'
before_2008 = catalogue_ok.loc[catalogue_ok.index < cutoff]
after_2008 = catalogue_ok.loc[catalogue_ok.index >= cutoff]

In [None]:
# Magnitudes on each side of the 2008 cut-off
mags_before_2008 = before_2008['mag']
mags_after_2008 = after_2008['mag']

# Largest magnitude and number of events in each period
max_before_2008, nb_before_2008 = mags_before_2008.max(), len(mags_before_2008)
max_after_2008, nb_after_2008 = mags_after_2008.max(), len(mags_after_2008)

# Figure and title
fig = plt.figure(figsize=(7, 5))
fig.suptitle('Empirical Cumulative Distribution Function', fontsize=18)

# One ECDF per period, drawn as unconnected points
for mags, period in ((mags_before_2008, 'before 2008'), (mags_after_2008, 'after 2008')):
    plt.plot(*ecdf(mags), marker='.', linestyle='none', label=period)
plt.xlabel('magnitude')
plt.ylabel('ECDF')

# Summary figures written on the plot, positioned in data coordinates
annotations = [
    (2.6, 0.0, 'max mag after 2008: {}'.format(max_after_2008)),
    (2.6, 0.1, 'Nb. events after 2008: {}'.format(nb_after_2008)),
    (-0.05, 0.65, 'max mag before 2008: {}'.format(max_before_2008)),
    (-0.05, 0.75, 'Nb. events before 2008: {}'.format(nb_before_2008)),
]
for x_pos, y_pos, note in annotations:
    plt.text(x_pos, y_pos, note, fontsize=12)

plt.legend(fontsize=14)
plt.show()

Both curves seem to follow the Gutenberg-Richter Law, but with different completeness thresholds, probably due to improvements in sensing capabilities in more recent years.

### 1.4.3: computing b-values

#### Functions used to compute and compare b-values

In [None]:
# define the functions needed for the statistical analysis:
def bootstrap_replicate_1d(data, func):
    """Apply ``func`` to one bootstrap resample of 1D ``data``.

    The resample has the same length as ``data`` and is drawn with
    ``np.random.choice`` (sampling with replacement).
    """
    resampled = np.random.choice(data, size=len(data))
    return func(resampled)

def draw_bs_reps(data, func, size=1):
    """Return ``size`` bootstrap replicates of ``func`` applied to ``data``.

    Each replicate comes from one call to ``bootstrap_replicate_1d``; the
    replicates are returned as a float array of length ``size``.
    """
    replicates = (bootstrap_replicate_1d(data, func) for _ in range(size))
    return np.fromiter(replicates, dtype=float, count=size)

# define function to compute the b-value with its confidence interval
def b_value(mags, mt, perc=(2.5, 97.5), n_reps=None):
    """Compute the b-value and optionally its bootstrap confidence interval.

    The b-value is computed here as ``(mean(m) - mt) * ln(10)``, where ``m``
    are the magnitudes greater than or equal to ``mt``.
    NOTE(review): the maximum-likelihood b-value is often written as
    ``log10(e) / (mean(m) - mt)``, which is the reciprocal of this quantity;
    confirm which convention is intended.

    Parameters
    ----------
    mags : array-like
        Magnitudes (list, NumPy array or pandas Series).
    mt : float
        Completeness threshold; smaller magnitudes are discarded.
    perc : sequence of float, default (2.5, 97.5)
        Percentiles of the bootstrap replicates returned as the confidence
        interval. Only used when ``n_reps`` is given.
    n_reps : int or None, default None
        Number of bootstrap replicates. When None, no interval is computed.

    Returns
    -------
    b : float
        The b-value (NaN when no magnitude reaches ``mt``).
    conf_int : numpy.ndarray
        Only returned when ``n_reps`` is not None: the requested percentiles
        of the bootstrap b-values (all NaN when no magnitude reaches ``mt``).
    """
    # Accept lists as well as arrays/Series for the boolean selection below
    mags = np.asarray(mags)

    # Extract magnitudes above completeness threshold: m
    m = mags[mags >= mt]

    # Nothing above the threshold: the b-value is undefined, and resampling
    # an empty array would raise in np.random.choice
    if m.size == 0:
        if n_reps is None:
            return np.nan
        return np.nan, np.full(len(perc), np.nan)

    # Compute b-value: b
    b = (np.mean(m) - mt) * np.log(10)
    if n_reps is None:
        return b

    # Bootstrap replicates of the mean magnitude, converted to b-values
    m_bs_reps = draw_bs_reps(m, np.mean, n_reps)
    b_bs_reps = (m_bs_reps - mt) * np.log(10)

    # Confidence interval at the requested percentiles
    conf_int = np.percentile(b_bs_reps, perc)

    return b, conf_int
    
def draw_perm_reps(data_1, data_2, func, size=1):
    """Return ``size`` permutation replicates of a two-sample statistic.

    For each replicate the two data sets are pooled and reshuffled with
    ``permutation_sample`` and ``func`` is evaluated on the two resulting
    samples. The replicates are returned as a float array of length ``size``.
    """
    def _one_replicate():
        shuffled_1, shuffled_2 = permutation_sample(data_1, data_2)
        return func(shuffled_1, shuffled_2)

    return np.fromiter((_one_replicate() for _ in range(size)), dtype=float, count=size)

def diff_of_means(data_3, data_4):
    """Return mean(data_3) - mean(data_4)."""
    return np.mean(data_3) - np.mean(data_4)

def permutation_sample(data1, data2):
    """Pool two data sets, shuffle them, and split them back.

    Returns two arrays: the first has ``len(data1)`` elements, the second
    holds the remaining elements of the shuffled pool.
    """
    pooled = np.concatenate((data1, data2))
    shuffled = np.random.permutation(pooled)

    # Cut the shuffled pool at the size of the first data set
    first, second = np.split(shuffled, [len(data1)])
    return first, second

In [None]:
# Completeness threshold used for both periods
mt = 2.8

# b-value and bootstrap confidence interval before 2008
b_pre, conf_int_pre = b_value(mags_before_2008, mt, perc=[2.5, 97.5], n_reps=10000)

# b-value and bootstrap confidence interval after 2008
b_post, conf_int_post = b_value(mags_after_2008, mt, perc=[2.5, 97.5], n_reps=10000)

# Report the results
report_template = """
Before 2008:
b-value: {0:.2f}
95% conf int: [{1:.2f}, {2:.2f}]

After 2008
b-value: {3:.2f}
95% conf int: [{4:.2f}, {5:.2f}]
"""
print(report_template.format(b_pre, *conf_int_pre, b_post, *conf_int_post))


#### Is the b-value in Oklahoma before 2008 really different from the one after 2008?
To answer this question we will:
- include earthquakes that have magnitudes above the completeness threshold. A value of 3 is reasonable.
- perform a permutation test because asserting a null hypothesis that the b-values are the same implicitly assumes that the magnitudes are identically distributed, specifically Exponentially, by the Gutenberg-Richter Law.
- look at the difference between the mean post-2008 magnitude and the mean pre-2008 magnitude.

In [None]:
# Completeness threshold for this comparison, and number of permutations
mt = 3
n_perm = 10000

# step 1: select only magnitudes at or above the completeness threshold
# (the threshold variable is used instead of a separate hard-coded value)
mags_before_2008 = mags_before_2008[mags_before_2008 >= mt]
mags_after_2008 = mags_after_2008[mags_after_2008 >= mt]

# step 2: observed difference in mean magnitudes (after - before): diff_obs
# The argument order matches the one used for the replicates below, so the
# observed statistic and the replicates measure the same quantity.
diff_obs = diff_of_means(mags_after_2008, mags_before_2008)

# step 3: generate permutation replicates of the same statistic: perm_reps
perm_reps = draw_perm_reps(mags_after_2008, mags_before_2008, diff_of_means, size=n_perm)

# One-sided p-value: fraction of replicates below the observed difference
p_val = np.sum(perm_reps < diff_obs) / n_perm
print('p =', p_val)

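The p-value (the fraction of permutation replicates that fall below the observed difference in mean magnitude) is about 1.0: the observed difference lies in the extreme upper tail of the permutation distribution, which suggests we should reject the null hypothesis that the b-value has not changed.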

## 1.5: Earthquakes clustering
### 1.5.1: cluster creation 

In [None]:
# Features used for clustering: the coordinates of each event
cluster_features = ['latitude', 'longitude']
df_for_cluster = catalogue_ok[cluster_features]

In [None]:
# Elbow plot: K-means sum of squared errors (inertia) for several cluster counts
numClusters = [10, 18, 19, 20, 21, 22, 23, 24, 25, 30, 40, 50, 80]
SSE = []
for k in numClusters:
    # Fixed seed (same value as the final model) so the curve is reproducible
    k_means = cluster.KMeans(n_clusters=k, random_state=1)
    k_means.fit(df_for_cluster)
    SSE.append(k_means.inertia_)

plt.plot(numClusters, SSE, '-o')
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')
plt.grid()
plt.show()


In [None]:
# create 20 clusters and assign the cluster id to catalogue_ok
n_clusters = 20
k_means_meq = cluster.KMeans(n_clusters=n_clusters, random_state=1)
k_means_meq.fit(df_for_cluster)
labels = k_means_meq.labels_
centroids = k_means_meq.cluster_centers_
df_label = pd.DataFrame(labels, index=df_for_cluster.index, columns=['Cluster ID'])
# Centroid coordinates come in the column order of the fitted features
df_centroid = pd.DataFrame(centroids, columns=df_for_cluster.columns)

# Labels are in the row order of df_for_cluster, which is a column selection
# of catalogue_ok, so they can be assigned directly. Unlike a concat on the
# time index, this overwrites the column instead of appending a duplicate
# 'Cluster ID' when the cell is run again.
catalogue_ok['Cluster ID'] = labels

In [None]:
# Map of the selected events coloured by cluster, with the cluster centroids
plt.figure(figsize=(8, 8))

# Plate-carree axes; the extent is [lon_min, lon_max, lat_min, lat_max]
ax = plt.axes(projection=ccrs.PlateCarree())
ax.set_extent([-105, -93, 29, 41], crs=ccrs.PlateCarree())

# Background layers (ocean at 10m resolution, then land, state borders, rivers)
ax.add_feature(cfeature.OCEAN.with_scale('10m'))
for layer in (cfeature.LAND, cfeature.STATES, cfeature.RIVERS):
    ax.add_feature(layer)
ax.coastlines()

# Dashed grid with labels on the bottom and left sides only
label_style = {'size': 13, 'color': 'gray', 'weight': 'bold'}
grid = ax.gridlines(
    crs=ccrs.PlateCarree(),
    draw_labels=True,
    linewidth=1,
    color='darkgray',
    alpha=0.5,
    linestyle='--',
)
grid.top_labels = False
grid.right_labels = False
grid.xlocator = mticker.FixedLocator([-102, -99, -96])
grid.ylocator = mticker.FixedLocator([33, 36, 39])
grid.xformatter = LONGITUDE_FORMATTER
grid.yformatter = LATITUDE_FORMATTER
grid.xlabel_style = dict(label_style)
grid.ylabel_style = dict(label_style)

# One colour per cluster
qualitative_colors = sns.color_palette("Set2", 20)

# Selected earthquakes, coloured by cluster id
sns.scatterplot(
    x=catalogue_ok['longitude'],
    y=catalogue_ok['latitude'],
    hue=catalogue_ok['Cluster ID'],
    palette=qualitative_colors,
    legend=False,
    ax=ax,
)
# Cluster centroids, drawn as triangles in the colour of their cluster
ax = sns.scatterplot(
    x=df_centroid['longitude'],
    y=df_centroid['latitude'],
    hue=df_centroid.index,
    palette=qualitative_colors,
    s=100,
    marker='^',
    edgecolor='k',
    ax=ax,
)

# set title
ax.set_title('cluster seismic events', size=15)
plt.show()

### 1.5.2: cluster properties 
#### b-values

In [None]:
# calculate the b-value of each cluster, using only the events after 2008
mt = 2.8

# The date filter does not depend on the cluster: compute it once
is_after_2008 = catalogue_ok.index >= '2008-01-01'

list_b_value_cluster = []
# Iterate over the cluster ids of the fitted model rather than a literal count
for i in range(k_means_meq.n_clusters):
    df_c = catalogue_ok[is_after_2008 & (catalogue_ok['Cluster ID'] == i)]
    b = b_value(df_c['mag'], mt, perc=[2.5, 97.5])
    list_b_value_cluster.append((i, b))

b_value_cluster = pd.DataFrame(list_b_value_cluster, columns=['clusterid', 'b-value'])

### MEQ properties

In [None]:
# Median of every numeric attribute per cluster. numeric_only=True is needed
# because, since pandas 2.0, median() raises on text columns instead of
# silently skipping them.
cluster_attributes = catalogue_ok.groupby('Cluster ID').median(numeric_only=True)

# Attach the b-value of each cluster, matched on the cluster id itself
# rather than on the row position of b_value_cluster.
cluster_attributes['b-value'] = b_value_cluster.set_index('clusterid')['b-value']

In [None]:
# Pearson correlation between the cluster-level attributes
corr = cluster_attributes.corr(method='pearson')

# Annotated heatmap on a single square axes, colour scale fixed to [-1, 1]
fig, ax0 = plt.subplots(1, figsize=(8, 8))
sns.heatmap(
    corr,
    annot=True,
    linewidths=.5,
    annot_kws={"size": 10},
    vmin=-1.0,
    vmax=1.0,
    square=True,
    cbar=True,
    ax=ax0,
)
ax0.set_title('correlations between numerical variables', size=18, y=1.05)
ax0.set_yticklabels(ax0.get_yticklabels(), rotation=0, size=14)
ax0.set_xticklabels(ax0.get_xticklabels(), rotation=90, size=14)
plt.show()

Interesting correlations between:
- latitude and mag/rms,
- depth and longitude,
- depth and b-value

- horizontal and depth errors with mag and rms (may highlight the influence of the seismic network (location of the seismic stations) on the seismic catalogue?)

## <a id="2"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>2: INJECTION DATA</center></h3>

In [None]:
# step 1: read the list of saltwater injection wells (CSV) into a DataFrame
inj_well_path = r'../input/oklahoma-earthquakes-and-saltwater-injection-wells/InjectionWells.csv'
df_inj_well = pd.read_csv(inj_well_path)
# Show the first rows to check that the file was parsed as expected
df_inj_well.head(3)

In [None]:
# Summary statistics of the numeric columns (used to spot out-of-range values)
df_inj_well.describe()

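The minimum latitude is 0, which cannot be a valid location for a well in Oklahoma, so the wells located outside the map window are filtered out in the next cell.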

In [None]:
# Keep only the wells strictly inside the map window:
# -105 < LONG < -93 and 29 < LAT < 41
lon_ok = (df_inj_well['LONG'] > -105) & (df_inj_well['LONG'] < -93)
lat_ok = (df_inj_well['LAT'] > 29) & (df_inj_well['LAT'] < 41)
df_inj_well = df_inj_well[lon_ok & lat_ok]

In [None]:
# Bar chart of the missing-value percentage per column, reference line at 20 %
reds = sns.color_palette('Reds', 15)
get_percentage_nan_values(df_inj_well, thresh=20, color=reds)

In [None]:
# drop columns with more than 20% of missing values.
# The threshold (minimum number of non-missing values) must be based on the
# number of wells in this table, not on the number of earthquakes.
df_inj_well = df_inj_well.dropna(thresh=len(df_inj_well) * 0.8, axis=1)

In [None]:
# Columns that are not used in the rest of the analysis
columns_to_drop = ['Operator ID', 'OrderNumbers', 'ZONE', 'QQQQ', 'County', 'Sec', 'Twp', 'Rng', 'WellName', 'WellNumber']
# errors='ignore': a listed column may already have been removed by the
# missing-value filter, in which case there is nothing left to drop
df_inj_well = df_inj_well.drop(columns=columns_to_drop, errors='ignore')
df_inj_well.tail()

In [None]:
# drop last row (copied, so that the column assignment below works on an
# independent frame and not on a slice of the previous one)
df_inj_well = df_inj_well.iloc[:-1].copy()
# store the API number as a string: select the 'API#' column itself
# (`.loc[:'API#']` would be a slice over the row labels, not a column)
df_inj_well['API#'] = df_inj_well['API#'].astype(str)
# check nan values
print("number of wells: {}".format(df_inj_well.shape[0]))
df_inj_well.isnull().sum()

In [None]:
# Preview the first two rows of the cleaned table
df_inj_well.head(2)

In [None]:
# Data type of each column (used to spot numeric columns stored as text)
df_inj_well.dtypes

In [None]:
# Convert the 'PSI' and 'BBLS' columns to numbers; values that cannot be
# parsed become NaN (errors='coerce')
for numeric_col in ('PSI', 'BBLS'):
    df_inj_well[numeric_col] = pd.to_numeric(df_inj_well[numeric_col], errors='coerce')
# Discard every row that still holds a missing value
df_inj_well = df_inj_well.dropna()

### EDA

In [None]:
# Map of the injection wells, coloured and sized by their 'PSI' value
plt.figure(figsize=(8, 8))

# Plate-carree axes; the extent is [lon_min, lon_max, lat_min, lat_max]
ax = plt.axes(projection=ccrs.PlateCarree())
ax.set_extent([-105, -93, 29, 41], crs=ccrs.PlateCarree())

# add color
ax.add_feature(cfeature.OCEAN.with_scale('10m'))
ax.add_feature(cfeature.LAND)
ax.add_feature(cfeature.STATES)
ax.add_feature(cfeature.RIVERS)
ax.coastlines()

# add grid. All the settings below are applied to the gridliner of THIS
# figure; it is kept in its own variable so the formatter and label styles
# are not set on a gridliner left over from an earlier figure.
gl = ax.gridlines(crs=ccrs.PlateCarree(), draw_labels=True, linewidth=1, color='darkgray', alpha=0.5, linestyle='--')
gl.top_labels = False
gl.right_labels = False
gl.xlocator = mticker.FixedLocator([-102, -99, -96])
gl.ylocator = mticker.FixedLocator([33, 36, 39])
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
gl.xlabel_style = {'size': 13, 'color': 'gray', 'weight': 'bold'}
gl.ylabel_style = {'size': 13, 'color': 'gray', 'weight': 'bold'}

# Injection wells; drawn on the current axes, i.e. the map created above
ax1 = sns.scatterplot(x=df_inj_well['LONG'], y=df_inj_well['LAT'], hue=df_inj_well['PSI'], size=df_inj_well['PSI'])

# set title
ax1.set_title('location injection wells', size=15)

plt.show()

### Injection wells clustering

In [None]:
# Features used for clustering: the coordinates of each well
inj_cluster_features = ['LAT', 'LONG']
df_inj_well_for_cluster = df_inj_well[inj_cluster_features]

In [None]:
# Elbow plot for the injection wells: K-means sum of squared errors (inertia)
# for several cluster counts
numClusters = [10, 18, 19, 20, 21, 22, 23, 24, 25, 30, 40, 50, 80]
SSE = []
for k in numClusters:
    # Fixed seed (same value as the final model) so the curve is reproducible
    k_means = cluster.KMeans(n_clusters=k, random_state=1)
    k_means.fit(df_inj_well_for_cluster)
    SSE.append(k_means.inertia_)

plt.plot(numClusters, SSE, '-o')
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')
plt.grid()
plt.show()


In [None]:
# create 20 clusters and assign the cluster id to df_inj_well
k_means = cluster.KMeans(n_clusters=20, random_state=1)
k_means.fit(df_inj_well_for_cluster)
labels = k_means.labels_
centroids = k_means.cluster_centers_
df_label = pd.DataFrame(labels, index=df_inj_well.index, columns=['Cluster ID'])
# Centroid coordinates come in the order of the fitted features (LAT, LONG)
df_centroid_inj = pd.DataFrame(centroids, columns=['latitude', 'longitude'])

# Labels are in the row order of df_inj_well_for_cluster, which is a column
# selection of df_inj_well, so they can be assigned directly. Unlike a
# concat, this overwrites the column instead of appending a duplicate
# 'Cluster ID' when the cell is run again.
df_inj_well['Cluster ID'] = labels

## <a id="3"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>3: comparison earthquake - injection</center></h3>

## 3.1: earthquake' locations vs injection wells location

In [None]:
# Map comparing the injection-well clusters with the earthquake cluster centroids
plt.figure(figsize=(8, 8))

# Plate-carree axes; the extent is [lon_min, lon_max, lat_min, lat_max]
ax = plt.axes(projection=ccrs.PlateCarree())
ax.set_extent([-105, -93, 29, 41], crs=ccrs.PlateCarree())

# Background layers (ocean at 10m resolution, then land, state borders, rivers)
ax.add_feature(cfeature.OCEAN.with_scale('10m'))
for layer in (cfeature.LAND, cfeature.STATES, cfeature.RIVERS):
    ax.add_feature(layer)
ax.coastlines()

# Dashed grid with labels on the bottom and left sides only
label_style = {'size': 13, 'color': 'gray', 'weight': 'bold'}
grid = ax.gridlines(
    crs=ccrs.PlateCarree(),
    draw_labels=True,
    linewidth=1,
    color='darkgray',
    alpha=0.5,
    linestyle='--',
)
grid.top_labels = False
grid.right_labels = False
grid.xlocator = mticker.FixedLocator([-102, -99, -96])
grid.ylocator = mticker.FixedLocator([33, 36, 39])
grid.xformatter = LONGITUDE_FORMATTER
grid.yformatter = LATITUDE_FORMATTER
grid.xlabel_style = dict(label_style)
grid.ylabel_style = dict(label_style)

# One colour per cluster
qualitative_colors = sns.color_palette("Set2", 20)
h = .02

# Injection wells, coloured by well cluster id
sns.scatterplot(
    x=df_inj_well['LONG'],
    y=df_inj_well['LAT'],
    hue=df_inj_well['Cluster ID'],
    palette=qualitative_colors,
    legend=False,
    ax=ax,
)
# Centroids of the earthquake clusters (circles, default colour)
sns.scatterplot(
    x=df_centroid['longitude'],
    y=df_centroid['latitude'],
    s=100,
    marker='o',
    edgecolor='k',
    ax=ax,
)
# Centroids of the injection-well clusters (triangles, cluster colour)
ax = sns.scatterplot(
    x=df_centroid_inj['longitude'],
    y=df_centroid_inj['latitude'],
    hue=df_centroid_inj.index,
    palette=qualitative_colors,
    s=100,
    marker='^',
    edgecolor='k',
    ax=ax,
)

# set title
ax.set_title('centroids meq cluster vs centroids inj well location cluster', size=15)
plt.show()

No clear relation between earthquake locations and injection well locations.

## 3.2: comparison evolution MEQ and number of injection wells

In [None]:
# Running count of rows (1, 2, 3, ...) stored in a 'cumsum' column.
# NOTE(review): this is a cumulative number of events over time only if the
# rows of catalogue_ok are in chronological order - TODO confirm.
catalogue_ok['cumsum'] = np.ones(len(catalogue_ok), dtype='int64').cumsum()

# Same running count for the injection wells
df_inj_well['cumsum'] = np.ones(len(df_inj_well), dtype='int64').cumsum()

# NOTE(review): the first curve is drawn against the time index of the
# catalogue, the second against the row labels of df_inj_well; the two
# x axes are presumably not comparable - confirm which well date to use.
plt.plot(catalogue_ok.index, catalogue_ok['cumsum'])
plt.plot(df_inj_well.index, df_inj_well['cumsum'])
