In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Suppress all warnings for the rest of the session (this also hides library
# deprecation notices, so remove it temporarily when debugging).
import warnings
warnings.filterwarnings("ignore")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the earthquake catalogue into a DataFrame; the path is kept in one
# named constant so it is easy to repoint.
DATA_PATH = '/kaggle/input/earthquake/all_month.csv'
df = pd.read_csv(DATA_PATH)

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Keep only the name of the state (or region) from the 'place' column
    </p>

In [None]:
import re
def _region_from_place(place):
    """Return the trailing region of a free-text place description.

    Assumes strings shaped like "<distance> of <town>, <region>" -- TODO confirm
    against the dataset.

    * With a comma, everything after the LAST comma is kept, so multi-word
      regions such as "New Mexico" survive intact (taking only the last word
      would collapse it to "Mexico").
    * Without a comma, the last word is kept, as before.
    * Missing or empty values map to 'Unknown' instead of raising, so the
      column stays NaN-free.
    """
    if not isinstance(place, str):
        return 'Unknown'
    if ',' in place:
        region = place.rsplit(',', 1)[-1].strip()
        if region:
            return region
    words = re.findall(r'\w+', place)
    return words[-1] if words else 'Unknown'


df['short place'] = [_region_from_place(place) for place in df['place']]

# Rows without a magnitude cannot be used as training targets.
df = df.dropna(subset=['mag'])

<p style = "font-family:palatino linotype,serif;font-size:25px;"> 
    Make features
    </p>

In [None]:
# Quantities the model predicts: magnitude, depth and depthError.
targets = ['mag', 'depth', 'depthError']

# Columns that must never be used as inputs: identifiers, timestamps, raw text,
# and every target. Including 'depthError' here prevents target leakage when
# that column happens to be NaN-free.
excluded = {'mag', 'place', 'time', 'id', 'updated', 'net', 'magType', 'depth', *targets}

# Candidate inputs are the columns with no missing values. Filtering against a
# set (rather than list.remove) does not raise when an excluded column was
# already dropped for containing NaN.
# NOTE(review): the original comment said this leaves only place, type and
# source columns -- confirm against the actual dataset.
features = [
    column for column in df.columns
    if df[column].isna().sum() == 0 and column not in excluded
]

# The random forest cannot be fitted on NaN targets, so keep only rows where
# every target is present. Feature columns are NaN-free on the full frame and
# therefore on this subset too.
model_df = df.dropna(subset=targets)

# Explicit copies so later in-place encoding does not touch `df`.
X = model_df[features].copy()
y = model_df[targets].copy()

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Separate categorical data
    </p>

In [None]:
# Names of the feature columns stored with dtype "object"; these are the ones
# that need encoding before modelling.
categorical = [column for column in features if df[column].dtype == "object"]

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Encode categorical data to integers
    </p>

In [None]:
from sklearn import preprocessing
# A single encoder object is refit for every column, so only the mapping of
# the last encoded column remains available on `le` afterwards.
le = preprocessing.LabelEncoder()

# Replace each categorical feature with integer codes.
for column in categorical:
    X[column] = le.fit_transform(X[column])

# Do the same for any target column stored as dtype 'object'.
object_targets = [name for name in y.columns if y[name].dtype == 'object']
for column in object_targets:
    y[column] = le.fit_transform(y[column])

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Split the data into test and train
    </p>

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for the final evaluation. A fixed seed makes the
# split (and everything computed from it) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Build a random forest regressor
    </p>

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification
# Baseline multi-output random forest regressor.
# Only non-default settings are passed. The original call spelled out every
# default, including `criterion='mse'`, `max_features='auto'` and
# `min_impurity_split`, which have been removed from current scikit-learn and
# make the constructor fail. The library defaults are equivalent (squared-error
# criterion, all features considered per split), so behaviour is unchanged
# while the cell runs on both old and new versions.
# `random_state` is fixed so repeated runs build the same forest.
clf = RandomForestRegressor(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Cross-validation
    </p>

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the baseline forest on the full data set, using
# the estimator's default scorer; print the mean over the folds.
baseline_fold_scores = cross_val_score(clf, X, y, cv=5)
print(baseline_fold_scores.mean())

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    Hyperparameter tuning with grid search
    </p>

In [None]:
from sklearn.model_selection import GridSearchCV

# `max_features` given as an integer may not exceed the number of input
# columns; larger values make every fit for that candidate fail (and the
# failure is easy to miss when warnings are silenced). Clamp the candidates
# to the available column count; with 10 or more columns this is exactly the
# original grid of [5, 10].
n_features = X_train.shape[1]
max_features_grid = sorted({min(candidate, n_features) for candidate in (5, 10)})

param_grid = [
    {
        'n_estimators': [10, 25],
        'max_features': max_features_grid,
        'max_depth': [10, 50, None],
        'bootstrap': [True, False],
    }
]

# Exhaustive search over the grid with 10-fold cross-validation on the
# training split, ranking candidates by (negated) mean squared error.
grid_search_forest = GridSearchCV(clf, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search_forest.fit(X_train, y_train)

<p style = "font-family:palatino linotype,serif;font-size:25px;">
    In my case, tuning improved the score by a few percent
    </p>

In [None]:
# Same 5-fold cross-validation as for the baseline, now with the best
# estimator found by the grid search; print the mean over the folds.
tuned_fold_scores = cross_val_score(grid_search_forest.best_estimator_, X, y, cv=5)
print(tuned_fold_scores.mean())

 <p style = "font-family:palatino linotype,serif;font-size:25px;">
    Let's plot a heatmap to check the correlation between the predictions and the test data
    </p>

In [None]:
import plotly.graph_objects as go

# Predict the held-out rows with the tuned forest and label the outputs with
# the target column names.
y_predicted = grid_search_forest.best_estimator_.predict(X_test)
y_predicted = pd.DataFrame(y_predicted, columns=y_test.columns)

# Place each target's predicted and true values side by side. `.values` is
# used so the columns align by position, not by the differing indexes.
paired_columns = {}
for target in y_predicted.columns:
    paired_columns[target + '_predicted'] = y_predicted[target].values
    paired_columns[target + '_test'] = y_test[target].values
corrmatrix = pd.DataFrame(paired_columns)

# Compute the correlation matrix once and reuse it.
correlations = corrmatrix.corr()

# The axis labels are passed directly instead of being stored in variables
# named `x` and `y`: rebinding `y` would overwrite the target DataFrame and
# break any earlier cell that is re-run afterwards.
values = np.array(correlations.values)
fig = go.Figure(data=go.Heatmap(
    z=values,
    x=list(correlations.columns),
    y=list(correlations.index),
    hoverongaps=False))
fig.show()