In [None]:
import numpy as np
import pandas as pd
from numpy import nan

from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier,GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import KFold,cross_val_score,StratifiedKFold
from sklearn.utils import resample
from sklearn.model_selection import train_test_split,cross_validate,cross_val_predict,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,auc
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score,precision_recall_curve,accuracy_score,roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold,RFECV
from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2,mutual_info_classif
from sklearn.calibration import CalibratedClassifierCV,calibration_curve,CalibrationDisplay
from sklearn.metrics import multilabel_confusion_matrix
import xgboost as xgb
import os

from sklego.meta import Thresholder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, make_scorer

from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

import geopandas as gpd
import cartopy.crs as ccrs

In [None]:
# Locate the project folders relative to the notebook's working directory:
# codeDir is the directory that contains the cwd, parentDir is one level above codeDir.
codeDir = os.path.split(os.path.abspath(os.getcwd()))[0]
parentDir = os.path.split(codeDir)[0]

In [None]:
# Load the intermediate sample table (Feather file) stored directly under parentDir.
# os.path.join uses the platform's separator instead of a hard-coded Windows '\\',
# so the notebook also runs on Linux/macOS.
df = pd.read_feather(os.path.join(parentDir, 'intermediate_datadev2'))

In [None]:
# Keep only the samples whose max_mrms exceeds the min_intensity of their gage.
# The per-gage minimum intensities are read from a Feather file under parentDir
# (os.path.join avoids the hard-coded Windows separator).
min_int = pd.read_feather(os.path.join(parentDir, 'min_intensity_gage'))
min_int['gage_id'] = min_int.index

# Each df.gage_id entry is indexable; its first element identifies the gage to look up.
first_gage = [gage[0] for gage in df.gage_id]
# gage_id mirrors min_int's index, so one label lookup replaces the per-row boolean mask.
# Keeping only the first row per index label preserves the old "first match wins" result.
min_by_gage = min_int.loc[~min_int.index.duplicated(keep='first'), 'min_intensity']
# .loc with a list raises KeyError for an unknown gage rather than silently yielding NaN.
df['min_int'] = min_by_gage.loc[first_gage].to_numpy()

# Inside query() the name min_int resolves to the column just added, not the DataFrame above.
df = df.query('max_mrms > min_int')

# Renumber the rows and discard the helper column plus the columns not used as features.
df = df.reset_index(drop=True).drop(columns=['min_int','gage_id','max_accum_atgage'])

In [None]:
# remove samples less than 1km apart, keep first
# NOTE(review): remove_closest is neither defined nor imported in the visible cells, so this
# cell raises NameError on a fresh kernel unless the helper is defined elsewhere — confirm
# where it comes from. The km unit and the keep-first behaviour are taken from the original
# comment; the helper's body is not visible here.
distance_lessthan = 1
df = remove_closest(df,distance_lessthan)

In [None]:
# Restrict the samples to longitudes below 255.5
# (the cutoff was 255 when the dataset was first developed).
df = df[df['longitude'].lt(255.5)]

In [None]:
# Label a sample 1 when both ratios lie inside their tolerance bands (bounds inclusive):
#   AR_peak in [0.3, 1/0.3]  and  AR_accum in [0.25, 1/0.25]; every other sample gets 0.
# assign() builds the column on a new frame instead of writing into the filtered slice
# produced by the preceding longitude filter, which avoids pandas' SettingWithCopyWarning.
peak_in_band = df.AR_peak.between(.3, 1/.3)
accum_in_band = df.AR_accum.between(.25, 1/.25)
df = df.assign(label=(peak_in_band & accum_in_band).astype('int64'))

# Discard samples whose ratio is +inf.
# NOTE(review): rows whose ratio is NaN (or -inf) pass this filter and keep label 0, because
# the ratio columns are dropped before dropna() runs — confirm that this is intended.
df = df.loc[(df.AR_peak != np.inf) & (df.AR_accum != np.inf)]

# Drop the ratios used for labelling together with the columns excluded from the feature set.
df = df.drop(columns=[
    'AR_peak', 'AR_accum',
    'std_int_point', 'std_accum_point', 'mean_accum_point',
    'accum_mean_storm', 'accum_std_storm',
])

# Remove every row that still has a missing value in the remaining columns.
df = df.dropna()

In [None]:
# Separate the target vector from the features, then standardise the features.
# NOTE(review): the scaler is fitted on every row before the train/test split and before
# cross-validation, so held-out rows contribute to the scaling statistics.
labels = df['label'].values
scaler = StandardScaler()
data = scaler.fit_transform(df.drop(columns=['label']))

# Hold out 20% of the rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

# Five shuffled, stratified folds; the cross-validation cells below pass this as cv.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
# no zeros
# NOTE(review): this cell applies no filtering of its own; the "no zeros" variant depends on
# how the upstream cells were run — confirm before comparing results across cells.
clf = GradientBoostingClassifier()
# Leaves clf trained on the full data set; cross_validate fits its own clones per fold.
clf.fit(data, labels)

x = cross_validate(
    clf, data, labels, cv=cv,
    scoring=['f1', 'accuracy', 'roc_auc', 'precision', 'recall'],
)

# For each metric: print the per-fold scores, then their mean rounded to two decimals.
for metric in ('f1', 'accuracy', 'roc_auc', 'precision', 'recall'):
    fold_scores = x['test_' + metric]
    print(fold_scores)
    print(metric + ': ' + str(round(fold_scores.mean(), 2)))

In [None]:
# remove when mrms reading below min gage recording
# NOTE(review): the variant named above is not applied inside this cell; it reflects the state
# of the upstream cells when the cell was run — confirm.

# fit() returns the estimator itself, so clf ends up trained on the full data set.
# cross_validate works on clones and does not depend on this fit.
clf = GradientBoostingClassifier().fit(data, labels)

metric_names = ['f1', 'accuracy', 'roc_auc', 'precision', 'recall']
x = cross_validate(clf, data, labels, cv=cv, scoring=metric_names)

# Report the per-fold scores and the two-decimal mean for every metric, in order.
for name in metric_names:
    scores = x['test_' + name]
    mean_text = str(round(scores.mean(), 2))
    print(scores)
    print(': '.join((name, mean_text)))

In [None]:
# remove when mrms reading below min gage recording + buffer
# NOTE(review): no buffer is applied in this cell; the variant depends on how the upstream
# cells were run — confirm.

clf = GradientBoostingClassifier()
clf.fit(data, labels)  # full-data fit; the CV scores below come from per-fold clones

x = cross_validate(
    clf,
    data,
    labels,
    cv=cv,
    scoring=['f1', 'accuracy', 'roc_auc', 'precision', 'recall'],
)

# Gather the fold scores per metric first, then print each array followed by its rounded mean.
per_metric = [
    (label, x['test_' + label])
    for label in ['f1', 'accuracy', 'roc_auc', 'precision', 'recall']
]
for label, values in per_metric:
    print(values)
    print(label + ': ' + str(round(values.mean(), 2)))

In [None]:
# remove when mrms reading below min gage recording, shift threshold, label small int 1
# NOTE(review): none of these adjustments is made inside this cell; they depend on how the
# upstream cells were run — confirm.

clf = GradientBoostingClassifier()
# Trains clf on all samples; cross_validate below evaluates clones fitted per fold.
clf.fit(data, labels)

scoring_names = ('f1', 'accuracy', 'roc_auc', 'precision', 'recall')
x = cross_validate(clf, data, labels, cv=cv, scoring=list(scoring_names))

# One print call per metric: the fold scores on the first line, the rounded mean on the second.
for scoring_name in scoring_names:
    fold_values = x['test_' + scoring_name]
    summary = scoring_name + ': ' + str(round(fold_values.mean(), 2))
    print(fold_values, summary, sep='\n')

In [None]:
# change min intensity
# NOTE(review): the minimum intensity is not changed inside this cell; the variant depends on
# how the upstream cells were run — confirm.

clf = GradientBoostingClassifier()
clf.fit(data, labels)  # keeps a full-data model in clf; CV below uses its own clones

x = cross_validate(
    clf, data, labels,
    cv=cv,
    scoring=['f1', 'accuracy', 'roc_auc', 'precision', 'recall'],
)

# Build the report line by line (fold scores, then rounded mean, per metric) and print it once.
report_lines = []
for key in ['f1', 'accuracy', 'roc_auc', 'precision', 'recall']:
    report_lines.append(str(x['test_' + key]))
    report_lines.append(key + ': ' + str(round(x['test_' + key].mean(), 2)))
print('\n'.join(report_lines))