In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import fbeta_score, make_scorer
import pickle


In [2]:
df = pd.read_csv("../data/nfip_claims_ML.csv")
df.drop(["amountPaidonTotalClaim", "yearOfLoss", "originalConstructionDate", "Unnamed: 0", "floodZone", "LossRatio", "amountPaidOnBuildingClaim", "amountPaidOnContentsClaim", "amountPaidOnIncreasedCostOfComplianceClaim"], inplace=True, axis=1)
df.claim_segmentation = [0 if x==1 else 1 for x in df.claim_segmentation]
# create lists of features that need encoding or scaling
cat_features = list(df.select_dtypes(exclude="number").columns)
num_features = list(df.select_dtypes(include="number").columns)
num_features.remove("claim_segmentation")
# create a column transformer out of a scaler for the numerical and an encoder for the categorical columns 
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer([
    ("num", scaler, num_features),
    ("cat", encoder, cat_features)
])
# build pipe and name its components
pipe = Pipeline([
("preprocessor", preprocessor),
("clf", RandomForestClassifier(random_state=42))
])

# train test split
y = df.pop("claim_segmentation")
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
X_train_samp = X_train.sample(frac=0.1, random_state=42)
y_train_samp = y_train[X_train_samp.index]
# define params for random search
rs_params={"clf__max_depth": list(np.arange(10, 100, step=10)) + [None],
              "clf__n_estimators": np.arange(50, 500, step=50),
              "clf__max_features": ["sqrt", "log2"],
              "clf__criterion": ["gini","entropy"],
              "clf__min_samples_leaf": np.arange(1, 10),
              "clf__min_samples_split": np.arange(2, 10, step=2)
          }
# define randomized search
rand = RandomizedSearchCV(pipe, rs_params, n_iter=200, scoring=["f1_weighted", "accuracy"], cv=5, n_jobs=-1, random_state=42, verbose=5, refit="f1_weighted")
# run randomized search
rand.fit(X_train_samp, y_train_samp)

In [None]:
# show best parameters
print("Best score:\n{:.2f}".format(rand2.best_score_))
print("Best parameters:\n{}".format(rand2.best_params_))
# create a results dataframe from the results dict of the classifier
result_df_rand2 = pd.DataFrame.from_dict(rand2.cv_results_, orient="columns")

# plot some results, e.g.
sns.relplot(data=result_df_rand,
            kind="line",
            x="param_clf__n_estimators",
            y="mean_test_score",
            #col="clf__criterion",
            hue="param_clf__max_features"
            )
plt.show()

result_df_rand2.to_csv("../data/random_search_results_with_geo.csv")

Best score:
0.76
Best parameters:
{'clf__n_estimators': 400, 'clf__min_samples_split': 6, 'clf__min_samples_leaf': 5, 'clf__max_features': 'log2', 'clf__max_depth': 70, 'clf__criterion': 'gini'}


In [None]:
result_df_rand

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__n_estimators,param_clf__min_samples_split,param_clf__min_samples_leaf,param_clf__max_features,param_clf__max_depth,param_clf__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,10504.179758,29.86041,42.638269,4.813891,400,6,5,log2,70,gini,"{'clf__n_estimators': 400, 'clf__min_samples_s...",0.765917,0.766273,0.766011,0.765936,0.766007,0.766029,0.000128,1


In [None]:
# define params for grid search, based on random search results
gs_params = {
               }

In [None]:
# define grid search
grid = GridSearchCV(pipe, param_grid=gs_params, cv=5, scoring=ftwo_scorer, verbose=1, n_jobs=-1)

In [None]:
# run grid search
grid.fit(X_train, y_train)

In [None]:
# show best parameters
print("Best score:\n{:.2f}".format(grid.best_score_))
print("Best parameters:\n{}".format(grid.best_params_))

In [None]:
# save best model
best_model = grid.best_estimator_
filename = "finalized_model.sav"
pickle.dump(best_model, open(filename, "wb"))