In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import fbeta_score, make_scorer
import pickle


In [25]:
df = pd.read_csv("../data/nfip_claims_ML.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'basementEnclosureCrawlspace', 'condominiumIndicator',
       'policyCount', 'countyCode', 'elevatedBuildingIndicator',
       'elevationCertificateIndicator', 'floodZone', 'latitude', 'longitude',
       'locationOfContents', 'numberOfFloorsInTheInsuredBuilding',
       'nonProfitIndicator', 'obstructionType', 'occupancyType',
       'originalConstructionDate', 'amountPaidOnBuildingClaim',
       'amountPaidOnContentsClaim',
       'amountPaidOnIncreasedCostOfComplianceClaim',
       'postFIRMConstructionIndicator', 'rateMethod', 'state',
       'totalBuildingInsuranceCoverage', 'totalContentsInsuranceCoverage',
       'yearOfLoss', 'reportedZipcode', 'primaryResidence',
       'totalinsurancecoverage', 'amountPaidonTotalClaim', 'new_elev_diff',
       'floodzone_code', 'MonthOfLosss', 'BuildingsConstructionYear',
       'LossRatio', 'claim_segmentation'],
      dtype='object')

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,basementEnclosureCrawlspace,condominiumIndicator,policyCount,countyCode,elevatedBuildingIndicator,elevationCertificateIndicator,floodZone,latitude,longitude,locationOfContents,numberOfFloorsInTheInsuredBuilding,nonProfitIndicator,obstructionType,occupancyType,originalConstructionDate,amountPaidOnBuildingClaim,amountPaidOnContentsClaim,amountPaidOnIncreasedCostOfComplianceClaim,postFIRMConstructionIndicator,rateMethod,state,totalBuildingInsuranceCoverage,totalContentsInsuranceCoverage,yearOfLoss,reportedZipcode,primaryResidence,totalinsurancecoverage,amountPaidonTotalClaim,new_elev_diff,floodzone_code,MonthOfLosss,BuildingsConstructionYear,LossRatio,claim_segmentation
0,0,1,N,1,24033,0,0,X,38,-77,0,3,0,999,1,1953-01-01T00:00:00.000Z,0,0,0,0.0,7,MD,100000,40000,2007,20745,1.0,140000,0,0,X,1,1953,0.0,0
1,1,0,N,1,30009,1,0,AE,45,-109,0,2,0,50,1,1975-01-01T00:00:00.000Z,593,0,0,0.0,1,MT,150000,0,2011,59068,1.0,150000,593,0,A,7,1975,0.003953,1
2,2,0,N,1,48201,0,3,AE,29,-95,0,2,0,10,1,1983-01-01T00:00:00.000Z,0,0,0,1.0,1,TX,25000,25000,2001,77025,1.0,50000,0,0,A,6,1983,0.0,0
3,3,0,N,1,12103,1,0,AE,27,-82,0,2,0,10,1,1969-01-01T00:00:00.000Z,0,0,0,0.0,1,FL,150000,35000,1996,33702,1.0,185000,0,-1,A,10,1969,0.0,0
4,4,0,N,1,24029,1,0,AE,39,-76,0,2,0,10,1,1980-01-01T00:00:00.000Z,31311,4329,0,0.0,1,MD,75000,10000,2003,21620,0.0,85000,35640,0,A,9,1980,0.419294,1


In [26]:
df.drop(["amountPaidonTotalClaim", "yearOfLoss", "originalConstructionDate", "Unnamed: 0", "floodZone", "LossRatio", "amountPaidOnBuildingClaim", "amountPaidOnContentsClaim", "amountPaidOnIncreasedCostOfComplianceClaim", "reportedZipcode", "latitude", "longitude", "countyCode"], inplace=True, axis=1)
df.claim_segmentation = [0 if x==1 else 1 for x in df.claim_segmentation]

In [9]:
#df = pd.get_dummies(df, drop_first=True)
#df.head()

Unnamed: 0,basementEnclosureCrawlspace,policyCount,countyCode,elevatedBuildingIndicator,elevationCertificateIndicator,latitude,longitude,locationOfContents,numberOfFloorsInTheInsuredBuilding,nonProfitIndicator,obstructionType,occupancyType,postFIRMConstructionIndicator,totalBuildingInsuranceCoverage,totalContentsInsuranceCoverage,reportedZipcode,primaryResidence,totalinsurancecoverage,amountPaidonTotalClaim,new_elev_diff,MonthOfLosss,BuildingsConstructionYear,claim_segmentation,condominiumIndicator_H,condominiumIndicator_L,condominiumIndicator_N,condominiumIndicator_Not_specified,condominiumIndicator_U,rateMethod_2,rateMethod_3,rateMethod_4,rateMethod_5,rateMethod_6,rateMethod_7,rateMethod_8,rateMethod_9,rateMethod_999,rateMethod_A,rateMethod_B,rateMethod_E,rateMethod_F,rateMethod_G,rateMethod_P,rateMethod_Q,rateMethod_R,rateMethod_RatingEngine,rateMethod_S,rateMethod_T,rateMethod_W,state_AL,state_AR,state_AS,state_AZ,state_CA,state_CO,state_CT,state_DC,state_DE,state_FL,state_GA,state_GU,state_HI,state_IA,state_ID,state_IL,state_IN,state_KS,state_KY,state_LA,state_MA,state_MD,state_ME,state_MI,state_MN,state_MO,state_MS,state_MT,state_NC,state_ND,state_NE,state_NH,state_NJ,state_NM,state_NV,state_NY,state_OH,state_OK,state_OR,state_PA,state_PR,state_RI,state_SC,state_SD,state_TN,state_TX,state_UN,state_UT,state_VA,state_VI,state_VT,state_WA,state_WI,state_WV,state_WY,floodzone_code_B,floodzone_code_C,floodzone_code_D,floodzone_code_N,floodzone_code_V,floodzone_code_X
0,1,1,24033,0,0,38,-77,0,3,0,999,1,0.0,100000,40000,20745,1.0,140000,0,0,1,1953,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,1,30009,1,0,45,-109,0,2,0,50,1,0.0,150000,0,59068,1.0,150000,593,0,7,1975,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,48201,0,3,29,-95,0,2,0,10,1,1.0,25000,25000,77025,1.0,50000,0,0,6,1983,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,12103,1,0,27,-82,0,2,0,10,1,0.0,150000,35000,33702,1.0,185000,0,-1,10,1969,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,24029,1,0,39,-76,0,2,0,10,1,0.0,75000,10000,21620,0.0,85000,35640,0,9,1980,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
df.to_csv("../data/claims_dummied.csv", index=False)

In [24]:
df.head()

Unnamed: 0,basementEnclosureCrawlspace,condominiumIndicator,policyCount,elevatedBuildingIndicator,elevationCertificateIndicator,locationOfContents,numberOfFloorsInTheInsuredBuilding,nonProfitIndicator,obstructionType,occupancyType,postFIRMConstructionIndicator,rateMethod,state,totalBuildingInsuranceCoverage,totalContentsInsuranceCoverage,primaryResidence,totalinsurancecoverage,amountPaidonTotalClaim,new_elev_diff,floodzone_code,MonthOfLosss,BuildingsConstructionYear,claim_segmentation
0,1,N,1,0,0,0,3,0,999,1,0.0,7,MD,100000,40000,1.0,140000,0,0,X,1,1953,1
1,0,N,1,1,0,0,2,0,50,1,0.0,1,MT,150000,0,1.0,150000,593,0,A,7,1975,0
2,0,N,1,0,3,0,2,0,10,1,1.0,1,TX,25000,25000,1.0,50000,0,0,A,6,1983,1
3,0,N,1,1,0,0,2,0,10,1,0.0,1,FL,150000,35000,1.0,185000,0,-1,A,10,1969,1
4,0,N,1,1,0,0,2,0,10,1,0.0,1,MD,75000,10000,0.0,85000,35640,0,A,9,1980,0


In [37]:
df.claim_segmentation

0          1
1          0
2          1
3          1
4          0
          ..
2337880    0
2337881    0
2337882    0
2337883    1
2337884    1
Name: claim_segmentation, Length: 2337885, dtype: int64

### In the next cell, make sure to remove the target column from the appropiate list! 

In [27]:
# create lists of features that need encoding or scaling
cat_features = list(df.select_dtypes(exclude="number").columns)
num_features = list(df.select_dtypes(include="number").columns)
num_features.remove("claim_segmentation")

In [28]:
# create a column transformer out of a scaler for the numerical and an encoder for the categorical columns 
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer([
    ("num", scaler, num_features),
    ("cat", encoder, cat_features)
])

In [29]:
# build pipe and name its components
pipe = Pipeline([
("preprocessor", preprocessor),
("clf", RandomForestClassifier(random_state=42))
])

### Remove cell/adjust target name as appropiate! 

In [30]:
# train test split
y = df.pop("claim_segmentation")
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [31]:
# make fbeta scorer
#ftwo_scorer = make_scorer(fbeta_score, beta=2)

In [36]:
# define params for random search
rs_params={"clf__max_depth": list(np.arange(10, 100, step=10)) + [None],
              "clf__n_estimators": np.arange(50, 250, step=50),
              "clf__max_features": ["sqrt", "log2"],
              "clf__criterion": ["gini","entropy"],
              "clf__min_samples_leaf": np.arange(2, 10),
              "clf__min_samples_split": np.arange(2, 10, step=2)
          }

In [39]:
# define randomized search
rand = RandomizedSearchCV(pipe, rs_params, n_iter=1, scoring="accuracy", cv=5, n_jobs=-1, random_state=42, verbose=5)

In [40]:
# run randomized search
rand.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [47]:
rand

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               StandardScaler(),
                                                                               ['basementEnclosureCrawlspace',
                                                                                'policyCount',
                                                                                'countyCode',
                                                                                'elevatedBuildingIndicator',
                                                                                'elevationCertificateIndicator',
                                                                                'latitude',
                                                                                'longitude',
                       

In [35]:
# show best parameters
print("Best score:\n{:.2f}".format(rand.best_score_))
print("Best parameters:\n{}".format(rand.best_params_))

Best score:
0.05
Best parameters:
{'clf__n_estimators': 20, 'clf__min_samples_split': 8, 'clf__min_samples_leaf': 7, 'clf__max_features': 'sqrt', 'clf__max_depth': 20, 'clf__criterion': 'entropy'}


In [20]:
# create a results dataframe from the results dict of the classifier
result_df_rand = pd.DataFrame.from_dict(rand.cv_results_, orient="columns")

# plot some results, e.g.
sns.relplot(data=result_df_rand,
            kind="line",
            x="param_clf__n_estimators",
            y="mean_test_score",
            hue="param_clf__max_features",
            col="clf__criterion")
plt.show()

ValueError: Could not interpret value `clf__criterion` for parameter `col`

In [21]:
result_df_rand

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__n_estimators,param_clf__min_samples_split,param_clf__min_samples_leaf,param_clf__max_features,param_clf__max_depth,param_clf__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,585.303834,6.623772,2.103004,1.765751,20,8,7,sqrt,20,entropy,"{'clf__n_estimators': 20, 'clf__min_samples_sp...",1.0,1.0,1.0,1.0,0.999991,0.999998,4e-06,1


In [None]:
pd.to_csv("../data/random_search_results.csv")

In [None]:
# define params for grid search, based on random search results
gs_params = {
               }

In [None]:
# define grid search
grid = GridSearchCV(pipe, param_grid=gs_params, cv=5, scoring=ftwo_scorer, verbose=1, n_jobs=-1)

In [None]:
# run grid search
grid.fit(X_train, y_train)

In [None]:
# show best parameters
print("Best score:\n{:.2f}".format(grid.best_score_))
print("Best parameters:\n{}".format(grid.best_params_))

In [None]:
# save best model
best_model = grid.best_estimator_
filename = "finalized_model.sav"
pickle.dump(best_model, open(filename, "wb"))