In [None]:
# Last amended: 15th April, 2021
# Objectives:
#                i) Data exploration
#               ii) Data Visualization
#              iii) Feature Engineering
#               iv) Modeling
#                v) Model optimization
#
#https://colab.research.google.com/drive/1lHArmyzqCy31EfrC7kM3r-qzB9xwP7LX
#https://www.kaggle.com/c/liberty-mutual-fire-peril

#           i) Liberty Mutual Group - Fire Peril Loss Cost 
#          ii) Predict a transformed ratio of loss to total insured value
# Project by V. Siva Sundara Prasad, Chief Manager - IT

# The problem
A Fortune 100 company, Liberty Mutual Insurance has provided a wide range of insurance products and services designed to meet their customers' ever-changing needs for over 100 years.

Within the business insurance industry, fire losses account for a significant portion of total property losses. High severity and low frequency, fire losses are inherently volatile, which makes modeling them difficult. In this problem, the task is to predict the target, a transformed ratio of loss to total insured value, using the provided information. This will enable more accurate identification of each policyholder’s risk exposure and the ability to tailor the insurance coverage for their specific operation.

The data provided represents almost a million insurance records and the task is to predict a transformed ratio of loss to total insured value (called "target" within the data set). The provided features contain policy characteristics, information on crime rate, geodemographics, and weather.

The train and test sets are split randomly. For each id in the test set, you must predict the target using the provided features.

### Field descriptions

Most of the fields are self-explanatory. The following are descriptions for those that aren't.
>  **id :** A unique identifier for each record in the data set

>  **target :** The transformed ratio of loss to total insured value

>  **dummy :** A nuisance variable used as a control in the model; it is not intended to be used as a predictor

>  **var1 – var17 :** A set of normalized variables representing policy characteristics (note: var11 is the weight used in the weighted gini score calculation)

>  **crimeVar1 – crimeVar9:** A set of normalized Crime Rate variables

>  **geodemVar1 – geodemVar37 :** A set of normalized geodemographic variables

>  **weatherVar1 – weatherVar236 :** A set of normalized weather station variables   


### Libraries and data files

In [None]:
# 1.3 Call libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os

In [None]:
# 1.4 Display the output of every top-level expression in a cell,
#     not only that of the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Read all data

In [None]:
# Read the train and test files with pandas (two zipped CSV files).
# The commented-out lines are the alternative paths with the files in the working directory.
#train = pd.read_csv("train.csv.zip")
train = pd.read_csv("../input/liberty-mutual-fire-peril/train.csv.zip")

test = pd.read_csv("../input/liberty-mutual-fire-peril/test.csv.zip")
# 
#test = pd.read_csv("test.csv.zip")


In [None]:
# Remove pandas' display limits so that all rows / all columns are shown.
# NOTE(review): with max_rows = None, displaying a whole large frame prints every row;
#               only display small slices such as .head().
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

In [None]:
# Check that both files were read. Shapes recorded for the full files:
#       train (452061, 302), test (450728, 301)
train.shape 
test.shape

In [None]:
# Reduce train and test to a random 25 % of their rows, because the full
# data exhausts the available memory.
# Rows are drawn without replacement. A fixed random_state makes the sample,
# and every result derived from it, reproducible on Restart & Run All.

train = train.sample(frac = .25, replace = False, random_state = 42)

test = test.sample(frac = .25, replace = False, random_state = 42)

In [None]:
# Check the shape of the reduced data sets (values recorded after the 25 % sampling)

train.shape # (113015, 302)
test.shape # (112682, 301)

In [None]:
# Import the garbage collector module
import gc 

# Run a collection now; the full-size frames were just replaced by their samples
gc.collect()

### Explore train data

In [None]:
# 2.1 Look at train data: shape, first rows, summary statistics and column dtypes
print("\n---train----\n")
train.shape         # (113015, 302) after the 25 % sampling; (452061, 302) for the full file
print("\n------train------\n")
train.head()
print("\n-----Summary------\n")
train.describe(include="all")
print("\n-----dtypes------\n")
train.dtypes

In [None]:
# Replace NaN with a fixed placeholder value chosen per column family.
# Each rule is (column-name prefix, fill value, cast the result to float64?).
# Rules are tried in this order and the first matching prefix wins;
# columns matching no prefix are left untouched.
fill_rules = (
    ('var',        0.1, False),
    ('crimeVar',   1.1, True),
    ('geodemVar',  2.1, True),
    ('weatherVar', 3.1, True),
)

for df in (train, test):
  for col_name in list(df.columns):
    for prefix, fill_value, as_float in fill_rules:
      if col_name.startswith(prefix):
        filled = df[col_name].fillna(fill_value)
        df[col_name] = filled.astype('float64') if as_float else filled
        break

In [None]:
# Ordinal-encode the policy columns of train: var1..var17 except var7.
# Each encoded copy is stored next to the original as "<name>_enc".
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
# The single encoder object is re-fitted for every column in turn, so after
# the loop it only holds the categories of the last column (var17).
for var_no in (1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17):
    src_col = "var{}".format(var_no)
    train[src_col + "_enc"] = ord_enc.fit_transform(train[[src_col]])

In [None]:
# Ordinal-encode the same policy columns (var1..var17 except var7) in the actual test data.
# NOTE(review): the encoder is re-fitted on the test values, so a given code is
#               not guaranteed to mean the same category as in train - confirm
#               this is acceptable, since train and test are clustered separately.
for var_no in (1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17):
    src_col = "var{}".format(var_no)
    test[src_col + "_enc"] = ord_enc.fit_transform(test[[src_col]])

In [None]:
# Ordinal-encode the column dummy in both the train and test data sets.
# The encoder is re-fitted separately on each frame.
train["dummy_enc"] = ord_enc.fit_transform(train[["dummy"]])
test["dummy_enc"] = ord_enc.fit_transform(test[["dummy"]])

In [None]:
# View the first rows of train, now including the new *_enc columns
train.head()

In [None]:
# Add one row-wise summary feature per column family: the standard deviation
# taken across all columns of that family.
# NOTE: the new columns keep their historical "*_mean" names because later
#       cells refer to them by these names; the values are standard deviations.
# Each entry is (first column, last column, name of the new feature column).
# The first..last range is inclusive and relies on the columns of a family
# being contiguous in the frame.
feature_groups = [
    ("var1_enc",    "var17_enc",     "var_enc_cols_mean"),
    ("crimeVar1",   "crimeVar9",     "crime_var_cols_mean"),
    ("geodemVar1",  "geodemVar37",   "geodem_var_cols_mean"),
    ("weatherVar1", "weatherVar236", "weather_var_cols_mean"),
]

for df in (train, test):
  for first_col, last_col, new_col in feature_groups:
    # Label-based slicing with .loc includes both end points
    df[new_col] = df.loc[:, first_col:last_col].std(axis=1)

In [None]:
# Separate the target from the features: pop() removes the column from
# train in place and returns it; its values are kept as a NumPy array.
y = train.pop('target').values

In [None]:
# Check the shape and first rows of train after removing the target column
train.shape
train.head()


In [None]:
# Drop the columns that are no longer needed in train: the identifier,
# the raw var1..var17 columns and the raw dummy column.
raw_cols = ['id'] + ['var{}'.format(k) for k in range(1, 18)] + ['dummy']
train.drop(columns = raw_cols, inplace = True)
train.head()

In [None]:
# Import matplot lib to draw plots for data vizualisation
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Distribution plot of the row-wise std of the encoded var columns
sns.distplot(train.var_enc_cols_mean)
sns.despine()               # Removes the top and right axes spines; plot with and without it


In [None]:
# Distribution plots of the engineered train columns, one subplot each,
# laid out on a 2 x 3 grid
columns = ['dummy_enc', 'var_enc_cols_mean', 'crime_var_cols_mean', 'geodem_var_cols_mean', 'weather_var_cols_mean']
fig = plt.figure(figsize = (10,10))
for position, col_name in enumerate(columns, start = 1):
    plt.subplot(2, 3, position)
    sns.distplot(train[col_name])


In [None]:
# Box plots of every continuous engineered feature against the categorical one
columns = ['geodem_var_cols_mean', 'weather_var_cols_mean', 'var_enc_cols_mean', 'crime_var_cols_mean']
catVar = ['dummy_enc']

# First create all (continuous, categorical) pairs
mylist = []
for cont in columns:
    for cat in catVar:
        mylist.append((cont, cat))
mylist

# 6.4 One subplot per pair on a 4 x 2 grid
fig = plt.figure(figsize = (10,10))
for position, (cont_name, cat_name) in enumerate(mylist, start = 1):
    plt.subplot(4, 2, position)
    sns.boxplot(x = cat_name, y = cont_name, data = train)

In [None]:
# Joint plot between geodem_var_cols_mean & weather_var_cols_mean.
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.jointplot(x = train.geodem_var_cols_mean, y = train.weather_var_cols_mean)

In [None]:
# Hexbin joint plot between var_enc_cols_mean & crime_var_cols_mean.
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.

sns.jointplot(x = train.var_enc_cols_mean,
              y = train.crime_var_cols_mean,
              kind = "hex"
              )


In [None]:
# Hexbin joint plot between geodem_var_cols_mean & weather_var_cols_mean.
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.jointplot(x = train.geodem_var_cols_mean,
              y = train.weather_var_cols_mean,
              kind = "hex"
              )


In [None]:
# See the power of t-sne
#      (t-distributed Stochastic Neighbor Embedding)

from sklearn.manifold import TSNE

# Project the engineered summary features onto two axes and colour every
# point by dummy_enc.
# The engineered columns were appended to train in this order:
#   dummy_enc, var_enc_cols_mean, crime_var_cols_mean, geodem_var_cols_mean, weather_var_cols_mean
enc_mean_cols = list(train.columns)
a = enc_mean_cols.index("dummy_enc")
b = enc_mean_cols.index("weather_var_cols_mean") + 1

# Everything after dummy_enc, up to and including weather_var_cols_mean
tsne_feature_cols = enc_mean_cols[a + 1:b]

# t-SNE is expensive, so embed a random subset of rows. The subset must hold
# more rows than the perplexity (default 30).
tsne_sample = train.sample(n = min(1000, len(train)), random_state = 42)

X_embedded = TSNE(n_components=2, random_state = 42).fit_transform(tsne_sample[tsne_feature_cols])
X_embedded.shape    # (number of sampled rows, 2), numpy array
df = pd.DataFrame(X_embedded, columns=['X','Y'])
# Carry the colouring variable over by position: tsne_sample keeps train's
# original index, which does not match df's fresh 0..n-1 index.
df['dummy_enc'] = tsne_sample['dummy_enc'].to_numpy()

sns.relplot(x = "X",
            y = "Y",
            hue = "dummy_enc",    # Colour each point by its dummy_enc value
            data = df
            )


In [None]:
# Import Standard Scaler 
from sklearn.preprocessing import StandardScaler as ss

In [None]:
# Standardise every train column with StandardScaler
scale = ss()                     # Create an instance of the class
X = scale.fit_transform(train)   # Learn the per-column statistics and transform in one step
X[:5, :]                         # See first 5 rows

In [None]:
# Import Train Test Split class
from sklearn.model_selection import train_test_split
# Class to develop kmeans model
from sklearn.cluster import KMeans
# Plotting library
import seaborn as sns
# How good is clustering?
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer


In [None]:
# Split the scaled train data into train/test parts.
# The training part of the target gets a real name (y_train) instead of `_`,
# because `_` is IPython's "last output" variable.
# A fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split( X,                 # np array without target
                                                     y,                 # Target
                                                     test_size = 0.25,  # test_size proportion
                                                     random_state = 42
                                                     )
# 4.1 Examine the results: 75 % / 25 % of the rows, same number of columns as X
X_train.shape
X_test.shape


In [None]:
# Cluster the scaled training split into two groups with KMeans
clf = KMeans(n_clusters = 2)
# Train the object over data
clf.fit(X_train)

# So what are our clusters?
# (the numbers in the trailing comments are outputs recorded from an earlier run)
clf.cluster_centers_
clf.cluster_centers_.shape         # (2, 303)
clf.labels_                        # Cluster labels for every observation
clf.labels_.size                   # 339045
clf.inertia_                       # 91021988.53256002 Sum of squared distance to respective centroids, SSE


In [None]:
#iv) Assign every row of the held-out split to its nearest KMeans centroid
y_pred = clf.predict(X_test)
y_pred
# 6.1 How good is the clustering?
#     Cluster ids are arbitrary labels and y_test is a continuous loss ratio,
#     so testing them for equality is not an accuracy measure.
#     The silhouette coefficient (range -1..1, higher means better separated
#     clusters) is used instead. It is computed on a random subsample to keep
#     the pairwise-distance matrix small.
silhouette_score(X_test, y_pred,
                 sample_size = min(2000, len(X_test)),
                 random_state = 42)


In [None]:
#iv) Plot the first two columns of the held-out split, coloured by predicted cluster.
#    x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
dx = pd.Series(X_test[:, 0])
dy = pd.Series(X_test[:,1])
sns.scatterplot(x = dx, y = dy, hue = y_pred)


In [None]:
# Show the shape and first rows of the sampled test dataframe
test.shape  # (112682, 322) recorded from an earlier run
test.head()

In [None]:
# Drop the columns that are no longer needed in test: the identifier,
# the raw var1..var17 columns and the raw dummy column.
raw_cols = ['id'] + ['var{}'.format(k) for k in range(1, 18)] + ['dummy']
test.drop(columns = raw_cols, inplace = True)
test.head()

In [None]:
# Check that train and the actual test data now have the same number of columns
train.shape # (113015, 303)
test.shape # (112682, 303)

In [None]:
# Standardise the actual test data. From here on X holds the scaled test
# data instead of the scaled train data.
# NOTE(review): the scaler is re-fitted on test, so test is scaled with its
#               own statistics rather than those learnt from train - confirm
#               this is intended.
X = scale.fit_transform(test)


In [None]:
# Show the first 5 rows of the scaled test data
X[:5, :]     

In [None]:
#iv) Cluster the scaled actual test data into two groups with a new KMeans model
clf = KMeans(n_clusters = 2)
# Train the object over data
clf.fit(X)

# So what are our clusters?
# (the numbers in the trailing comments are outputs recorded from an earlier run)
clf.cluster_centers_
clf.cluster_centers_.shape         # (2, 303)
clf.labels_                        # Cluster labels for every observation
clf.labels_.size                   # 450728
clf.inertia_                       # 120773499.68828635 Sum of squared distance to respective centroids, SSE


In [None]:
#iv) Assign every row of the actual test data to its nearest KMeans centroid
y_pred = clf.predict(X)
y_pred
# Share of rows assigned to cluster 1.
# The mean of the boolean mask divides by the actual number of rows in X.
# A hard-coded row count would be wrong once the data has been sampled.
np.mean(y_pred == 1)

In [None]:
#iv) Are the clusters distinguishable?
#     We plot the 1st and 2nd columns of X.
#     Each point is coloured as per the
#     cluster to which it is assigned (y_pred).
#     x and y are passed by keyword: recent seaborn releases no longer
#     accept them positionally.
dx = pd.Series(X[:, 0])
dy = pd.Series(X[:,1])
sns.scatterplot(x = dx, y = dy, hue = y_pred)

In [None]:
# Scree plot: X_train
# Fit KMeans for 1, 2 and 3 clusters and record the inertia (SSE) of each fit
sse = []
for n_clusters in range(1, 4):
    # Create an instance of class
    clf1 = KMeans(n_clusters = n_clusters)
    # Train the kmeans object over data
    clf1.fit(X_train)
    # Store the value of inertia in sse
    sse.append(clf1.inertia_ )

# Plot inertia against the number of clusters.
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x = list(range(1, 4)), y = sse)

In [None]:
# Scree plot: X - actual test data
# Fit KMeans for 1, 2 and 3 clusters and record the inertia (SSE) of each fit
sse = []
for n_clusters in range(1, 4):
    #  Create an instance of class
    clf1 = KMeans(n_clusters = n_clusters)
    #  Train the kmeans object over data
    clf1.fit(X)
    #  Store the value of inertia in sse
    sse.append(clf1.inertia_ )

# Plot inertia against the number of clusters.
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x = list(range(1, 4)), y = sse)

In [None]:
# Run a garbage collection before the memory-hungry Gaussian mixture fits
gc.collect()

In [None]:
# Import GaussianMixture class
from sklearn.mixture import GaussianMixture

import time

In [None]:
# Configure a Gaussian mixture model for clustering (fitted in the next cell)
gm_liberty = GaussianMixture(
                           n_components = 3,   # Number of clusters; more clusters, more time
                           n_init = 10,
                           max_iter = 100
                         )

In [None]:
# Fit the Gaussian mixture on the scaled test data and time the fit
start = time.time()
gm_liberty.fit(X)
end = time.time()
(end - start)/60     # elapsed minutes; 6.76 recorded from an earlier run


In [None]:
# Did the Gaussian mixture fit converge?
gm_liberty.converged_     # True


In [None]:
# Cluster label predicted by the Gaussian mixture for every row of X
gm_liberty.predict(X)



In [None]:
#v) How many iterations did the Gaussian mixture fit perform?
gm_liberty.n_iter_      #  9

In [None]:
# What share of the data points falls into each of the three clusters?
#   With return_counts = True, np.unique() returns a tuple
#   with the counts at index 1; dividing by len(X) turns
#   the counts into fractions.

np.unique(gm_liberty.predict(X), return_counts = True)[1]/len(X)

In [None]:
# GMM is a generative model:
#     draw a random sample from the fitted mixture.
#     ToDo: Generate digits using MNIST

gm_liberty.sample()

In [None]:
# Plot the first two columns of X coloured by GMM cluster label,
#     and overlay the GMM component means (red markers)

fig = plt.figure()

plt.scatter(X[:, 0], X[:, 1],
            c=gm_liberty.predict(X),
            s=2)

plt.scatter(gm_liberty.means_[:, 0], gm_liberty.means_[:, 1],
            marker='v',
            s=5,               # marker size
            linewidths=5,      # linewidth of marker edges
            color='red'
            )
plt.show()

In [None]:
# Look up anomalous customers and try to understand their behavior.
#     Anomaly detection:
#     anomalous points are those that lie in a
#     low-density region, i.e. whose density is
#     in the lowest 4 % (threshold computed in a later cell).
#     score_samples() gives the score (density)
#     of each point under the fitted mixture.
#     The higher the value, the higher the density.

densities = gm_liberty.score_samples(X)
densities

In [None]:
# Density threshold for anomalies: the 4th percentile of all point densities
density_threshold = np.percentile(densities,4)
density_threshold # 5.4128688273335195 (recorded from an earlier run)

In [None]:
# Anomalous rows: those whose density lies below the threshold
anomalies = X[densities < density_threshold]
anomalies
anomalies.shape

In [None]:
# Show the anomalous points (red crosses) on top of all points,
# using the first two columns of X; all points are coloured by GMM cluster
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c = gm_liberty.predict(X))
plt.scatter(anomalies[:, 0], anomalies[:, 1],
            marker='x',
            s=50,               # marker size
            linewidths=5,      # linewidth of marker edges
            color='red'
            )
plt.show()

In [None]:
# Non-anomalous rows: those whose density is at or above the threshold
unanomalies = X[densities >= density_threshold]
unanomalies.shape    # (108174, 303)

In [None]:
# Turn both the anomalous and the non-anomalous rows into
#     pandas DataFrames (columns are labelled 0..n-1) and
#     tag each frame with a constant label column 'z'
#df_anomalies = pd.DataFrame(anomalies, columns = ['x', 'y', 'p'])
df_anomalies = pd.DataFrame(anomalies)
df_anomalies['z'] = 'anomalous'   # Constant label column
#df_normal = pd.DataFrame(unanomalies, columns = ['x','y', 'p'])
df_normal = pd.DataFrame(unanomalies)
df_normal['z'] = 'unanomalous'    # Constant label column


In [None]:
# Inspect the column labels of the anomalous frame and the first rows of the normal frame
df_anomalies.columns

df_normal.head()

In [None]:
# Density plots of column 0: anomalous vs. non-anomalous rows, overlaid on the same axes
sns.distplot(df_anomalies[0])
sns.distplot(df_normal[0])

In [None]:
# Density plots of column 301: anomalous vs. non-anomalous rows, overlaid on the same axes
sns.distplot(df_anomalies[301])
sns.distplot(df_normal[301])

In [None]:
# Density plots of column 100: anomalous vs. non-anomalous rows, overlaid on the same axes
sns.distplot(df_anomalies[100])
sns.distplot(df_normal[100])

In [None]:
# Compare anomalous and non-anomalous rows with side-by-side boxplots.
# First stack the two dataframes
df = pd.concat([df_anomalies,df_normal])
# Draw one boxplot per feature, each on its own axes. Calling sns.boxplot
# repeatedly without `ax` draws all of them on top of each other.
fig, axes = plt.subplots(1, 3, figsize = (15, 5))
for ax, feature in zip(axes, (0, 301, 100)):
    sns.boxplot(x = df['z'], y = df[feature], ax = ax)
    ax.set_title('Feature {}'.format(feature))

In [None]:
#vi) Compute AIC and BIC for Gaussian mixtures with 1, 2 and 3 components,
#    to be drawn as a scree plot for choosing the number of clusters
start = time.time()

bic, aic = [], []
for n_components in range(1, 4):
    gm2 = GaussianMixture(n_components = n_components,
                          n_init = 10,
                          max_iter = 100)
    gm2.fit(X)
    bic.append(gm2.bic(X))
    aic.append(gm2.aic(X))

end = time.time()
(end - start)/60     # 17.740078067779542 minutes

In [None]:
#vi) Scree plot of AIC and BIC against the number of mixture components.
#    Lower values are better; labels and a legend tell the two curves apart.
fig = plt.figure()
plt.plot([1,2,3], aic, label = 'AIC')
plt.plot([1,2,3], bic, label = 'BIC')
plt.xlabel('Number of components')
plt.ylabel('Information criterion')
plt.legend()
plt.show()

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
# t-distributed stochastic neighbour embedding (t-SNE)
#     Project the scaled data onto two dimensions and
#     colour the points by their GMM cluster labels.
#     Only two components are requested because only
#     two are plotted.
start = time.time()
tsne = TSNE(n_components = 2, perplexity = 30)
tsne_out = tsne.fit_transform(X)
plt.scatter(tsne_out[:, 0], tsne_out[:, 1],
            marker='o',
            s=50,              # marker size
            linewidths=5,      # linewidth of marker edges
            c=gm_liberty.predict(X)   # Colour as per the 3-component GMM fitted above
            )
plt.title('t-SNE visualization');
end = time.time()
(end - start)/60     # elapsed minutes

#### This block is taking more than 2 hours to complete

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

In [None]:
# Look at the first rows of the prepared test data
test.head()

In [None]:
#  Classification

ITERATIONS=10   # Decides how many param-combinations can be tested

# Define the hyperparameter search space.
# Keys must be parameters of the estimator being tuned (XGBClassifier),
# not names of data columns. Log-uniform ranges need a strictly positive
# lower bound.
params ={
        'n_estimators': Integer(50, 300),
        'max_depth': Integer(2, 10),
        'learning_rate': Real(1e-3, 1.0, 'log-uniform'),
        'min_child_weight': Integer(1, 10),
        'gamma': Real(1e-9, 0.5, 'log-uniform'),
        'subsample': Real(0.5, 1.0, 'uniform'),
        'colsample_bytree': Real(0.3, 1.0, 'uniform'),
        'reg_lambda': Real(1e-9, 1000, 'log-uniform'),
        'reg_alpha': Real(1e-9, 1.0, 'log-uniform')
    }
    

In [None]:

# This code is used to overcome the error
# TypeError: __init__() got an unexpected keyword argument 'iid'
def bayes_search_CV_init(self, estimator, search_spaces, optimizer_kwargs=None,
                         n_iter=50, scoring=None, fit_params=None, n_jobs=1,
                         n_points=1, iid=True, refit=True, cv=None, verbose=0,
                         pre_dispatch='2*n_jobs', random_state=None,
                         error_score='raise', return_train_score=False):
        """Replacement for ``BayesSearchCV.__init__`` that does not forward ``iid``.

        The search-specific settings are stored on the instance, the search
        space is validated with ``_check_search_space``, and the remaining
        arguments are passed on to the parent class initialiser. ``iid`` is
        still accepted in the signature but is ignored.
        """

        self.search_spaces = search_spaces
        self.n_iter = n_iter
        self.n_points = n_points
        self.random_state = random_state
        self.optimizer_kwargs = optimizer_kwargs
        self._check_search_space(self.search_spaces)
        self.fit_params = fit_params

        # Note: `iid` is deliberately not passed on to the parent initialiser
        super(BayesSearchCV, self).__init__(
             estimator=estimator, scoring=scoring,
             n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score)
        
# Monkey-patch: every BayesSearchCV created after this line uses the initialiser above
BayesSearchCV.__init__ = bayes_search_CV_init

In [None]:
# BayesSearchCV is a drop-in replacement for GridSearchCV
# https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html
bayes_cv_tuner = BayesSearchCV(
                               # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
                               # Also specify 'fixed' parameter values here
                               estimator = xgb.XGBClassifier(
                                                             n_jobs = 3,
                                                             objective = 'binary:logistic',
                                                             eval_metric = 'auc', # Not relevant here:
                                                                                  # the search is scored with `scoring` below
                                                             tree_method='approx' 
                                                                     # 'hist' is one fast approx method
                                                            ),
                              search_spaces = params,    
                                  scoring = 'roc_auc',
                              cv = StratifiedKFold(
                                                    n_splits=3,
                                                    shuffle=True,
                                                   ),
                              n_jobs = 3,
                              n_iter = ITERATIONS,   
                              verbose = 1,
                              refit = True
                             )


In [None]:
#  Define a callback function
def status_print(optim_result):
    """Status callback during bayesian hyperparameter search.

    Prints the number of models evaluated so far together with the best
    score and best parameters found by the module-level ``bayes_cv_tuner``,
    then writes all results so far to "myresults_cv_results.csv".

    Parameters
    ----------
    optim_result :
        The object handed to the callback by the search; it is not used here.
    """

    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # Report progress: number of models tried, best score and best parameters
    print('Model: {}\n            \
           Best ROC-AUC: {}\n     \
           Best params: {}\n      \
           '.format(
                     len(all_models), # no of rows
                     np.round(bayes_cv_tuner.best_score_, 4),
                     bayes_cv_tuner.best_params_
                    )

         )

    # Save all model results
    all_models.to_csv("myresults_cv_results.csv")

In [None]:
# Check the scaled array against the test frame it was built from
X.shape
test.shape
test.head()

In [None]:
# First 5 rows of columns 298..302 of the scaled data
X[:5, 298:303]

In [None]:

# Finally fit the model: run the search on the scaled test data, using the
# KMeans cluster labels (y_pred) as the class labels
start = time.time()
result = bayes_cv_tuner.fit(
    X,
    y_pred,
    # callback = [list_of_callables] is called after each
    #   parameter combination tested.
    callback = [status_print],
)
end = time.time()
(end-start)/60

In [None]:
######### Done ##################