In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dask.dataframe as dd
pd.options.mode.chained_assignment = None  # default='warn'

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Measure memory

In [None]:
# https://gdmarmerola.github.io/big-data-ml-training/
# https://github.com/gdmarmerola/big-data-ml-training/blob/master/track_memory.py

# libs to help us track memory via sampling
import numpy as np
import tracemalloc
from time import sleep
import matplotlib.pyplot as plt

# sampling time in seconds
SAMPLING_TIME = 0.001

class MemoryMonitor:
    """Samples the memory traced by ``tracemalloc`` at a fixed interval.

    Creating an instance starts ``tracemalloc``.  ``measure_usage`` is meant
    to run in a separate thread: it keeps sampling until another thread sets
    ``keep_measuring`` to ``False``.

    Args:
        close: when True, ``tracemalloc`` is stopped once sampling ends.
    """

    def __init__(self, close=True):

        # start tracemalloc and set the
        # measurement attribute to True
        tracemalloc.start()
        self.keep_measuring = True
        self.close = close

    def measure_usage(self):

        """
        Takes measurements of used memory at
        regular intervals determined by the
        global SAMPLING_TIME constant.

        Returns:
            list of float: currently traced memory, in units of 1e6 bytes,
            one entry per sample.  The list always holds at least one
            sample, even when ``keep_measuring`` was already cleared before
            this method started running.
        """

        # list to store memory usage samples
        usage_list = []

        # Sample first and check the flag afterwards, so that a caller that
        # stops the monitor immediately still gets one measurement instead
        # of an empty history.
        while True:

            # takes a sample and stores it in the usage_list
            current, _peak = tracemalloc.get_traced_memory()
            usage_list.append(current/1e6)

            # keeps going until someone changes this attribute to False
            if not self.keep_measuring:
                break
            sleep(SAMPLING_TIME)

        # stop tracemalloc and return the list
        if self.close:
            tracemalloc.stop()
        return usage_list

# imports executor
from concurrent.futures import ThreadPoolExecutor
from functools import wraps

def plot_memory_use(history, fn_name, open_figure=True, offset=0, **kwargs):

    """Plot memory use from a history collected by the MemoryMonitor class.

    Args:
        history: sequence of memory samples, one per sampling interval.
        fn_name: name shown in the plot title.
        open_figure: when True a new figure is created; when False the
            curve is drawn on the current matplotlib figure.
        offset: number of sampling intervals to shift the time axis by.
        **kwargs: forwarded to ``plt.fill_between`` (e.g. ``label``, ``color``).
    """

    # getting times from counts and sampling time
    times = (offset + np.arange(len(history))) * SAMPLING_TIME

    # opening figure and plotting
    if open_figure:
        plt.figure(figsize=(10,3), dpi=120)
    plt.plot(times, history, 'k--', linewidth=1)
    plt.fill_between(times, history, alpha=0.5, **kwargs)

    # axes titles
    plt.ylabel('Memory usage [MB]')
    plt.xlabel('Time [seconds]')
    plt.title(f'{fn_name} memory usage over time')

    # legend: only when the caller labelled the filled area. Without a label
    # there is nothing to list and matplotlib would only report that no
    # labelled artists were found.
    if kwargs.get('label') is not None:
        plt.legend()

def track_memory_use(plot=True, close=True, return_history=False):

    """Decorator factory that reports the memory used while a function runs.

    Args:
        plot: when True, the sampled history is drawn with ``plot_memory_use``.
        close: forwarded to ``MemoryMonitor``; when True tracemalloc is
            stopped after the measurement.
        return_history: when True the decorated function returns
            ``(result, history)`` instead of just ``result``.

    Returns:
        A decorator to apply to the function that should be measured.
    """

    def meta_wrapper(fn):

        """
        This function is meant to be used as a decorator
        that reports the memory usage of the wrapped function
        """

        # decorator so we can retrieve original fn
        @wraps(fn)
        def wrapper(*args, **kwargs):

            """
            Starts the wrapped function and holds a thread
            to sample memory usage while executing it
            """

            # context manager for executor
            with ThreadPoolExecutor() as executor:

                # start memory monitor
                monitor = MemoryMonitor(close=close)
                mem_thread = executor.submit(monitor.measure_usage)

                # start wrapped function and get its result
                try:
                    fn_thread = executor.submit(fn, *args, **kwargs)
                    fn_result = fn_thread.result()

                # when wrapped function ends (or raises), stop measuring
                finally:
                    monitor.keep_measuring = False
                    history = mem_thread.result()

                # inform results via prints and plot; an empty history
                # (no sample taken) must not raise on history[-1] / max()
                if history:
                    print(f'Current memory usage: {history[-1]:.2f}')
                    print(f'Peak memory usage: {max(history):.2f}')
                    if plot:
                        plot_memory_use(history, fn.__name__)
                else:
                    print('No memory samples were collected.')
            if return_history:
                return fn_result, history
            else:
                return fn_result

        return wrapper

    return meta_wrapper

In [None]:
import multiprocessing
# using a function so we can track memory usage
@track_memory_use(close=False, return_history=False)
def pandas_read():
    """Read the competition CSV files and wrap them as Dask collections.

    Returns:
        tuple: ``(train_df, test_df, ddX, ddy)`` where
            * ``train_df`` is the pandas DataFrame read from ``train.csv``,
            * ``test_df`` is a Dask DataFrame built from ``test.csv``,
            * ``ddX`` is a Dask DataFrame of ``train_df`` without ``Survived``,
            * ``ddy`` is a Dask Series holding the ``Survived`` column.
    """
    # every Dask collection is split into four partitions per reported CPU
    n_partitions = 4 * multiprocessing.cpu_count()

    # reading train and test data with pandas
    train_df = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
    test_frame = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')

    # separating features from the target before handing them to Dask
    features = train_df.drop(['Survived'], axis=1)
    target = train_df.Survived

    ddX = dd.from_pandas(features, npartitions=n_partitions)
    ddy = dd.from_pandas(target, npartitions=n_partitions)
    test_df = dd.from_pandas(test_frame, npartitions=n_partitions)

    return train_df, test_df, ddX, ddy

# executing
train_df, test_df, X, y = pandas_read()

In [None]:
# Columns stored as int64/float64 are treated as numerical. 'Pclass' is expected
# to be among them and is taken out because it is handled as a categorical feature.
numerical_features = [
    cname
    for cname, dtype in X.dtypes.items()
    if dtype in ['int64', 'float64']
]
numerical_features.remove('Pclass')
print('\nNumerical columns:', numerical_features)

In [None]:
# Every object-typed column is categorical; 'Pclass' is added explicitly.
categorical_features = [col for col, dtype in X.dtypes.items() if dtype == "object"]
categorical_features.append('Pclass')
print('\nCategorical columns:', categorical_features)

# Create Transformers

In [None]:
import re
import string

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, PowerTransformer

#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the columns named at construction time."""

    def __init__(self, feature_names):
        # Column labels to keep when ``transform`` is called.
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the instance itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the stored feature names."""
        selected = X[self.feature_names]
        return selected

## Categorical Transformers
Reference: https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [None]:
#Custom transformer 
class CategoricalTransformer( BaseEstimator, TransformerMixin ):
    """Feature engineering for the categorical columns of the passenger data.

    ``transform`` is written for a Dask DataFrame (it relies on
    ``map_partitions`` and ``compute``) that contains the ``Sex``, ``Cabin``,
    ``Name`` and ``Ticket`` columns.  The bracket, title and family-name
    helpers are kept on the class but are not applied by ``transform``.
    """

    #Class constructor method; the transformer takes no parameters
    def __init__(self):
        pass

    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self

    ### Helper Functions
    def sex_label_encoder(self, sex):
        """Encode 'male' as 0, 'female' as 1 and anything else as 2."""
        if sex == 'male':
            return 0
        if sex == 'female':
            return 1
        return 2 # fallback - unknown sex

    #Helper function to extract cabin_prefix from column 'cabin'
    def get_cabin_prefix(self, obj):
        """Return the upper-cased first character of ``str(obj)``.

        A missing cabin arrives as a float NaN whose string form is 'nan',
        so it maps to 'N'.  An empty string maps to '' instead of raising.
        """
        return str(obj)[:1].upper()

    def remove_bracket_from_name(self, name):
        """Return the part of ``name`` before the first '(' (all of it if none)."""
        return name.split('(')[0]

    def extract_title_from_name(self, name):
        """Return the first word after the first comma with dots removed.

        Returns '' when ``name`` has no comma or is not a string.
        """
        try:
            title = name.split(',')[1].strip().split(' ')[0].replace('.','')
        except (AttributeError, IndexError):
            title = ""
        return title

    def extract_family_from_name(self, name):
        """Return the text before the first comma without punctuation or outer whitespace."""
        family = name.split(',')[0]
        return family.translate(str.maketrans('', '', string.punctuation)).strip()

    def correct_title_names(self, title):
        """Normalise title variants: 'Mlle' and 'Ms' -> 'Miss', 'Mme' -> 'Mrs'."""
        corrections = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
        return corrections.get(title, title)

    def clean_rare_title_names(self, X):
        """Return the 'Title' column of ``X`` with infrequent titles replaced by 'Rare'."""
        stat_min = 10 #common minimum in statistics: http://nicholasjjackson.com/2012/03/08/sample-size-is-10-a-magic-number/
        is_rare = (X['Title'].value_counts() < stat_min)
        return X['Title'].apply(lambda title: 'Rare' if is_rare.loc[title] else title)

    def extract_first_name(self, name: str) -> int:
        """Return the number of characters before the first comma of ``name``."""
        return len(name.split(',')[0])

    def extract_last_name(self, name:str) -> int:
        """Return the length of the second comma-separated part of ``name``
        without its first character; 0 when ``name`` contains no comma.
        """
        parts = name.split(',')
        if len(parts) < 2:
            return 0
        return len(parts[1][1:])

    def extract_ticket_alphabetic_code(self, ticket):
        """Return the non-digit word characters of the ticket, or 'NA'.

        Punctuation, spaces and digits are removed from ``str(ticket)``.
        'NA' is returned when nothing is left or when the remainder is 'nan'
        (the string form of a missing value).
        """
        ticket_code = re.sub(r'[^\w\s]', '', str(ticket))
        ticket_code = ticket_code.replace(' ', '')
        ticket_code = re.sub(r'(\d)', '', ticket_code)
        if ticket_code in ('', 'nan'):
            ticket_code = 'NA'
        return ticket_code

    def extract_ticket_number(self, ticket):
        """Return the first run of digits in ``str(ticket)`` as a string, or '0'."""
        ticket_search = re.search(r'(\d+)', str(ticket))
        if ticket_search:
            return ticket_search.group(1)
        return str(0)

    def _map_column(self, column, func, name, dtype):
        """Lazily apply ``func`` to every element of the Dask Series ``column``.

        ``name`` and ``dtype`` describe the resulting series (Dask ``meta``).
        """
        return column.map_partitions(lambda part: part.apply(func), meta=(name, dtype))

    #Transformer method we wrote for this transformer
    @track_memory_use(close=False, return_history=False)
    def transform(self, X , y = None ):
        """Build the engineered columns and return them as a NumPy array.

        ``Sex`` is label encoded and ``Cabin_Prefix``, ``First_Name``,
        ``Last_Name`` and ``Ticket_Alphabetic_Code`` are appended; the raw
        ``Name``, ``Ticket`` and ``Cabin`` columns are then dropped.  The
        frame passed in is left untouched: the new columns are added with
        ``assign``, which returns a new frame.

        NOTE: the Title, Family_Name, TicketNumber and Ticket_Frequency
        features of the earlier pandas version of this method are not
        applied here.
        """
        # Name-length features:
        # https://www.kaggle.com/dwin183287/tps-april-2021-models-feature-enginering/execution#4.3.6.-First-name-and-last-Name---Continuous
        # Ticket code feature:
        # https://www.kaggle.com/dwin183287/tps-april-2021-models-feature-enginering/execution#4.4.4.-Ticket-Code---Categorical-&-Ticket-Number---Continuous
        X = X.assign(
            # Label encode Sex
            Sex=self._map_column(X.Sex, self.sex_label_encoder, 'Sex', 'int64'),
            # Create Cabin_Prefix
            Cabin_Prefix=self._map_column(X.Cabin, self.get_cabin_prefix, 'Cabin_Prefix', 'O'),
            # Name lengths; both helpers return integers
            First_Name=self._map_column(X.Name, self.extract_first_name, 'First_Name', 'i8'),
            Last_Name=self._map_column(X.Name, self.extract_last_name, 'Last_Name', 'i8'),
            # Ticket
            Ticket_Alphabetic_Code=self._map_column(
                X.Ticket, self.extract_ticket_alphabetic_code, 'Ticket_Alphabetic_Code', 'O'),
        )

        # drop columns
        X = X.drop(['Name', 'Ticket', 'Cabin'], axis = 1)
        # materialise the lazy Dask graph using the multiprocessing scheduler
        X = X.compute(scheduler='processes')
        #returns numpy array
        return X.values

## Numerical Transformers

In [None]:
#Custom transformer
class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Feature engineering for the numerical columns of the passenger data.

    ``transform`` is written for a Dask DataFrame (it relies on
    ``map_partitions`` and ``compute``) that contains the ``SibSp``,
    ``Parch`` and ``PassengerId`` columns.
    """

    #Class Constructor; the transformer takes no parameters
    def __init__( self ):
        pass

    #Return self, nothing else to do here
    def fit( self, X, y = None ):
        return self

    def create_family_size(self, df):
        """Return ``SibSp + Parch + 1`` for every row of ``df``."""
        return df.SibSp + df.Parch + 1

    def create_calculated_fare(self, df):
        """Return ``Fare / Family_Size`` for every row of ``df``.

        Not applied by ``transform``; requires a ``Family_Size`` column.
        """
        return df.Fare / df.Family_Size

    #Custom transform method that creates Family_Size and drops PassengerId
    @track_memory_use(close=False, return_history=False)
    def transform(self, X, y = None):
        """Add ``Family_Size``, drop ``PassengerId`` and return a NumPy array.

        The frame passed in is left untouched: the new column is added with
        ``assign``, which returns a new frame.
        """
        # create Family_Size
        # NOTE(review): meta declares int64, which assumes SibSp and Parch
        # hold no missing values - confirm against the data.
        family_size = X.map_partitions(self.create_family_size, meta=('Family_Size', 'int64'))
        X = X.assign(Family_Size=family_size)

        # drop the identifier column
        X = X.drop(['PassengerId'] ,axis = 1)
        # materialise the lazy Dask graph using the multiprocessing scheduler
        X = X.compute(scheduler='processes')

        #returns a numpy array
        return X.values

# Pipeline

In [None]:
# Recap of the two column groups that the pipeline selectors are built from.
print('\nCategorical columns:', categorical_features)
print('\nNumerical columns:', numerical_features)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Categorical branch: select columns -> engineer features -> impute with the
# most frequent value -> one-hot encode (dense output, unknown categories ignored).
# A MinMaxScaler(copy=False) + PCA(n_components=0.95) tail was tried and gave a
# worse score, so it is left out:
# https://www.mikulskibartosz.name/pca-how-to-choose-the-number-of-components/
# NOTE(review): newer scikit-learn releases name the `sparse` keyword
# `sparse_output` - confirm against the installed version.
categorical_pipeline = Pipeline(
    steps=[
        ('cat_selector', FeatureSelector(categorical_features)),
        ('cat_transformer', CategoricalTransformer()),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ]
)


# Numerical branch: select columns -> engineer features -> impute with the
# median -> min-max scale -> Yeo-Johnson power transform.
# The scaling step keeps the name 'std_scaler' although it holds a MinMaxScaler;
# StandardScaler and RobustScaler were noted to have no effect.
numerical_pipeline = Pipeline(
    steps=[
        ('num_selector', FeatureSelector(numerical_features)),
        ('num_transformer', NumericalTransformer()),
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', MinMaxScaler()),
        ('power_transform', PowerTransformer(method='yeo-johnson')),
    ]
)

# Join both branches side by side: FeatureUnion concatenates the categorical
# output columns followed by the numerical output columns.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('categorical_pipeline', categorical_pipeline),
        ('numerical_pipeline', numerical_pipeline),
    ]
)

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
# random_state = 42)

In [None]:
# https://stackoverflow.com/a/54363480 - DEBUG --> CHANGE Dask loc assignment
# https://stackoverflow.com/a/38776838 - DEBUG --> Change df.loc[row, column] syntax
# full_pipeline.fit(X, y)

In [None]:
# Fit both branches of the FeatureUnion on the Dask training features and target.
full_pipeline.fit(X, y)

In [None]:
# Run the fitted pipeline and wrap its output in a pandas DataFrame
# (no column names are passed, so the columns are numbered positionally).
X_transformed = pd.DataFrame(full_pipeline.transform(X))
# Materialise the lazy Dask target so it can be used next to X_transformed.
y_transformed = y.compute()
X_transformed

In [None]:
# Preview the first 50 transformed rows.
X_transformed.head(50)

In [None]:
# a.loc[:, a.columns[3]].unique()

# XGBoost Modeling

In [None]:
# !pip install dask_ml
# ! python -m pip install --upgrade dask
# ! python -m pip install fsspec
# ! python -m pip install --upgrade joblib

In [None]:
# from dask_ml.model_selection import train_test_split
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state = 42)
from sklearn.model_selection import train_test_split
# Shuffled 80/20 hold-out split of the transformed data; the seed makes it repeatable.
random_state = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X_transformed,
    y_transformed,
    train_size=0.8,
    test_size=0.2,
    random_state=random_state,
    shuffle=True,
)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# # Define the model
# clf = XGBClassifier(n_estimators = 1000, learning_rate = 0.05, eval_metric = 'error') # Your code here

# params = {'booster': ['gbtree', 'gblinear', 'dart']}

# class XGBTransformer(BaseEstimator, TransformerMixin):
#     #Class Constructor
#     def __init__( self ):
#         pass
        
#     #Return self, nothing else to do here
#     def fit( self, X, y = None):
#         return self 
    
#     @track_memory_use(close=False, return_history=False)
#     def transform(self, X, y = None):
#         return xgb.DMatrix(X)

# xgb_pipeline = Pipeline(steps=[\
#                                 ('data_wrangling', full_pipeline),
#                                 ('xgb_dmatrix', XGBTransformer())
#                                ])
# xgb_pipeline.fit(X, y)
# X_DMatrix = xgb_pipeline.transform(X)

# Dask HyperbandSearchCV

In [None]:
# ###### WORKING
# # https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning
# from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
# from sklearn.metrics import accuracy_score
# from hyperopt.pyll import scope
# import warnings
# warnings.filterwarnings("ignore")

# space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
#         'gamma': hp.uniform ('gamma', 1,9),
#         'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
#         'reg_lambda' : hp.uniform('reg_lambda', 0,1),
#         'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
#         'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
#         'n_estimators': scope.int(hp.quniform('n_estimators', 100, 300, q=1)),
#         'seed': 0,
#         'learning_rate' : hp.uniform('learning_rate', 0.01,0.1),
#         'booster' : [None, 'gbtree', 'gblinear', 'dart']
#     }

# def objective(space):
#     clf=xgb.XGBClassifier(
#                     n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
#                     reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
#                     colsample_bytree=int(space['colsample_bytree']), random_state = 42)
    
#     evaluation = [( X_train, y_train), ( X_valid, y_valid)]
    
#     clf.fit(X_train, y_train,
#             eval_set=evaluation, eval_metric="auc",
#             early_stopping_rounds=10,verbose=False)
    

#     pred = clf.predict(X_valid)
#     accuracy = accuracy_score(y_valid, pred>0.5)
#     print ("SCORE:", accuracy)
#     return {'loss': -accuracy, 'status': STATUS_OK }

# trials = Trials()

# best_hyperparams = fmin(fn = objective,
#                         space = space,
#                         algo = tpe.suggest,
#                         max_evals = 1000,
#                         trials = trials,
#                         rstate=np.random.RandomState(42))

In [None]:
# from pprint import pprint
# print('best loss: -0.77535\n')
# print("The best hyperparameters are : ")
# pprint(best_hyperparams)

# # best loss: -0.77535

# # The best hyperparameters are : 
# # {'colsample_bytree': 0.8378618694781489,
# #  'gamma': 2.8242880151334067,
# #  'learning_rate': 0.07069892867801585,
# #  'max_depth': 14.0,
# #  'min_child_weight': 5.0,
# #  'n_estimators': 202.0,
# #  'reg_alpha': 40.0,
# #  'reg_lambda': 0.9194811165570513}

In [None]:
# from pprint import pprint
# print('best loss: -0.77535\n')
# print("The best hyperparameters are : ")
# pprint(best_hyperparams)

# best loss: -0.77535

# The best hyperparameters are : 
# {'colsample_bytree': 0.8585598855009294,
#  'gamma': 1.3997603558344998,
#  'learning_rate': 0.03708950942418698,
#  'max_depth': 10.0,
#  'min_child_weight': 7.0,
#  'n_estimators': 231.0,
#  'reg_alpha': 90.0,
#  'reg_lambda': 0.44294659050314755}

In [None]:
# clf = xgb.XGBClassifier().set_params(**best_hyperparams)

# Double Cross-Validation / Nested Cross-Validation

In [None]:
######### WORKING HYPER PARAM SELECTION
from sklearn.model_selection import KFold, cross_validate

# Search / cross-validation settings shared by the tuning cells.
random_state = 42
n_iter = 100  # number of hyperopt evaluations (50 was used before)
num_folds = 10
kf = KFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=random_state,
)

In [None]:
# ######### WORKING HYPER PARAM SELECTION

# https://www.kaggle.com/ilialar/hyperparameters-tunning-with-hyperopt#Hyperopt
from sklearn.model_selection import KFold, cross_validate, cross_val_score

def xgb_clf_acc_f1_cv(params, random_state = random_state, cv=kf, X=X_train, y=y_train):
    """Hyperopt objective: negated mean cross-validated accuracy of an XGBClassifier.

    Args:
        params: mapping with the sampled values for 'max_depth', 'gamma',
            'reg_alpha', 'reg_lambda', 'colsample_bytree', 'min_child_weight',
            'n_estimators', 'seed' and 'learning_rate'.
        random_state: seed passed to the classifier.
        cv: cross-validation splitter handed to ``cross_val_score``.
        X, y: data the model is cross-validated on.

    Returns:
        The mean accuracy over the folds, negated so that a lower value is better.
    """
    # these parameters are cast to int before they reach the classifier;
    # the remaining ones are passed through unchanged
    integer_keys = ('max_depth', 'min_child_weight', 'n_estimators', 'seed')
    passthrough_keys = ('gamma', 'reg_alpha', 'reg_lambda', 'colsample_bytree', 'learning_rate')

    model_params = {key: int(params[key]) for key in integer_keys}
    for key in passthrough_keys:
        model_params[key] = params[key]

    # a fresh XGBoost classifier is built for every evaluation
    model = xgb.XGBClassifier(random_state = random_state, **model_params)

    # https://stackoverflow.com/a/35886445/15830024
    # cross validate on the given folds, using all available cores
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

    return -fold_scores.mean()

In [None]:
# ######### WORKING HYPER PARAM SELECTION

from sklearn.metrics import f1_score, accuracy_score
from hyperopt.pyll import scope
import warnings
warnings.filterwarnings("ignore")

# possible values of parameters
# ('booster' is not searched; 'seed' is a constant, not a sampled value)
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': scope.int(hp.quniform('n_estimators', 100, 300, q=1)),
        'seed': 0,
        'learning_rate' : hp.uniform('learning_rate', 0.01,0.1),
    }

# trials will contain logging information
trials = Trials()

# NOTE(review): recent hyperopt releases expect a numpy Generator
# (np.random.default_rng) for `rstate` - confirm against the installed version.
best=fmin(fn=xgb_clf_acc_f1_cv, # function to optimize
          space=space,
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )

# Build the final model from the values found by the search.  `space` only
# holds the hyperopt sampling expressions, so the values are read from `best`
# (which contains the sampled parameters but not the constant 'seed').
# The casts mirror the ones applied inside the objective function, so the
# hold-out model is the same configuration that was cross-validated.
best_params = {
    'max_depth': int(best['max_depth']),
    'gamma': best['gamma'],
    'reg_alpha': best['reg_alpha'],
    'reg_lambda': best['reg_lambda'],
    'colsample_bytree': best['colsample_bytree'],
    'min_child_weight': int(best['min_child_weight']),
    'n_estimators': int(best['n_estimators']),
    'learning_rate': best['learning_rate'],
    'seed': int(space['seed']),
}
model = xgb.XGBClassifier(random_state = int(random_state), **best_params)
model.fit(X_train,y_train)

# computing the score on the hold-out (validation) set
tpe_test_accuracy_score=accuracy_score(y_valid, model.predict(X_valid))

# tpe_test_accuracy_score is a plain number, so it is formatted directly
print("Best accuracy score {:.3f} params {}".format( tpe_test_accuracy_score, best))

In [None]:
# best

In [None]:
# Display the hyperparameter values returned by fmin.
best

In [None]:
# ######### WORKING HYPER PARAM SELECTION

# ######### WORKING HYPER PARAM SELECTION

# Hyperparameters reported for every trial; the same list drives both the
# value extraction and the column names so the two cannot drift apart.
tpe_param_names = ['learning_rate', 'max_depth', 'n_estimators', 'min_child_weight',
                   'colsample_bytree', 'reg_alpha', 'reg_lambda']

# one row per trial: the loss followed by the sampled hyperparameter values
tpe_results=np.array([[x['result']['loss']] +
                      [x['misc']['vals'][name][0] for name in tpe_param_names]
                      for x in trials.trials])

# Plotting stays best-effort (e.g. when there are no trials to plot), but the
# reason is reported instead of being silently discarded.
try:
    tpe_results_df=pd.DataFrame(tpe_results,
                           columns=['score'] + tpe_param_names)
    tpe_results_df.plot(subplots=True,figsize=(10, 10))
except Exception as exc:
    print(f'Could not plot the hyperopt trial history: {exc!r}')

In [None]:
# # https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/

# # manual nested cross-validation for random forest on a classification dataset
# from numpy import mean
# from numpy import std
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

# # configure the cross-validation procedure
# cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# # enumerate splits
# outer_results = list()
# for train_ix, test_ix in cv_outer.split(X_transformed):
#     # split data
#     X_train, X_test = X[train_ix, :], X[test_ix, :]
#     y_train, y_test = y[train_ix], y[test_ix]
#     # configure the cross-validation procedure
#     cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
#     # define the model
#     model = xgb.XGBClassifier(
#                     n_estimators =space['n_estimators'], 
#                     max_depth = int(space['max_depth']), 
#                     gamma = space['gamma'],
#                     reg_alpha = int(space['reg_alpha']),
#                     min_child_weight=int(space['min_child_weight']),
#                     colsample_bytree=int(space['colsample_bytree']),
#                     random_state = 42)
#     # define search space
#     space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
#         'gamma': hp.uniform ('gamma', 1,9),
#         'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
#         'reg_lambda' : hp.uniform('reg_lambda', 0,1),
#         'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
#         'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
#         'n_estimators': scope.int(hp.quniform('n_estimators', 100, 300, q=1)),
#         'seed': 0,
#         'learning_rate' : hp.uniform('learning_rate', 0.01,0.1),
#         'booster' : [None, 'gbtree', 'gblinear', 'dart']
#     }
#     # define search
#     search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True)
#     # execute search
#     result = search.fit(X_train, y_train)
#     # get the best performing model fit on the whole training set
#     best_model = result.best_estimator_
#     # evaluate model on the hold out dataset
#     yhat = best_model.predict(X_test)
#     # evaluate the model
#     acc = accuracy_score(y_test, yhat)
#     # store the result
#     outer_results.append(acc)
#     # report progress
#     print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# # summarize the estimated performance of the model
# print('Accuracy: %.3f (%.3f)' % (mean(outer_results), std(outer_results)))

In [None]:
# from distributed import Client
# client = Client(processes=False, n_workers=1, memory_limit='16GB')
# client

In [None]:
# client

In [None]:
# from dask_ml.model_selection import HyperbandSearchCV

# clf = XGBClassifier(n_estimators = 1000, learning_rate = 0.05, eval_metric = 'error') # Your code here
# model = clf
# params = {}
# space={'max_depth': range(3, 18),
#         'gamma': range(1,9),
#         'reg_alpha' : range('reg_alpha', 40,180),
#         'reg_lambda' : range(0,1),
#         'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
#         'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
#         'n_estimators': scope.int(hp.quniform('n_estimators', 100, 300, q=1)),
#         'seed': 0,
#         'learning_rate' : hp.uniform('learning_rate', 0.01,0.1),
#         'booster' : [None, 'gbtree', 'gblinear', 'dart']
#     }

# n_examples = 15 * len(X_train)
# n_params = 15

# max_iter = n_params  # number of times partial_fit will be called
# chunks = n_examples // n_params  # number of examples each call sees

# print((max_iter, chunks))
# search = HyperbandSearchCV(
#     model,
#     params,
#     max_iter=max_iter,
#     patience=True,
# )
# search.metadata["partial_fit_calls"]

In [None]:
# %%time
# search.fit(X_train_dask, y_train_dask, classes=[0, 1])

In [None]:
# before_mean_train_score, before_mean_test_score, before_mean_test_score_std = [], [], []
# after_mean_train_score, after_mean_test_score, after_mean_test_score_std = [], [], []
# best_param = []
# from tqdm import tqdm
# for model_name, model in tqdm(MLA_dict.items()):
#     base_results = model_selection.cross_validate(model, train_df[train_df_x_calc], train_df[Target], cv  = cv_split, return_train_score = True)
#     model.fit(train_df[train_df_x_calc], train_df[Target])
#     before_mean_train_score.append(base_results['train_score'].mean()*100)
#     before_mean_test_score.append(base_results['test_score'].mean()*100)
#     before_mean_test_score_std.append(base_results['test_score'].std()*100*3)
    
#     param_grid = parameters_dict[model_name]
#     print("")
#     print(model_name)
#     print(param_grid)
#     tune_model = model_selection.GridSearchCV(model, param_grid=param_grid, scoring = 'roc_auc', cv = cv_split, return_train_score = True)
#     tune_model.fit(train_df[train_df_x_calc], train_df[Target])
#     try:
#         print("Best parameters for model: " + model_name)
#     except Exception as e:
#         pass
#     print(tune_model.best_params_)
    
#     after_mean_train_score.append(tune_model.cv_results_['mean_train_score'][tune_model.best_index_]*100)
#     after_mean_test_score.append(tune_model.cv_results_['mean_test_score'][tune_model.best_index_]*100)
#     after_mean_test_score_std.append(tune_model.cv_results_['std_test_score'][tune_model.best_index_]*100*3)
#     best_param.append(tune_model.best_params_)
# print("DONE")


In [None]:
# param = {}
# param['booster'] = 'gbtree'
# param['objective'] = 'binary:logistic'
# param["eval_metric"] = "error"
# param['eta'] = 0.3
# param['gamma'] = 0
# param['max_depth'] = 6
# param['min_child_weight']=1
# param['max_delta_step'] = 0
# param['subsample']= 1
# param['colsample_bytree']=1
# param['silent'] = 0
# param['seed'] = 0
# param['base_score'] = 0.5
# param['random_state'] = 42
# param['learning_rate'] = 0.05
# param['n_estimators'] = 1000

# clf = XGBClassifier(use_label_encoder=False)
# clf.set_params(**param)
# full_pipeline.fit(X, y)
# # full_pipeline.transform(X_train)
# clf.fit(pd.DataFrame(full_pipeline.transform(X)), y.compute())
# # clf.fit(pd.DataFrame(full_pipeline.transform(X_train)), y_train.compute())
# # clf.score(pd.DataFrame(full_pipeline.transform(X_valid)), y_valid.compute())

In [None]:
# param = {'colsample_bytree': 0.8585598855009294,
#  'gamma': 1.3997603558344998,
#  'learning_rate': 0.03708950942418698,
#  'max_depth': 10,
#  'min_child_weight': 7,
#  'n_estimators': 231,
#  'reg_alpha': 90,
#  'reg_lambda': 0.44294659050314755}
# Build the classifier and apply the hyperparameters stored in `best`
# (defined in an earlier cell -- presumably the tuning result; the
# commented-out `param` dict above is a hand-recorded alternative).
clf = XGBClassifier(eval_metric='error')
clf.set_params(**best)

# Fit on the pipeline-transformed X. The transformed frame is passed inline
# rather than bound to a name so it is not kept alive after fitting;
# y is materialized with .compute() before being handed to the model.
clf.fit(
    pd.DataFrame(full_pipeline.transform(X)),
    y.compute(),
)

# Predict labels for the test set, run through the same feature pipeline.
test_preds = clf.predict(
    pd.DataFrame(full_pipeline.transform(test_df))
)

# Assemble the submission in the competition format (PassengerId, Survived),
# write it to disk, and leave the frame as the cell's displayed output.
passenger_ids = test_df.compute().PassengerId
output = pd.DataFrame(
    {
        'PassengerId': passenger_ids,
        'Survived': test_preds,
    }
)
output.to_csv('submission_gpu.csv', index=False)
output

In [None]:
# output.to_csv('submission_6.csv', index=False)

In [None]:
# from xgboost import XGBClassifier
# from sklearn.metrics import mean_absolute_error

# # Define the model
# clf = XGBClassifier(n_estimators = 1000, learning_rate = 0.05, eval_metric = 'error') # Your code here

# # # Fit the model
# clf.fit(X_train, y_train, early_stopping_rounds=5, 
#              eval_set=[(X_valid, y_valid)], 
#              verbose=False) 

# # # Get predictions
# predictions = clf.predict(X_valid) 

# # # Calculate MAE
# mae = mean_absolute_error(y_valid, predictions) # Your code here

# # # Uncomment to print MAE

# print("Mean Absolute Error:" , mae)

In [None]:
# from sklearn.model_selection import cross_val_score

# # import joblib



# model_pipeline = Pipeline( steps = \
# [ ( 'data_processing', X_transformed),
#     ( 'num_transformer', NumericalTransformer() ),                                  
#     ( 'imputer', SimpleImputer(strategy = 'median') ),
#     ( 'std_scaler', StandardScaler()),
#    # MinMaxScaler, StandardScaler, RobustScaler
#     ( 'power_transform', PowerTransformer(method = 'yeo-johnson'))
#    ])

# full_pipeline = FeatureUnion( transformer_list = \
#     [ ( 'full_pipeline', full_pipeline ),         
#       ( 'model_pipeline', model_pipeline ) 
#     ])

# from dask.distributed import Client
# import joblib

# client = Client(processes=False)             # create local cluster
# # client = Client("scheduler-address:8786")  # or connect to remote cluster

# with joblib.parallel_backend('dask', scatter = [X, y]):
#     cross_val_score(full_pipeline, X, y)

# # # Multiply by -1 since sklearn calculates *negative* MAE
# # scores = -1 * cross_val_score(full_pipeline, X, y_transformed,
# #                               cv=5,
# #                               scoring='neg_mean_absolute_error')

# # print("Average MAE score:", scores.mean())

# ERROR - pipeline.transform(X)
## Your notebook tried to allocate more memory than is available. It has restarted.
What's wrong with the transformers' `loc[:, column_name]`?  
How should I avoid the following warning?

```
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
```

In [None]:
### ERROR ###
# full_pipeline.transform(X)
### ERROR ###

In [None]:
# XGBoost Optimization
# https://gist.github.com/dirusali
# import xgboost as xgb

# class XGBTransformer(BaseEstimator, TransformerMixin):
#     #Class Constructor
#     def __init__( self ):
#         pass
        
#     #Return self, nothing else to do here
#     def fit( self, X, y = None ):
#         return self 
    
#     #Custom transform method that wraps X (and the optional labels y) in an xgb.DMatrix
#     def transform(self, X, y = None):
#         return xgb.DMatrix(X, label=y)

# cat_num_pipeline = FeatureUnion( transformer_list = [\
#                                                   ( 'categorical_pipeline', categorical_pipeline ),         
#                                                   ( 'numerical_pipeline', numerical_pipeline ) 
#                                                  ])
# xgb_pipeline = Pipeline(steps=[\
#                                 ('data_wrangling', cat_num_pipeline),
#                                 ('xgb_dmatrix', XGBTransformer())
#                                ])
# xgb_pipeline.fit(X, y)
# # help(xgb_pipeline.transform(X_train))