In [None]:
import time
import gc

import skopt
import mlflow
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import roc_auc_score, make_scorer, f1_score, recall_score, precision_score, balanced_accuracy_score, classification_report
from sklearn import preprocessing


# Hyperparameter search space handed to skopt. The order of the dimensions
# matters: `make_params_to_log` reads the optimiser's result vector by position,
# and each `name` is the keyword `objective` receives via `use_named_args`.
SPACE = [
    skopt.space.Real(0.005, 0.1, prior='log-uniform', name='learning_rate'),
    skopt.space.Integer(3, 100, name='max_depth'),
    skopt.space.Real(0.1, 1, prior='uniform', name='colsample_bytree'),
    skopt.space.Real(0.5, 1.0, prior='uniform', name='subsample'),
]


# XGBoost settings that stay fixed for every trial; `objective` merges these
# over the sampled hyperparameters, so a key listed here always wins.
STATIC_PARAMS = dict(
    objective="binary:logistic",
    # metric="custom",
    n_estimators=5000,
    random_state=314,
    eval_metric="auc",
)


def read_data():
    """Load the memory-reduced train/test CSVs and the sample submission.

    Returns:
        A ``(train, test, sub)`` tuple of DataFrames, read in that order from
        paths relative to the current working directory.
    """
    csv_paths = (
        'input/train_reduced.csv',
        'input/test_reduced.csv',
        'input/sample_submission.csv.zip',
    )
    train, test, sub = (pd.read_csv(csv_path) for csv_path in csv_paths)
    return train, test, sub


def preprocess(train, test, min_non_null=2000):
    """Turn the raw train/test frames into aligned, fully numeric feature frames.

    Steps: drop the identifier/label columns, drop sparsely populated feature
    columns, fill the remaining missing values with -1 and integer-encode the
    object-dtype columns.

    The set of surviving feature columns is decided on ``train`` only and then
    applied to ``test``. Thresholding each frame independently can leave the two
    frames with different columns (for example, a test frame with fewer than
    ``min_non_null`` rows would lose every column), which breaks the encoding
    step and any model fitted on ``train``.

    Args:
        train: Training frame; must contain the columns 'ID' and 'target'.
        test: Test frame; must contain 'ID' and every feature column kept
            from ``train``.
        min_non_null: Minimum number of non-missing values a training column
            needs in order to be kept.

    Returns:
        ``(train, test)`` as new frames with identical columns in identical
        order. The frames passed in are not modified.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    train = train.drop(['ID', 'target'], axis=1)
    test = test.drop('ID', axis=1)

    # Choose the features on the training data, then force test to match.
    train = train.dropna(axis=1, thresh=min_non_null)
    test = test[train.columns]

    train = train.fillna(-1)
    test = test.fillna(-1)
    train, test = _label_encode(train, test)
    return train, test

def _label_encode(train, test):
    """Integer-encode, in place, every object-dtype column of ``train`` and its twin in ``test``.

    One encoder per column is fitted on the train and test values together so
    that a value appearing in only one of the frames still gets a code.

    Returns:
        The same two frame objects that were passed in, after modification.
    """
    for column in train.columns:
        if train[column].dtype != 'object':
            continue
        encoder = preprocessing.LabelEncoder()
        encoder.fit(list(train[column].values) + list(test[column].values))
        train[column] = encoder.transform(list(train[column].values))
        test[column] = encoder.transform(list(test[column].values))
    return train, test

def train_evaluate(all_params):
    """Fit one XGBClassifier with early stopping and return its validation AUC.

    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from module scope;
    they must be defined before this function is called.

    The AUCs reported are those of the best boosting round. With early stopping
    the last recorded round is the one at which patience ran out, so reading the
    final entry of the history would under-report the model.

    Args:
        all_params: Keyword arguments for ``XGBClassifier``. The AUC histories
            are read under the key "auc", so "auc" is expected to be the
            evaluation metric.

    Returns:
        The validation AUC at the best boosting round.

    Side effects:
        Prints the run time and the parameters. When the validation AUC exceeds
        ``AUC_SCORE_THRESHOLD``, also logs the parameters, both AUCs and the
        run time to a new MLflow run.
    """
    # Runs at or below this validation AUC are not logged to MLflow.
    AUC_SCORE_THRESHOLD = 0.79
    start_time = time.time()
    model = XGBClassifier(**all_params)
    model.fit(X_train, y_train, early_stopping_rounds=7, eval_set=[(X_train, y_train), (X_val, y_val)])

    evals_result = model.evals_result()
    # validation_0 / validation_1 follow the order of `eval_set` above.
    train_history = evals_result["validation_0"]["auc"]
    val_history = evals_result["validation_1"]["auc"]

    # Fall back to the last round if the fitted model exposes no usable best round.
    best_round = getattr(model, "best_iteration", None)
    if best_round is None or not 0 <= best_round < len(val_history):
        best_round = len(val_history) - 1
    train_auc = train_history[best_round]
    val_auc = val_history[best_round]

    target_run_time = time.time() - start_time
    print(f"Took {target_run_time} seconds to optimize")
    print(all_params)
    if val_auc > AUC_SCORE_THRESHOLD:
        with mlflow.start_run():
            print(f"Train auc score: {train_auc}")
            print(f"Val auc score: {val_auc}")
            print("-------------------")

            mlflow.log_params(all_params)
            mlflow.log_metric("train_auc", train_auc)
            mlflow.log_metric("val_auc", val_auc)
            mlflow.log_metric("target_run_time", target_run_time)
    return val_auc

def custom_f1_score(threshold):
    """Build a custom evaluation metric that scores a weighted per-class F1.

    Scores are turned into hard 0/1 predictions with ``score >= threshold``; the
    F1 of class 0 is weighted 0.33333 and the F1 of class 1 is weighted 0.66666.

    The per-class F1 values are requested for the labels 0 and 1 explicitly, so
    the metric works whether the true labels arrive as ints or floats, and a
    class that is absent from a batch scores 0 instead of raising.

    Args:
        threshold: Cut-off applied to the predicted scores.

    Returns:
        A function ``func(y_true, y_pred)`` returning the triple
        ``("f1_score", value, True)``. The trailing ``True`` is presumably the
        higher-is-better flag of LightGBM-style custom metrics (see the linked
        question) - confirm against the caller.
    """
    # https://stackoverflow.com/questions/63399806/how-to-pass-additional-parameters-to-lgbm-custom-loss-function
    def func(y_true, y_pred):
        hard_pred = (np.asarray(y_pred) >= threshold).astype(int)
        f1_score_0, f1_score_1 = f1_score(y_true, hard_pred, labels=[0, 1], average=None)
        f_score = f1_score_0 * 0.33333 + f1_score_1 * 0.66666
        return "f1_score", f_score, True
    return func

def custom_recall_score(y_true, y_pred):
    """Custom metric: recall of ``y_pred`` against ``y_true``.

    ``y_pred`` is a flat array whose length is a multiple of ``len(y_true)``; it
    is reshaped to one row per sample before scoring.

    NOTE(review): the argmax step that would turn per-class scores into labels
    is not applied, so the reshaped array is passed to ``recall_score`` as is -
    confirm that callers supply hard labels.

    Returns:
        The triple ``("recall", value, True)``; the trailing ``True`` is
        presumably a higher-is-better flag - confirm against the caller.
    """
    outputs_per_sample = len(y_pred) // len(y_true)
    per_sample = y_pred.reshape(outputs_per_sample, -1).T
    return "recall", recall_score(y_true, per_sample), True

def custom_precision_score(y_true, y_pred):
    """Custom metric: precision of ``y_pred`` against ``y_true``.

    ``y_pred`` is a flat array whose length is a multiple of ``len(y_true)``; it
    is reshaped to one row per sample before scoring.

    NOTE(review): the argmax step that would turn per-class scores into labels
    is not applied, so the reshaped array is passed to ``precision_score`` as
    is - confirm that callers supply hard labels.

    Returns:
        The triple ``("precision", value, True)``; the trailing ``True`` is
        presumably a higher-is-better flag - confirm against the caller.
    """
    outputs_per_sample = len(y_pred) // len(y_true)
    per_sample = y_pred.reshape(outputs_per_sample, -1).T
    return "precision", precision_score(y_true, per_sample), True

def custom_balanced_score(y_true, y_pred):
    """Custom metric: balanced accuracy of ``y_pred`` against ``y_true``.

    ``y_pred`` is a flat array whose length is a multiple of ``len(y_true)``; it
    is reshaped to one row per sample before scoring.

    NOTE(review): the argmax step that would turn per-class scores into labels
    is not applied, so the reshaped array is passed to
    ``balanced_accuracy_score`` as is - confirm that callers supply hard labels.

    Returns:
        The triple ``("balanced_score", value, True)``; the trailing ``True`` is
        presumably a higher-is-better flag - confirm against the caller.
    """
    outputs_per_sample = len(y_pred) // len(y_true)
    per_sample = y_pred.reshape(outputs_per_sample, -1).T
    return "balanced_score", balanced_accuracy_score(y_true, per_sample), True

@skopt.utils.use_named_args(SPACE)
def objective(**params):
    """Objective for the skopt minimiser: the negated validation AUC of one trial.

    The sampled hyperparameters arrive as keyword arguments named after the
    dimensions of ``SPACE``. ``STATIC_PARAMS`` is merged on top of them, so a
    static setting overrides a sampled one that shares its key.

    Returns:
        ``-1.0 * train_evaluate(...)``, so that minimising this value maximises
        the AUC returned by ``train_evaluate``.
    """
    all_params = dict(params)
    all_params.update(STATIC_PARAMS)
    return -1.0 * train_evaluate(all_params)


def make_params_to_log(results):
    """Name the best point of an optimisation result and return it with its score.

    Args:
        results: Object exposing ``x`` (the best point, indexable by position in
            the order learning_rate, max_depth, colsample_bytree, subsample) and
            ``fun`` (the objective value at that point).

    Returns:
        ``(params_dict, best_value)`` where ``best_value`` is ``results.fun``
        unchanged. When the result comes from minimising ``objective``, that
        value is the negated validation AUC.
    """
    ordered_names = ("learning_rate", "max_depth",
                     "colsample_bytree", "subsample")
    named_point = {
        name: results.x[position]
        for position, name in enumerate(ordered_names)
    }
    return named_point, results.fun


def make_train_val_sets(X_train, y_train, random_state=None):
    """Split features and labels into stratified 75% train / 25% validation parts.

    Args:
        X_train: Feature matrix to split.
        y_train: Labels; also used to stratify the split.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split unseeded; pass an int to make the split
            reproducible across runs.

    Returns:
        Whatever ``train_test_split`` returns for two inputs: the train and
        validation parts of ``X_train`` followed by those of ``y_train``.
    """
    return train_test_split(
        X_train,
        y_train,
        train_size=0.75,
        test_size=0.25,
        stratify=y_train,
        random_state=random_state,
    )

In [None]:
# Reduce df memory

def read_data():
    """Load the original (not yet reduced) train/test archives and the sample submission.

    NOTE(review): a function with this name is already defined earlier in the
    notebook, reading the reduced CSVs. Once this cell has run, every later
    ``read_data()`` call in the same kernel resolves to this version - confirm
    that this is intended.

    Returns:
        A ``(train, test, sub)`` tuple of DataFrames, read in that order from
        paths relative to the current working directory.
    """
    archive_paths = (
        'input/train.csv.zip',
        'input/test.csv.zip',
        'input/sample_submission.csv.zip',
    )
    train, test, sub = (pd.read_csv(archive_path) for archive_path in archive_paths)
    return train, test, sub

def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Per column:
      * datetime columns are left alone;
      * object and category columns are converted to ``category``;
      * bool columns are left alone (casting them to a float type would make
        them larger, not smaller);
      * signed / unsigned integer columns get the narrowest signed / unsigned
        integer type whose range contains the column's min and max, bounds
        included;
      * every other column is treated as floating point and gets the narrowest
        float type whose range contains its min and max, falling back to
        float64 when none does (e.g. an all-NaN column).

    Float downcasting only checks the value range, not the precision, so values
    may be rounded. Pass ``use_float16=False`` to stop at float32.

    Args:
        df: Frame to shrink. It is modified in place.
        use_float16: Whether float16 may be chosen for float columns.

    Returns:
        The same ``df`` object, after modification.

    Side effects:
        Prints the memory usage before and after, and the relative saving.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        type_name = col_type.name

        if 'datetime' in type_name:
            continue
        if col_type == object or type_name == 'category':
            df[col] = df[col].astype('category')
            continue
        if type_name in ('bool', 'boolean'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        dtype_text = str(col_type)

        if dtype_text.startswith('int') or dtype_text.startswith('uint'):
            candidates = signed_types if dtype_text.startswith('int') else unsigned_types
            # An empty column has NaN min/max: no candidate matches and the
            # column keeps its current dtype.
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df


if __name__ == "__main__":
    # Stage 1: load the frames with whichever `read_data` is currently defined.
    print("Starting loading data")
    train, test, sub = read_data()
    y = train.target.values
    print("Finished loading data")

    # Stage 2: downcast both frames, train first.
    print("Start memory optimization")
    train, test = reduce_mem_usage(train), reduce_mem_usage(test)
    print("Finish memory optimization")

    # Stage 3: write the reduced frames out as CSV, without the index column.
    # NOTE(review): CSV stores no dtype information, so the downcast dtypes are
    # presumably re-inferred when these files are read back - confirm.
    reduced_outputs = (
        (train, "input/train_reduced.csv"),
        (test, "input/test_reduced.csv"),
    )
    for frame, destination in reduced_outputs:
        frame.to_csv(destination, index=False)

In [None]:
# Abishek parameters
# Prepare the `train`, `test`, `sub` and `y` globals used by the model cell.
if __name__ == "__main__":

    print("Starting loading data")
    # NOTE(review): `read_data` is defined twice in this notebook; this call
    # uses whichever definition ran last in the kernel - confirm which input
    # files are intended here.
    loaded_frames = read_data()
    train, test, sub = loaded_frames
    # Labels are captured before `preprocess` drops the 'target' column.
    y = train.target.values
    print("Finished loading data")

    train, test = preprocess(train=train, test=test)

    # X_train, X_val, y_train, y_val = train_test_split(train, y, test_size=0.33, random_state=42, stratify=y)
    
    # del train
    # del test
    # del X
    # del y
    # gc.collect()

In [None]:
# Fit one XGBoost model on the full preprocessed training frame with fixed
# hyperparameters; relies on `train`, `test`, `y` and `sub` from earlier cells.
clf = XGBClassifier(n_estimators=5000, nthread=-1, max_depth=17,
                        learning_rate=0.01, silent=False, subsample=0.8, colsample_bytree=0.7)
clf.fit(train, y)

# Second column of predict_proba: the probability of the second class.
preds = clf.predict_proba(test)[:, 1]
# Item assignment rather than `sub.target = preds`: attribute assignment only
# updates a column that already exists and otherwise sets a plain attribute,
# in which case the predictions would never reach the CSV.
sub['target'] = preds
sub.to_csv('sub.csv', index=False)

In [None]:
# Hyperparameter tuning
# if __name__ == "__main__":
    
#     print("Starting loading data")
#     train, test, sub = read_data()
#     y = train.target.values
#     print("Finished loading data")
    
#     train, test = preprocess(train, test)

#     X_train, X_val, y_train, y_val = train_test_split(train, y, test_size=0.33, random_state=42, stratify=y)
    
#     del train
#     del test
#     # del X
#     del y
#     gc.collect()
    
#     experiment_name = f"XGB"
#     mlflow.set_experiment(experiment_name)

#     results = skopt.forest_minimize(objective, SPACE, n_calls=100, n_random_starts=50, random_state=42)
#     params_dict, best_auc_score = make_params_to_log(results)