In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Project structure

- input
    - [x] train.csv
    - [x] test.csv
- src
    - [ ] create_folds.py
    - [ ] train.py
    - [ ] inference.py
    - [ ] models.py
    - [ ] config.py
    - [ ] model_dispatcher.py
- models
    - [ ] model_rf.bin
	- [ ] model_et.bin
- notebooks
	- [ ] exploration.ipynb
	- [ ] check_data.ipynb
-  [ ] readme.md
-  [ ] license

In [None]:
# Notebook configuration.

# Kaggle input CSVs of the MNIST-in-CSV dataset.
csv_train = "../input/mnist-in-csv/mnist_train.csv"
csv_test = "../input/mnist-in-csv/mnist_test.csv"

# Working-directory file that receives the training data plus its `kfold` column.
csv_output = 'mnist_train_folds.csv'

In [None]:
# Load the MNIST training CSV configured in `csv_train`.
df = pd.read_csv(filepath_or_buffer=csv_train)


In [None]:
# Number of rows per digit label.
df["label"].value_counts()

In [None]:
# src/create_folds.py
#creating stratified kfold by target

from sklearn import model_selection

if __name__ == "__main__":
    # Load the raw training data.
    df = pd.read_csv(csv_train)

    # Placeholder fold id; every row is overwritten by the loop below.
    df["kfold"] = -1

    # Shuffle the rows. The fixed seed makes the fold assignment reproducible
    # across kernel restarts (an unseeded shuffle yields different folds each run).
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Stratification target: the digit label.
    y = df.label.values

    # Stratified 5-fold splitter from scikit-learn's model_selection module.
    kf = model_selection.StratifiedKFold(n_splits=5)

    # Tag each row with the index of the split in which it is a validation row.
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, "kfold"] = f

    # Persist the data together with its new kfold column.
    df.to_csv(csv_output, index=False)



In [None]:
# src/train.py, version from p. 76
# other versions were shown:
# p. 79 has the config.py file implemented
# p. 80 has an argument parser to run folds from the command line
# hardcoded here: input file, target, fold numbers (5)

import joblib
import pandas as pd
from sklearn import metrics
from sklearn import tree

def run(fold):
    """Train a decision tree on every fold except `fold` and validate on `fold`.

    Reads "./mnist_train_folds.csv", prints the validation accuracy and saves
    the fitted classifier to "dt_{fold}.bin" in the working directory.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
    """
    # read the training data with folds
    df = pd.read_csv("./mnist_train_folds.csv")

    # training data: every row whose kfold differs from the held-out fold
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # validation data: the rows of the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Features are every column except the target AND the fold id.
    # `kfold` is split bookkeeping; leaving it in would hand the model a
    # column whose validation value never occurs in the training rows.
    non_feature_cols = ["label", "kfold"]
    x_train = df_train.drop(non_feature_cols, axis=1).values
    y_train = df_train.label.values

    x_valid = df_valid.drop(non_feature_cols, axis=1).values
    y_valid = df_valid.label.values

    # simple decision tree classifier with default settings
    clf = tree.DecisionTreeClassifier()
    clf.fit(x_train, y_train)

    # predict the held-out fold and report accuracy
    preds = clf.predict(x_valid)
    accuracy = metrics.accuracy_score(y_valid, preds)
    print(f"Fold={fold}, Accuracy={accuracy}")

    # save the fitted model next to the notebook
    joblib.dump(clf, f"dt_{fold}.bin")

if __name__ == "__main__":
    # Train and validate once for each of the five folds, in ascending order.
    for fold_ in range(5):
        run(fold=fold_)

In [None]:
# src/model_dispatcher.py
from sklearn import tree

# Registry of candidate classifiers, keyed by the name used to select them.
models = dict(
    decision_tree_gini=tree.DecisionTreeClassifier(criterion="gini"),
    decision_tree_entropy=tree.DecisionTreeClassifier(criterion="entropy"),
)

In [None]:
# train.py, version from p. 82 --> implementing model_dispatcher
# adapted for use in a Jupyter notebook

import argparse
import os
import joblib
import pandas as pd

from sklearn import metrics

# import config
# import model_dispatcher

def run(fold, model):
    """Train the named model on every fold except `fold` and validate on `fold`.

    Reads the folds file named by the notebook-level `csv_output`, prints the
    validation accuracy and saves the fitted model to
    "c5_dt_{fold}_{model}.bin" in the working directory.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
        model: key into the notebook-level `models` dictionary.

    Raises:
        KeyError: if `model` is not a key of `models`.
    """
    # read the training data with folds
    df = pd.read_csv(csv_output)

    # training data: every row whose kfold differs from the held-out fold
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # validation data: the rows of the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Features are every column except the target AND the fold id.
    # `kfold` is split bookkeeping; leaving it in would hand the model a
    # column whose validation value never occurs in the training rows.
    non_feature_cols = ["label", "kfold"]
    x_train = df_train.drop(non_feature_cols, axis=1).values
    y_train = df_train.label.values

    x_valid = df_valid.drop(non_feature_cols, axis=1).values
    y_valid = df_valid.label.values

    # fetch the model from the dispatcher dictionary and fit it
    clf = models[model]
    clf.fit(x_train, y_train)

    # predict the held-out fold and report accuracy
    preds = clf.predict(x_valid)
    accuracy = metrics.accuracy_score(y_valid, preds)
    print(f"Fold={fold}, Accuracy={accuracy}")

    # save the fitted model next to the notebook
    joblib.dump(clf, f"c5_dt_{fold}_{model}.bin")

if __name__ == "__main__":
    # Choices are hard-coded for the notebook commit; the script versions took
    # them from input() or from argparse options --fold / --model.
    fold = 1
    model_c = '2'

    # Map the short menu code to a key of the `models` dictionary.
    model_choices = {
        '1': 'decision_tree_gini',
        '2': 'decision_tree_entropy',
    }
    # Fail with a clear message instead of a NameError on an unbound `model`
    # when the code is not one of the known options.
    if model_c not in model_choices:
        raise ValueError(
            f"Unknown model code {model_c!r}; expected one of {sorted(model_choices)}"
        )
    model = model_choices[model_c]

    print(f'\nChosen parameters: \nFold:{fold}\nModel: {model}')
    run(
        fold=fold,
        model=model
    )



### Chapter 6 - categorical variables

Using the cat-in-the-dat data from Kaggle's Categorical Features Encoding Challenge.
The 2nd challenge (cat-in-the-dat-ii) is used.

In [None]:
# Load the cat-in-the-dat-ii training data, using the `id` column as the index.
df = pd.read_csv(
    filepath_or_buffer='../input/cat-in-the-dat-ii/train.csv',
    index_col='id',
)

In [None]:
# this dataset has nominal, ordinal, cyclical and binary categorical variables

df

In [None]:
# check target skewness: bar chart of the rows per target value
df["target"].value_counts().plot.bar()

In [None]:
# start EDA: summary of the frame's columns
df.info()

In [None]:
# print the count of each category, one column at a time
for column_name in df.columns:
    column_counts = df[column_name].value_counts()
    print(column_counts)

In [None]:
# Are there NaN values in the columns? Count the missing entries per column.
isnan = df.isnull().sum()
isnan

In [None]:
# Label encoding by hand: replace each ord_2 category with an integer code,
# the same idea as sklearn.preprocessing.LabelEncoder.
#
# ord_2 value counts at the time of writing:
# Freezing       142726
# Warm           124239
# Cold            97822
# Boiling Hot     84790
# Hot             67508
# Lava Hot        64840
#
# The codes below follow that frequency listing (0 = most frequent), not the
# temperature order of the categories.
ord2_mapping = {
    category: code
    for code, category in enumerate(
        ["Freezing", "Warm", "Cold", "Boiling Hot", "Hot", "Lava Hot"]
    )
}

# convert ord_2 according to the dictionary; values without an entry become NaN
df["ord_2"] = df["ord_2"].map(ord2_mapping)

In [None]:
# rows per ord_2 code after the manual mapping
df["ord_2"].value_counts()

In [None]:
# using LabelEncoder()

import pandas as pd
from sklearn import preprocessing

# start again from the raw training data
df = pd.read_csv("../input/cat-in-the-dat-ii/train.csv")

# replace missing ord_2 values with an explicit "NONE" category
df.loc[:, "ord_2"] = df["ord_2"].fillna("NONE")

# one encoder for the ord_2 column
lbl_enc = preprocessing.LabelEncoder()

# Learn the categories first, then transform them.
# LabelEncoder numbers the categories in sorted (alphabetical) order.
lbl_enc.fit(df["ord_2"].values)
df.loc[:, "ord_2"] = lbl_enc.transform(df["ord_2"].values)


In [None]:
# rows per ord_2 code after LabelEncoder
df["ord_2"].value_counts()

In [None]:
# using sparse format on an ordinal binarized feature
# using scipy's sparse method

import numpy as np
from scipy import sparse
# small binarized feature matrix used for the comparison
example = np.array([[0, 0, 1], [1, 0, 0], [1, 0, 1]])

# the same values stored as a sparse CSR matrix
sparse_example = sparse.csr_matrix(example)

# bytes taken by the dense array versus the CSR `data` array alone
print(
    f"\n\n Normal array: {example.nbytes}\n",
    f"Sparse data: {sparse_example.data.nbytes}\n\n",
)

# the real footprint of the CSR matrix also includes its two index arrays
print(
    "sparse data total size: ",
    sum(
        part.nbytes
        for part in (
            sparse_example.data,
            sparse_example.indptr,
            sparse_example.indices,
        )
    ),
)

In [None]:
# sparse format makes a bigger difference when there is a lot of data
# this cell must run alone or it will exhaust the available memory

import numpy as np
from scipy import sparse

# matrix dimensions: rows x columns
n_rows, n_cols = 10000, 100000

# dense random binary matrix; each entry is 1 with probability 0.05
example = np.random.binomial(n=1, p=0.05, size=(n_rows, n_cols))

# bytes taken by the dense array
print(f"Size of dense array: {example.nbytes}")

# the same values stored as a sparse CSR matrix
sparse_example = sparse.csr_matrix(example)

# bytes taken by the CSR `data` array alone
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# full CSR footprint: data plus the two index arrays
full_size = sum(
    part.nbytes
    for part in (
        sparse_example.data,
        sparse_example.indptr,
        sparse_example.indices,
    )
)
print(f"Full size of sparse array: {full_size}")


In [None]:
# research other ways of representing a sparse matrix



In [None]:
# One Hot Encoding may take even less memory

import numpy as np
from scipy import sparse

# small binary matrix used for the comparison
example = np.array(
    [
        [0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0],
    ]
)

# bytes taken by the dense array
print(f"Size of dense array: {example.nbytes}")

# the same values stored as a sparse CSR matrix
sparse_example = sparse.csr_matrix(example)

# bytes taken by the CSR `data` array alone
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# full CSR footprint: data plus the two index arrays
full_size = sum(
    part.nbytes
    for part in (
        sparse_example.data,
        sparse_example.indptr,
        sparse_example.indices,
    )
)
print(f"Full size of sparse array: {full_size}")

In [None]:
# show the stored entries of (up to) the first ten rows
print(sparse_example[0:10])

In [None]:
# testing OHE with a feature array having up to 1000 categories

import numpy as np
from sklearn import preprocessing

# create a random 1-d integer array; randint(1000) draws from 0..999,
# i.e. at most 1000 distinct categories
example = np.random.randint(1000, size=1000000)

# Dense one-hot encoder. scikit-learn 1.2 renamed the `sparse` keyword to
# `sparse_output` and later removed the old name, so try the new keyword
# first and fall back to the old one on releases that predate it.
try:
    ohe1 = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    ohe1 = preprocessing.OneHotEncoder(sparse=False)

# fit and transform data with the dense one-hot encoder
ohe1_example = ohe1.fit_transform(example.reshape(-1, 1))

# print size in bytes for the dense array
print(f"Size of dense array: {ohe1_example.nbytes}")

# Sparse one-hot encoder: sparse output is the default, so no keyword is
# needed and the call works on every scikit-learn version.
ohe2 = preprocessing.OneHotEncoder()

# fit and transform data with the sparse one-hot encoder
ohe2_example = ohe2.fit_transform(example.reshape(-1, 1))

# print size of the sparse `data` array alone
print(f"Size of sparse array: {ohe2_example.data.nbytes}")

# full sparse footprint: data plus the two index arrays
full_size = (
    ohe2_example.data.nbytes +
    ohe2_example.indptr.nbytes + ohe2_example.indices.nbytes
    )
print(f"Full size of sparse array: {full_size}")

In [None]:
# inspect the one-hot encoded data in its sparse format (first ten rows)
print(ohe2_example[:10])

#### The 3 methods to handle categorical variables

 - map them all according to a mapping dictionary, applied with map()
 - convert to decimal and then binary-encode
 - convert with one-hot encoding


In [None]:
# display the ord_2 column
df["ord_2"]

In [None]:
# How to look at the rows of one specific category?
# NOTE(review): if ord_2 still holds the integer codes written by the
# LabelEncoder cell, this string comparison matches no rows - confirm which
# `df` is live when this cell runs.
df.loc[df["ord_2"] == "Boiling Hot"]

In [None]:
# number of non-null ids per ord_2 category
df.groupby("ord_2")["id"].count()

In [None]:
# replace each row's category with that category's count, using transform
df.groupby("ord_2")["id"].transform("count")

In [None]:
# count the rows for every (ord_1, ord_2) combination
# open question: what if two combinations share the same count?
df.groupby(["ord_1", "ord_2"])["id"].count().rename("count").reset_index()

In [None]:
# Build a combined feature "<ord_1>_<ord_2>" from the string form of both
# columns; missing values are joined in as well (as their string form).
df["new_feature"] = df["ord_1"].astype(str).str.cat(
    df["ord_2"].astype(str), sep="_"
)

df["new_feature"]

In [None]:
# missing values per column
df.isnull().sum()

In [None]:
# ord_3 category counts, with missing values shown as their own "NONE" category
df["ord_3"].fillna('NONE').value_counts()

In [None]:
# This trick concatenates train and test so that the encoders also see
# feature categories found in the test data but not in the train data.
# fit_transform is then applied to the combined data.
# This trick won't work in live production,
# only when a test set is available up front.

import pandas as pd
from sklearn import preprocessing

# read training data
train = pd.read_csv("../input/cat-in-the-dat-ii/train.csv")

# read test data
test = pd.read_csv("../input/cat-in-the-dat-ii/test.csv")

# the test file has no target column; add a fake one (-1) so the two frames
# line up and the test rows can be recognised again after encoding
test["target"] = -1

# concatenate both training and test data
data = pd.concat([train, test]).reset_index(drop=True)

# features to encode: every column except id and target
features = [x for x in train.columns if x not in ["id", "target"]]

for feat in features:
    # a fresh LabelEncoder for each feature
    lbl_enc = preprocessing.LabelEncoder()

    # The data is categorical, so fill missing values with a string and cast
    # everything to str; int/float categories are then treated like any other.
    temp_col = data[feat].fillna("NONE").astype(str).values

    # fit_transform is fine here because no further unseen data has to be
    # transformed separately. Assign the column directly (not via
    # .loc[:, feat]) so it is replaced by the integer codes instead of the
    # codes being written into the existing float/object column.
    data[feat] = lbl_enc.fit_transform(temp_col)

# split the training and test data again using the fake target marker
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [None]:
# display the training split
train

In [None]:
# Example of a RARE bucket: missing values become "NONE", then every ord_4
# category that occurs fewer than 2000 times is relabelled "RARE".
df["ord_4"] = df["ord_4"].fillna("NONE")
ord_4_is_rare = df["ord_4"].map(df["ord_4"].value_counts()).values < 2000
df.loc[ord_4_is_rare, "ord_4"] = "RARE"

In [None]:
# ord_4 category counts after the RARE relabelling
df["ord_4"].value_counts()

In [None]:
# create_folds.py
# import pandas and model_selection module of scikit-learn
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # read training data
    df = pd.read_csv("../input/cat-in-the-dat-ii/train.csv")

    # placeholder fold id; every row is overwritten by the loop below
    df["kfold"] = -1

    # Shuffle the rows. The fixed seed makes the fold assignment reproducible
    # across kernel restarts (an unseeded shuffle yields different folds each run).
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # stratification target
    y = df.target.values

    # stratified 5-fold splitter from the model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # tag each row with the index of the split in which it is a validation row
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f

    # save the new csv with the kfold column
    df.to_csv("cat_train_folds.csv", index=False)

In [None]:
# reload the saved folds file and count the rows assigned to each fold
df = pd.read_csv(filepath_or_buffer="./cat_train_folds.csv")
df["kfold"].value_counts()

In [None]:
# Check that the target is evenly distributed in every fold.
# Iterate over the fold ids actually present in the data so no fold is
# missed (a fixed range(0, 4) skipped the fifth fold).
for i in sorted(df.kfold.unique()):
    print(df[df.kfold == i].target.value_counts())

In [None]:
# ohe_logres.py
# this model will hot encode data 
# and use logistic regression function 
# that splits data into training and validation, 
# given a fold number, 
# handles NaN values, 
# applies one-hot encoding on all the data 
# and trains a simple Logistic Regression model

import pandas as pd

from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """One-hot encode all features, fit logistic regression and print the AUC.

    Reads "./cat_train_folds.csv", trains on every fold except `fold` and
    prints the ROC AUC obtained on the held-out fold.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("./cat_train_folds.csv")

    # all columns are features except id, target and kfold
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    # Treat every feature as a string category. Missing values are filled
    # BEFORE the cast: casting first turns NaN into the text "nan", after
    # which fillna has nothing left to fill. Plain column assignment lets the
    # column take the new string values regardless of its original dtype.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # training rows: every fold except the held-out one
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # validation rows: the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # fit the encoder on training + validation features so that every
    # category present in either part gets a column
    ohe = preprocessing.OneHotEncoder()
    full_data = pd.concat(
        [df_train[features], df_valid[features]],
        axis=0
    )
    ohe.fit(full_data[features])

    # transform both parts with the fitted encoder
    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_valid[features])

    # fit logistic regression on the one-hot encoded training data
    model = linear_model.LogisticRegression()
    model.fit(x_train, df_train.target.values)

    # AUC needs scores, so use the predicted probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")
    

In [None]:
if __name__ == "__main__":
    # evaluate each of the five folds in turn
    for fold_ in range(0, 5):
        run(fold=fold_)


In [None]:
# lbl_rf.py
# this code applies random forest, by using Label Encoder(), not OHE

import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """Label-encode all features, fit a random forest and print the AUC.

    Reads "./cat_train_folds.csv", trains on every fold except `fold` and
    prints the ROC AUC obtained on the held-out fold.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("./cat_train_folds.csv")

    # all columns are features except id, target and kfold
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    ]

    # Treat every feature as a string category. Missing values are filled
    # BEFORE the cast: casting first turns NaN into the text "nan", after
    # which fillna has nothing left to fill.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # label encode each feature with its own encoder, fitted on all rows;
    # plain column assignment stores the codes as an integer column
    for col in features:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(df[col])
        df[col] = lbl.transform(df[col])

    # training rows: every fold except the held-out one
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # validation rows: the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    x_train = df_train[features].values
    x_valid = df_valid[features].values

    # fit a random forest on the label-encoded training data
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    model.fit(x_train, df_train.target.values)

    # AUC needs scores, so use the predicted probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")

In [None]:
if __name__ == "__main__":
    # evaluate each of the five folds in turn
    for fold_ in range(0, 5):
        run(fold=fold_)

In [None]:
# ohe_svd_rf.py
# random forest model using OHE and Singular Value Decomposition (SVD)
# to reduce the OHE matrices
# inference with a random forest is more time consuming and takes more space
# one-hot encode the FULL DATA
# fit sklearn's TruncatedSVD on the sparse matrix, on train + valid data
# --> reduces the high-dimensional sparse matrix to 120 features before fitting the random forest
import pandas as pd
from scipy import sparse
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """One-hot encode, reduce with TruncatedSVD, fit a random forest, print AUC.

    Reads "./cat_train_folds.csv", trains on every fold except `fold` and
    prints the ROC AUC obtained on the held-out fold.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("./cat_train_folds.csv")

    # all columns are features except id, target and kfold
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    ]

    # Treat every feature as a string category. Missing values are filled
    # BEFORE the cast: casting first turns NaN into the text "nan", after
    # which fillna has nothing left to fill.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # training rows: every fold except the held-out one
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # validation rows: the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # fit the encoder on training + validation features so that every
    # category present in either part gets a column
    ohe = preprocessing.OneHotEncoder()
    full_data = pd.concat(
        [df_train[features], df_valid[features]],
        axis=0
        )
    ohe.fit(full_data[features])

    # transform both parts with the fitted encoder
    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_valid[features])

    # reduce the sparse one-hot matrix to 120 components; the SVD is fitted
    # on the stacked training + validation rows
    svd = decomposition.TruncatedSVD(n_components=120)
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)

    x_train = svd.transform(x_train)
    x_valid = svd.transform(x_valid)

    # fit a random forest on the reduced training data
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    model.fit(x_train, df_train.target.values)

    # AUC needs scores, so use the predicted probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")


In [None]:
# this process is very time consuming, so it is turned off

# if __name__ == "__main__":
#     for fold_ in range(5):
#         run(fold_)

In [None]:
# lbl_xgb.py
# implementing XGBoost (Gradient Boost)tre based model using LabelEncoder
import pandas as pd

import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """Label-encode all features, fit an XGBoost classifier and print the AUC.

    Reads "./cat_train_folds.csv", trains on every fold except `fold` and
    prints the ROC AUC obtained on the held-out fold.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("./cat_train_folds.csv")

    # all columns are features except id, target and kfold
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    ]

    # Treat every feature as a string category. Missing values are filled
    # BEFORE the cast: casting first turns NaN into the text "nan", after
    # which fillna has nothing left to fill.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # label encode each feature with its own encoder, fitted on all rows;
    # plain column assignment stores the codes as an integer column
    for col in features:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(df[col])
        df[col] = lbl.transform(df[col])

    # training rows: every fold except the held-out one
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # validation rows: the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    x_train = df_train[features].values
    x_valid = df_valid[features].values

    # gradient-boosted trees with a fixed depth and number of estimators
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=200
        )
    model.fit(x_train, df_train.target.values)

    # AUC needs scores, so use the predicted probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")

In [None]:
# this run is also turned off because it is very time consuming

# if __name__ == "__main__":
#     for fold_ in range(5):
#         run(fold_)

In [None]:
#lets work another good categorical rich dataset
#us data census from UCI

# df = pd.read_csv('./adult.csv')

In [None]:
# exercise: make the cross validataion

# src/create_folds.py
# creating stratified kfold by target

from sklearn import model_selection

if __name__ == "__main__":
    # load the adult census data
    df = pd.read_csv('../input/adult-census-income/adult.csv')

    # placeholder fold id; every row is overwritten by the loop below
    df['kfold'] = -1

    # Shuffle the rows. The fixed seed makes the fold assignment reproducible
    # across kernel restarts (an unseeded shuffle yields different folds each run).
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # stratification target: the income column
    y = df.income.values

    # stratified 5-fold splitter from the model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # tag each row with the index of the split in which it is a validation row
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f

    # save the new csv with the kfold column
    df.to_csv('us_census_folds.csv', index=False)

In [None]:
# ohe_logres.py
import pandas as pd

from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """One-hot encode the categorical census features, fit logistic regression, print AUC.

    Reads "./us_census_folds.csv", drops the numerical columns, trains on
    every fold except `fold` and prints the ROC AUC on the held-out fold.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("./us_census_folds.csv")

    # numerical columns, not used by this model
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
        ]
    df = df.drop(num_cols, axis=1)

    # Map the income labels to 0/1. Plain column assignment replaces the text
    # column with the mapped numbers; writing through .loc[:, "income"] can
    # leave an object-dtype column, which scikit-learn rejects as a target.
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df["income"] = df.income.map(target_mapping)

    # all columns are features except income and kfold
    features = [
        f for f in df.columns if f not in ("kfold", "income")
        ]

    # Treat every feature as a string category. Missing values are filled
    # BEFORE the cast: casting first turns NaN into the text "nan", after
    # which fillna has nothing left to fill.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # training rows: every fold except the held-out one
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # validation rows: the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # fit the encoder on training + validation features so that every
    # category present in either part gets a column
    ohe = preprocessing.OneHotEncoder()
    full_data = pd.concat(
        [df_train[features], df_valid[features]],
        axis=0
        )
    ohe.fit(full_data[features])

    # transform both parts with the fitted encoder
    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_valid[features])

    # fit logistic regression on the one-hot encoded training data
    model = linear_model.LogisticRegression()
    model.fit(x_train, df_train.income.values)

    # AUC needs scores, so use the predicted probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")


In [None]:
if __name__ == "__main__":
    # evaluate each of the five folds in turn
    for fold_ in range(0, 5):
        run(fold=fold_)

In [None]:
# trying the label encoded XGBoost without hyperparameter tuning

# lbl_xgb.py
import pandas as pd
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """Label-encode the categorical census features, fit default XGBoost, print AUC.

    Reads "./us_census_folds.csv", drops the numerical columns, trains on
    every fold except `fold` and prints the ROC AUC on the held-out fold.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("./us_census_folds.csv")

    # numerical columns, not used by this model
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
        ]
    df = df.drop(num_cols, axis=1)

    # Map the income labels to 0/1. Plain column assignment replaces the text
    # column with the mapped numbers instead of writing them into the
    # existing object column.
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
        }
    df["income"] = df.income.map(target_mapping)

    # all columns are features except kfold and income
    features = [
        f for f in df.columns if f not in ("kfold", "income")
        ]

    # Treat every feature as a string category. Missing values are filled
    # BEFORE the cast: casting first turns NaN into the text "nan", after
    # which fillna has nothing left to fill.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # label encode each feature with its own encoder, fitted on all rows;
    # plain column assignment stores the codes as an integer column
    for col in features:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(df[col])
        df[col] = lbl.transform(df[col])

    # training rows: every fold except the held-out one
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # validation rows: the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    x_train = df_train[features].values
    x_valid = df_valid[features].values

    # XGBoost with default hyperparameters
    model = xgb.XGBClassifier(
        n_jobs=-1
        )
    model.fit(x_train, df_train.income.values)

    # AUC needs scores, so use the predicted probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")

In [None]:
if __name__ == "__main__":
    # evaluate each of the five folds in turn
    for fold_ in range(0, 5):
        run(fold=fold_)

In [None]:
#check if last model improves when max_depth =7 and n_estimators =200
def run(fold):
    """Label-encode the categorical census features, fit a tuned XGBoost, print AUC.

    Same pipeline as the default-parameter census XGBoost run, but with
    n_estimators=200 and max_depth=7. Reads "./us_census_folds.csv", drops
    the numerical columns, trains on every fold except `fold` and prints the
    ROC AUC on the held-out fold.

    Args:
        fold: value of the `kfold` column to hold out as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("./us_census_folds.csv")

    # numerical columns, not used by this model
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
        ]
    df = df.drop(num_cols, axis=1)

    # Map the income labels to 0/1. Plain column assignment replaces the text
    # column with the mapped numbers instead of writing them into the
    # existing object column.
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
        }
    df["income"] = df.income.map(target_mapping)

    # all columns are features except kfold and income
    features = [
        f for f in df.columns if f not in ("kfold", "income")
        ]

    # Treat every feature as a string category. Missing values are filled
    # BEFORE the cast: casting first turns NaN into the text "nan", after
    # which fillna has nothing left to fill.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # label encode each feature with its own encoder, fitted on all rows;
    # plain column assignment stores the codes as an integer column
    for col in features:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(df[col])
        df[col] = lbl.transform(df[col])

    # training rows: every fold except the held-out one
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # validation rows: the held-out fold
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    x_train = df_train[features].values
    x_valid = df_valid[features].values

    # XGBoost with more and deeper trees than the defaults
    model = xgb.XGBClassifier(
        n_jobs=-1,
        n_estimators=200,
        max_depth=7
        )
    model.fit(x_train, df_train.income.values)

    # AUC needs scores, so use the predicted probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")

In [None]:
if __name__ == "__main__":
    # evaluate each of the five folds in turn
    for fold_ in range(0, 5):
        run(fold=fold_)

In [None]:
# now , try XGBoost including numerical features 
# without parameter tuning p127

# lbl_xgb_num.py
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """
    Train and validate XGBoost on one fold of the US census data.

    Categorical columns are label encoded; the numerical columns are passed
    to the model unchanged. The fold number and validation AUC are printed.

    :param fold: value of the kfold column to hold out as validation data
    :return: None
    """
    # load the full training data with folds
    df = pd.read_csv("./us_census_folds.csv")
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    # plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values; df.loc[:, col] = ... writes into the
    # existing object column on recent pandas and keeps the object dtype
    df["income"] = df.income.map(target_mapping)
    # all columns are features except kfold & income columns
    features = [
        f for f in df.columns if f not in ("kfold", "income")
    ]
    for col in features:
        # numerical columns are used as they are
        if col in num_cols:
            continue
        # fill NaN values with NONE *before* converting to strings:
        # astype(str) turns NaN into the string "nan", after which
        # fillna has nothing left to fill
        df[col] = df[col].fillna("NONE").astype(str)
        # label encode the column, fitting the encoder on all data
        lbl = preprocessing.LabelEncoder()
        df[col] = lbl.fit_transform(df[col])
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    # feature matrices for training and validation
    x_train = df_train[features].values
    x_valid = df_valid[features].values
    # initialize xgboost model
    model = xgb.XGBClassifier(
        n_jobs=-1
    )
    # fit model on training data (label encoded + numerical)
    model.fit(x_train, df_train.income.values)
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

In [None]:
if __name__ == "__main__":
    # run training and validation once per fold (0 through 4)
    for fold_ in range(5):
        run(fold_)


In [None]:
#label encoded categ features + numerical features + feat engineering
# + XGBoost
# lbl_xgb_num_feat.py p130

import itertools
import pandas as pd
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def feature_engineering(df, cat_cols):
    """
    Add one combined feature for every pair of categorical columns.

    The dataframe is modified in place: for each pair (c1, c2) a new column
    named "c1_c2" is added, holding the two values joined by an underscore.

    :param df: the pandas dataframe with train/test data
    :param cat_cols: list of categorical columns
    :return: the same dataframe, with the new pair columns added
    """
    # itertools.combinations yields every unordered pair exactly once,
    # e.g. combinations([1, 2, 3], 2) -> (1, 2), (1, 3), (2, 3)
    for left, right in itertools.combinations(cat_cols, 2):
        left_text = df[left].astype(str)
        right_text = df[right].astype(str)
        df.loc[:, "_".join((left, right))] = left_text + "_" + right_text
    return df

def run(fold):
    """
    Train and validate XGBoost on one fold of the US census data, using
    label-encoded categorical columns, pairwise combinations of those
    columns, and the raw numerical columns.

    The fold number and validation AUC are printed.

    :param fold: value of the kfold column to hold out as validation data
    :return: None
    """
    # load the full training data with folds
    df = pd.read_csv("./us_census_folds.csv")
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    # plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values; df.loc[:, col] = ... writes into the
    # existing object column on recent pandas and keeps the object dtype
    df["income"] = df.income.map(target_mapping)
    # list of categorical columns for feature engineering
    cat_cols = [
        c for c in df.columns
        if c not in num_cols and c not in ("kfold", "income")
    ]
    # add new features
    df = feature_engineering(df, cat_cols)
    # all columns are features except kfold & income columns
    features = [
        f for f in df.columns if f not in ("kfold", "income")
    ]
    for col in features:
        # numerical columns are used as they are
        if col in num_cols:
            continue
        # fill NaN values with NONE *before* converting to strings:
        # astype(str) turns NaN into the string "nan", after which
        # fillna has nothing left to fill
        df[col] = df[col].fillna("NONE").astype(str)
        # label encode the column, fitting the encoder on all data
        lbl = preprocessing.LabelEncoder()
        df[col] = lbl.fit_transform(df[col])
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    # feature matrices for training and validation
    x_train = df_train[features].values
    x_valid = df_valid[features].values
    # initialize xgboost model
    model = xgb.XGBClassifier(
        n_jobs=-1
    )
    # fit model on training data (label encoded + numerical)
    model.fit(x_train, df_train.income.values)
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")


In [None]:
if __name__ == "__main__":
    # run training and validation once per fold (0 through 4)
    for fold_ in range(5):
        run(fold_)

In [None]:
# run the same last model 
# but tune hyperparam max_depth to 7

import itertools
import pandas as pd
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def feature_engineering(df, cat_cols):
    """
    Create pairwise interaction columns from the categorical columns.

    For every 2-combination (first, second) of cat_cols, a column named
    "first_second" is written into df with the string values of both
    columns joined by "_". The dataframe is changed in place.

    :param df: the pandas dataframe with train/test data
    :param cat_cols: list of categorical columns
    :return: df itself, including the new columns
    """
    # each unordered pair of columns appears once,
    # e.g. [1, 2, 3] -> (1, 2), (1, 3), (2, 3)
    column_pairs = itertools.combinations(cat_cols, 2)
    for first, second in column_pairs:
        combined = df[first].astype(str) + "_" + df[second].astype(str)
        df.loc[:, first + "_" + second] = combined
    return df

def run(fold):
    """
    Train and validate XGBoost (max_depth=7) on one fold of the US census
    data, using label-encoded categorical columns, pairwise combinations of
    those columns, and the raw numerical columns.

    The fold number and validation AUC are printed.

    :param fold: value of the kfold column to hold out as validation data
    :return: None
    """
    # load the full training data with folds
    df = pd.read_csv("./us_census_folds.csv")
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    # plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values; df.loc[:, col] = ... writes into the
    # existing object column on recent pandas and keeps the object dtype
    df["income"] = df.income.map(target_mapping)
    # list of categorical columns for feature engineering
    cat_cols = [
        c for c in df.columns
        if c not in num_cols and c not in ("kfold", "income")
    ]
    # add new features
    df = feature_engineering(df, cat_cols)
    # all columns are features except kfold & income columns
    features = [
        f for f in df.columns if f not in ("kfold", "income")
    ]
    for col in features:
        # numerical columns are used as they are
        if col in num_cols:
            continue
        # fill NaN values with NONE *before* converting to strings:
        # astype(str) turns NaN into the string "nan", after which
        # fillna has nothing left to fill
        df[col] = df[col].fillna("NONE").astype(str)
        # label encode the column, fitting the encoder on all data
        lbl = preprocessing.LabelEncoder()
        df[col] = lbl.fit_transform(df[col])
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    # feature matrices for training and validation
    x_train = df_train[features].values
    x_valid = df_valid[features].values
    # initialize xgboost model with deeper trees than the default
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7
    )
    # fit model on training data (label encoded + numerical)
    model.fit(x_train, df_train.income.values)
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

In [None]:
if __name__ == "__main__":
    # run training and validation once per fold (0 through 4)
    for fold_ in range(5):
        run(fold_)

In [None]:
# you can also encode RARE values,
# binary features,
# combine OHE and label encoding,
# and several other methods

In [None]:
# another option is to use target encoding

# target_encoding.py
import copy
import pandas as pd

from sklearn import metrics
from sklearn import preprocessing
import xgboost as xgb

def mean_target_encoding(data):
    """
    Add out-of-fold mean target encodings for the categorical columns.

    The categorical columns are label encoded first. Then, for each of the
    5 folds, the mean of income per category is computed on the other folds
    and mapped onto the held-out fold as a new "<column>_enc" column.
    Categories that do not occur in the other folds get NaN.

    :param data: dataframe with the census columns plus income and kfold
    :return: a new dataframe (the input is not modified) made of the
        5 encoded validation folds concatenated in fold order
    """
    # work on a copy so the caller's dataframe is left untouched
    df = data.copy(deep=True)
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    # plain column assignment replaces the column, so income becomes numeric
    # (needed for the per-category mean below); df.loc[:, col] = ... writes
    # into the existing object column on recent pandas and keeps object dtype
    df["income"] = df.income.map(target_mapping)
    # only the categorical columns are label encoded and target encoded;
    # numerical columns stay in the dataframe unchanged. A second assignment
    # that re-included the numerical columns used to override this list,
    # which produced mean encodings for numerical columns as well.
    features = [
        f for f in df.columns
        if f not in ("kfold", "income") and f not in num_cols
    ]
    for col in features:
        # fill NaN values with NONE *before* converting to strings:
        # astype(str) turns NaN into the string "nan", after which
        # fillna has nothing left to fill
        df[col] = df[col].fillna("NONE").astype(str)
        # label encode the column, fitting the encoder on all data
        lbl = preprocessing.LabelEncoder()
        df[col] = lbl.fit_transform(df[col])
    # a list to store 5 validation dataframes
    encoded_dfs = []
    # go over all folds
    for fold in range(5):
        # fetch training and validation data
        df_train = df[df.kfold != fold].reset_index(drop=True)
        df_valid = df[df.kfold == fold].reset_index(drop=True)
        # for all feature columns, i.e. categorical columns
        for column in features:
            # create dict of category:mean target from the training folds only
            mapping_dict = dict(
                df_train.groupby(column)["income"].mean()
            )
            # column_enc is the new column we have with mean encoding
            df_valid[column + "_enc"] = df_valid[column].map(mapping_dict)
        # append to our list of encoded validation dataframes
        encoded_dfs.append(df_valid)
    # create full data frame again and return
    encoded_df = pd.concat(encoded_dfs, axis=0)
    return encoded_df

def run(df, fold):
    """
    Fit XGBoost on every fold except one and print the AUC on that fold.

    :param df: dataframe containing the feature columns plus the
        income (target) and kfold columns
    :param fold: value of the kfold column to hold out as validation data
    :return: None
    """
    # every column other than the fold marker and the target is a feature
    excluded = ("kfold", "income")
    features = [name for name in df.columns if name not in excluded]
    # note that folds are same as before: split rows by the kfold column
    train_part = df[df.kfold != fold].reset_index(drop=True)
    valid_part = df[df.kfold == fold].reset_index(drop=True)
    # feature matrices and targets as numpy arrays
    train_matrix = train_part[features].values
    valid_matrix = valid_part[features].values
    train_target = train_part.income.values
    valid_target = valid_part.income.values
    # initialize and fit the xgboost model on the training folds
    classifier = xgb.XGBClassifier(n_jobs=-1, max_depth=7)
    classifier.fit(train_matrix, train_target)
    # AUC needs scores, so take the predicted probability of class 1
    class_probabilities = classifier.predict_proba(valid_matrix)
    valid_preds = class_probabilities[:, 1]
    auc = metrics.roc_auc_score(valid_target, valid_preds)
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")


In [None]:
if __name__ == "__main__":
    # read the census data with its fold assignments
    df = pd.read_csv("./us_census_folds.csv")

    # replace df with the frame returned by mean_target_encoding
    # (mean target encoded categories added)
    df = mean_target_encoding(df)
    # run training and validation once per fold (0 through 4)
    for fold_ in range(5):
        run(df, fold_)

In [None]:

# # entity_embeddings.py p138
# using on the cat train dataset

import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers

from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

def create_model(data, catcols):
    """
    Build and compile a tf.keras entity-embedding network.

    Each categorical column gets its own single-value input followed by an
    embedding layer; the embeddings are concatenated and passed through two
    dense blocks ending in a two-unit softmax output.

    :param data: this is a pandas dataframe
    :param catcols: list of categorical column names
    :return: compiled tf.keras model
    """
    embedding_inputs = []
    embedding_outputs = []
    for column in catcols:
        # number of distinct values in this column
        cardinality = int(data[column].nunique())
        # embedding width: half the cardinality (rounded up), capped at 50.
        # 50 is usually enough; columns with millions of unique values
        # might need a larger dimension
        width = int(min(np.ceil(cardinality / 2), 50))
        # one scalar input per categorical column
        column_input = layers.Input(shape=(1,))
        # the embedding table has one more row than there are unique values
        embedded = layers.Embedding(
            cardinality + 1, width, name=column
        )(column_input)
        # 1-d spatial dropout is the standard for embedding layers
        embedded = layers.SpatialDropout1D(0.3)(embedded)
        # flatten to a vector of the embedding width;
        # this is the output for the current feature
        embedded = layers.Reshape(target_shape=(width, ))(embedded)
        embedding_inputs.append(column_input)
        embedding_outputs.append(embedded)

    # concatenate all embeddings, then normalize.
    # numerical features, if any, would be added here or in the concatenate
    hidden = layers.Concatenate()(embedding_outputs)
    hidden = layers.BatchNormalization()(hidden)

    # two identical dense blocks: dense -> dropout -> batchnorm
    for _ in range(2):
        hidden = layers.Dense(300, activation="relu")(hidden)
        hidden = layers.Dropout(0.3)(hidden)
        hidden = layers.BatchNormalization()(hidden)

    # softmax over two units treats this as a two class problem;
    # a single sigmoid unit would be the alternative
    probabilities = layers.Dense(2, activation="softmax")(hidden)
    network = Model(inputs=embedding_inputs, outputs=probabilities)
    # compile with adam and binary cross entropy
    network.compile(loss='binary_crossentropy', optimizer='adam')
    return network

def run(fold):
    """
    Train and validate the entity-embedding network on one fold of the
    cat_train data and print the validation ROC AUC.

    :param fold: value of the kfold column to hold out as validation data
    :return: None
    """
    # load the full training data with folds
    df = pd.read_csv("./cat_train_folds.csv")
    # all columns are features except id, target and kfold columns
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    ]
    for col in features:
        # fill NaN values with NONE *before* converting to strings:
        # astype(str) turns NaN into the string "nan", after which
        # fillna has nothing left to fill
        df[col] = df[col].fillna("NONE").astype(str)
        # encode each feature with its own label encoder
        # in a live setting you need to save all label encoders
        lbl_enc = preprocessing.LabelEncoder()
        # plain column assignment gives the column the encoder's integer
        # dtype; df.loc[:, col] = ... writes into the existing object column
        # on recent pandas, and object arrays cannot be fed to the network
        df[col] = lbl_enc.fit_transform(df[col].values)
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    # create tf.keras model
    model = create_model(df, features)
    # the model has one input per feature, so pass one array per column
    xtrain = [df_train[col].values for col in features]
    xvalid = [df_valid[col].values for col in features]
    # fetch target columns
    ytrain = df_train.target.values
    yvalid = df_valid.target.values
    # convert target columns to categories
    # this is just binarization
    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)

    # fit the model
    model.fit(
        xtrain,
        ytrain_cat,
        validation_data=(xvalid, yvalid_cat),
        verbose=1,
        batch_size=1024,
        epochs=3
    )
    # generate validation predictions (probability of class 1)
    valid_preds = model.predict(xvalid)[:, 1]
    # print roc auc score
    print(metrics.roc_auc_score(yvalid, valid_preds))
    # clear session to free up some GPU memory
    K.clear_session()



In [None]:
if __name__ == "__main__":
    # run training and validation once per fold (0 through 4)
    for fold_ in range(5):
        run(fold_)

In [None]:
# reload the US census folds file and display the first rows transposed,
# so that each original column is shown as a row
df = pd.read_csv("./us_census_folds.csv")
df.head().T

In [None]:
# run TF KERAS entity embedding on us census data

def run(fold):
    """
    Train and validate the entity-embedding network on one fold of the
    US census data and print the validation ROC AUC.

    Every feature column, numerical ones included, is converted to strings
    and label encoded, so all of them are treated as categories.

    :param fold: value of the kfold column to hold out as validation data
    :return: None
    """
    # load the full training data with folds
    df = pd.read_csv("./us_census_folds.csv")
    # all columns are features except id, income and kfold columns
    features = [
        f for f in df.columns if f not in ("id", "income", "kfold")
    ]

    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    # plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values; df.loc[:, col] = ... writes into the
    # existing object column on recent pandas and keeps the object dtype
    df["income"] = df.income.map(target_mapping)
    for col in features:
        # fill NaN values with NONE *before* converting to strings:
        # astype(str) turns NaN into the string "nan", after which
        # fillna has nothing left to fill
        df[col] = df[col].fillna("NONE").astype(str)
        # encode each feature with its own label encoder
        # in a live setting you need to save all label encoders
        lbl_enc = preprocessing.LabelEncoder()
        # assign the column directly so it gets the encoder's integer dtype;
        # object arrays cannot be fed to the network
        df[col] = lbl_enc.fit_transform(df[col].values)
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    # create tf.keras model
    model = create_model(df, features)
    # the model has one input per feature, so pass one array per column
    xtrain = [df_train[col].values for col in features]
    xvalid = [df_valid[col].values for col in features]
    # fetch target columns
    ytrain = df_train.income.values
    yvalid = df_valid.income.values
    # convert target columns to categories
    # this is just binarization
    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)

    # fit the model
    model.fit(
        xtrain,
        ytrain_cat,
        validation_data=(xvalid, yvalid_cat),
        verbose=1,
        batch_size=1024,
        epochs=3
    )
    # generate validation predictions (probability of class 1)
    valid_preds = model.predict(xvalid)[:, 1]
    # print roc auc score
    print(metrics.roc_auc_score(yvalid, valid_preds))
    # clear session to free up some GPU memory
    K.clear_session()

In [None]:
if __name__ == "__main__":
    # run training and validation once per fold (0 through 4)
    for fold_ in range(5):
        run(fold_)