In [1]:
import os
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [4]:
# Locations of the training CSV and of the pickled model written at the end
# of the notebook. The defaults are the original machine-specific paths; set
# the APPLICATION_TRAIN_CSV / TREE_MODEL_OUTPUT environment variables to run
# the notebook elsewhere without editing this cell.
INPUT_FILE_PATH = os.environ.get(
    'APPLICATION_TRAIN_CSV',
    '/Users/sahanalva/Downloads/application_train.csv',
)
OUTPUT_MODEL_PATH = os.environ.get(
    'TREE_MODEL_OUTPUT',
    '/Users/sahanalva/Counterfactual Research/tree_model_test.dat',
)

In [5]:
# Read the training applications file.
app_train = pd.read_csv(INPUT_FILE_PATH)

# Constant marker columns; 'is_train' is used by a later cell to select rows.
app_train['is_test'] = 0
app_train['is_train'] = 1

# Balance the classes: keep every TARGET == 1 row and draw an equally sized
# random sample of TARGET == 0 rows. The sample is seeded so that the
# balanced set (and therefore the trained model) is reproducible across runs.
postive_df = app_train[app_train['TARGET'] == 1]
negative_df = app_train[app_train['TARGET'] == 0].sample(
    len(postive_df), random_state=42
)
app_train = pd.concat([postive_df, negative_df], axis=0)

# Shuffle the concatenated rows and renumber the index from 0.
app_train = app_train.sample(frac=1, random_state=42).reset_index(drop=True)

# target variable
Y = app_train['TARGET']
train_X = app_train.drop(['TARGET'], axis=1)
data = train_X

In [9]:
# Columns that are converted to strings and integer-encoded downstream.
# NOTE(review): CNT_CHILDREN and CNT_FAM_MEMBERS look like counts but are
# listed here, so they are treated as categories - confirm this is intended.
cats = [
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
    'CNT_CHILDREN',
    'CNT_FAM_MEMBERS',
    'REG_REGION_NOT_LIVE_REGION',
]

# Numeric columns; missing values in these are filled with 0 downstream.
# The list previously had a stray closing bracket after 'EXT_SOURCE_3', which
# made the remaining entries a syntax error; all thirteen names now belong to
# the one list.
nums = [
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_GOODS_PRICE',
    'REGION_POPULATION_RELATIVE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'DAYS_REGISTRATION',
    'DAYS_ID_PUBLISH',
]

# Bookkeeping columns that are carried along but excluded from the features.
meta = ['SK_ID_CURR', 'is_train', 'is_test']

In [10]:
# function to obtain Categorical Features
def _get_categorical_features(df):
    feats = [col for col in list(df.columns) if df[col].dtype == 'object']
    return feats

def convert_to_string(df, discrete_columns):
    """Cast each column named in ``discrete_columns`` to ``str``.

    The columns are replaced on ``df`` itself (the frame is modified in
    place) and the same frame object is returned.
    """
    for name in discrete_columns:
        as_text = df[name].astype(str)
        df[name] = as_text
    return df

# function to factorize categorical features
def _factorize_categoricals(df, cats):
    for col in cats:
        df[col], _ = pd.factorize(df[col])
    return df 

# function to create dummy variables of categorical features
def _get_dummies(df, cats):
    for col in cats:
        df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)
    return df 


# Keep only the bookkeeping, categorical and numeric columns. The selection
# is taken from train_X and copied explicitly so that (a) the column
# assignments below act on an independent frame instead of a slice of the
# loaded data (avoids SettingWithCopyWarning) and (b) re-running this cell
# always starts from the same input.
data = train_X[meta + cats + nums].copy()

# Missing numeric values become 0; missing categorical values become the
# literal string "NA" before every categorical column is cast to str.
data[nums] = data[nums].fillna(0)
data[cats] = data[cats].fillna("NA")
data = convert_to_string(data, cats)

# Replace each categorical column with integer codes.
data = _factorize_categoricals(data, cats)



In [35]:
# Bookkeeping columns that must not be fed to the model.
ignore_features = ['SK_ID_CURR', 'is_train', 'is_test']

# Every remaining column, in the frame's column order, is a model feature.
relevant_features = [
    name for name in data.columns
    if name not in ignore_features
]

# Rows flagged as training data, restricted to the feature columns.
trainX = data.loc[data['is_train'] == 1, relevant_features]

In [13]:
# Hold out 20% of the rows as a validation split (fixed seed).
# NOTE(review): the visible training cells fit on trainX / Y directly, so
# this split appears to be unused unless the commented-out DMatrix lines in
# the parameter cell are re-enabled - confirm.
x_train, x_val, y_train, y_val = train_test_split(
    trainX,
    Y,
    test_size=0.2,
    random_state=18,
)


In [26]:
# Booster configuration passed to xgb.train.
params = {
    'max_depth': 7,
    'min_child_weight': 1,
    'eval_metric': ['error', 'auc'],
    'alpha': 0.5,
    'lambda': 0.5,
    'objective': 'binary:logistic',
}

# The model is fitted on the full balanced set.
# NOTE(review): both watch-list entries point at the training matrix, so the
# logged "eval" metrics are training metrics, not held-out performance.
xgb_train = xgb.DMatrix(data=trainX, label=Y)
evallist = [(xgb_train, 'eval'), (xgb_train, 'train')]

# Alternative set-up that evaluates on the held-out split instead:
#xgb_train = xgb.DMatrix(data = x_train, label=y_train)
#xgb_eval = xgb.DMatrix(data=x_val, label=y_val)
#evallist = [(xgb_eval, 'eval'), (xgb_train, 'train')]

In [27]:
# Number of boosting rounds to run.
num_round = 100

# Train the booster; metrics for every entry in evallist are logged per round.
bst = xgb.train(
    params,
    xgb_train,
    num_boost_round=num_round,
    evals=evallist,
)

[0]	eval-error:0.33156	eval-auc:0.73120	train-error:0.33156	train-auc:0.73120
[1]	eval-error:0.32220	eval-auc:0.74142	train-error:0.32220	train-auc:0.74142
[2]	eval-error:0.31722	eval-auc:0.74824	train-error:0.31722	train-auc:0.74824
[3]	eval-error:0.31484	eval-auc:0.75260	train-error:0.31484	train-auc:0.75260
[4]	eval-error:0.31335	eval-auc:0.75607	train-error:0.31335	train-auc:0.75607
[5]	eval-error:0.31001	eval-auc:0.75941	train-error:0.31001	train-auc:0.75941
[6]	eval-error:0.30596	eval-auc:0.76389	train-error:0.30596	train-auc:0.76389
[7]	eval-error:0.30338	eval-auc:0.76682	train-error:0.30338	train-auc:0.76682
[8]	eval-error:0.30046	eval-auc:0.77124	train-error:0.30046	train-auc:0.77124
[9]	eval-error:0.29823	eval-auc:0.77400	train-error:0.29823	train-auc:0.77400
[10]	eval-error:0.29635	eval-auc:0.77685	train-error:0.29635	train-auc:0.77685
[11]	eval-error:0.29501	eval-auc:0.77881	train-error:0.29501	train-auc:0.77881
[12]	eval-error:0.29386	eval-auc:0.78023	train-error:0.29386	t

In [None]:
# Persist the trained booster. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
# Note: pickle files must only ever be loaded from trusted sources.
with open(OUTPUT_MODEL_PATH, "wb") as model_file:
    pickle.dump(bst, model_file)