## Ensure you are using the right kernel

In [None]:
# Show which Python interpreter this kernel is running on.
import sys

kernel_python = sys.executable
print(kernel_python)

#### Load the initial training and test data, do a quick analysis, and train a baseline model (the minimum accuracy to improve on)

In [None]:
## Packages and dataset location
import pandas as pd

# Directory holding the competition CSV files read by the cells below.
data_dir = "/datasets/kaggle/competitions/home-credit-default-risk/"

In [None]:
# Read the training table; the first row is the header and the first column
# becomes the index.
train_csv_path = data_dir + "/application_train.csv"
train = pd.read_csv(train_csv_path, header=0, index_col=0)
train.head()

In [None]:
# Read the test table with the same header/index layout as the training table.
test_csv_path = data_dir + "/application_test.csv"
test = pd.read_csv(test_csv_path, header=0, index_col=0)
test.head()

## A few important notes on converting categorical values to numeric
- The test dataset may not contain all the categories that are present in the training dataset.
- The mapping from category names to numeric values must be the same in the training and test datasets (e.g. if `cash loans` is mapped to `1`, it must have the value `1` in both the test and train datasets).

*Note: To achieve the above, we merge the training and test datasets and then transform the categorical data to numeric values.*

In [None]:
# Feature-only copy of the training table (label column removed) so it has
# the same columns as the test table.
train_temp = train.drop('TARGET', axis=1)

shape_report = "Train shape : {} \n Test shape : {}"
print(shape_report.format(train.shape, test.shape))

In [None]:
# Stack the training features on top of the test features (train rows first).
frames_to_stack = [train_temp, test]
merge_df = pd.concat(frames_to_stack, axis=0)
print(merge_df.shape)

In [None]:
# Find categorical columns: every column that is neither numeric nor boolean.
# `select_dtypes` is the public pandas API for dtype-based column selection
# (it avoids depending on the private `DataFrame._get_numeric_data` helper).
categorical_cols = set(merge_df.select_dtypes(exclude=['number', 'bool']).columns)

print(categorical_cols)

In [None]:
# Replace every categorical column with its integer category codes. The codes
# are computed on the merged frame, so train and test rows share one mapping.
for column_name in categorical_cols:
    as_category = merge_df[column_name].astype('category')
    merge_df[column_name] = as_category.cat.codes

print(merge_df.columns)

In [None]:
# Separate the train and test rows from the merged dataframe.
# The training rows were concatenated first, so the first len(train) rows of
# merge_df are the training set and the remainder is the test set. Deriving
# the boundary from the data keeps the split right if the input files change.
n_train_rows = len(train)
train_1 = merge_df.iloc[:n_train_rows, :]
test_1 = merge_df.iloc[n_train_rows:, :]

In [None]:
# Confirm the sizes of the two halves after splitting the merged frame.
split_shapes = (train_1.shape, test_1.shape)
print("Train shape {} \n Test shape {}".format(*split_shapes))

In [None]:
## Create features and labels
# Labels come from the original training table; the encoded frames built from
# merge_df no longer carry the TARGET column.
train_y = train['TARGET'].values

# Features: the encoded training rows, and an independent copy of the encoded
# test rows.
train_X = train_1
test_X = test_1.copy()

In [None]:
### Report the shapes of the feature and label arrays

shape_template = "Shape of training feature vector : {} \n  Shape of training target : {} \n Shape of test feature vector {}"
print(shape_template.format(train_X.shape, train_y.shape, test_X.shape))

In [None]:
## Split the training data into train and dev sets
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `sklearn.model_selection` provides the same `train_test_split`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the dev set; the fixed seed makes the split
# reproducible.
train_X_1, dev_X, train_y_1, dev_y = train_test_split(
    train_X.values, train_y, test_size=0.2, random_state=123
)

## Fit an XGBoost classifier to the data

In [None]:
# Record which XGBoost release this notebook runs against.
import xgboost as xgb

xgboost_version = xgb.__version__
print(xgboost_version)

In [None]:
import xgboost as xgb

# Hyper-parameters for the XGBoost scikit-learn wrapper.
# Only names the wrapper / booster documents are used: a misspelled or unknown
# key is not applied to the model.
ind_params = {
    # Model and objective
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    # Tree shape
    'max_leaves': 30,
    'max_depth': 5,
    'max_bin': 255,
    'min_child_weight': 4,
    # Row / column sampling
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'colsample_bylevel': 1,
    # Regularisation. The wrapper's own names are used rather than the
    # booster aliases `lambda` / `alpha`, so there is only one value for each.
    'reg_lambda': 0.001,
    'reg_alpha': 0.001,
    # Boosting schedule: a large tree budget that early stopping cuts short.
    'n_estimators': 30000,
    'learning_rate': 0.01,
    # Hardware
    'gpu_id': 1,
    'tree_method': 'gpu_hist',
}

clf = xgb.XGBClassifier(**ind_params)

# The evaluation metric and early stopping are specified once, here at fit
# time, rather than in both the constructor parameters and the fit call.
# NOTE(review): XGBoost >= 2.0 expects `eval_metric` / `early_stopping_rounds`
# on the constructor instead -- confirm against the installed version.
model = clf.fit(
    train_X_1,
    train_y_1,
    eval_set=[(train_X_1, train_y_1), (dev_X, dev_y)],
    eval_metric='auc',
    early_stopping_rounds=1000,
    verbose=True,
)

In [None]:
## Score the test set and save the predictions as a submission file
test_ids = test.index.values

print("Predicting...")
# Pass the feature matrix positionally: older XGBoost releases name this
# parameter `data`, newer ones name it `X`, so a keyword only works on one.
# Column 1 of the result is the probability of the positive class.
positive_class_proba = model.predict_proba(test_X.values)[:, 1]

prediction = pd.DataFrame()
prediction["SK_ID_CURR"] = test_ids
prediction["TARGET"] = positive_class_proba

print("writing into file...")
prediction.to_csv("home_credit_submission_xgboost_2708_1.csv", index=False)
print("Done")