In [1]:
#Import libraries:
import pandas as pd
import numpy as np
import missingno as mn
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4



In [2]:
#Normalizing function for columns
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        if feature_name!='application_key':# or feature_name!= "application_key" or feature_name!= "default_ind":
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [3]:
#Selecting specified number of columns with a certain NaN values

def get_column(df):
    feature_NA = []
    for feature_name in df.columns:
        if df[feature_name].isna().sum()>0.45*df.shape[0]:
            feature_NA.append(feature_name)
    return feature_NA

#Dropping Columns
def drop_column(df,feature_NA):
    df = df.drop(columns=feature_NA,axis=1)
    return df
    

In [4]:
def data_preprocess_spliting(dataset):
    #Converting dataset
    dataset = dataset.replace("L", 1, regex=True)
    dataset = dataset.replace("C", 0, regex=True)
    dataset = dataset.replace("na", np.nan, regex=True)
    dataset = dataset.replace("missing", np.nan, regex=True)
    dataset=dataset.astype('float64')
    
    #Spliting C and L from dataset
    dataC,dataL = [x for _, x in dataset.groupby(dataset['mvar47'] == 1.0)]
    return dataC,dataL

In [5]:
def data_preprocess_NA(dataC,dataL):
    
    #Removing Columns
    
    dataC= drop_column(dataC,featureC_NA_global)
    dataL= drop_column(dataL,featureL_NA_global)
    
    #Removing rows containing high percentage of NaN values
    
    dataC = dataC.dropna(axis=0,thresh=0.75*dataC.shape[1])
    dataL = dataL.dropna(axis=0,thresh=0.75*dataL.shape[1])
    
    #Normalizing data
    dataC= normalize(dataC)
    dataL= normalize(dataL)
    
    #imputing data
    dataC = dataC.fillna(dataC.median())
    dataL = dataL.fillna(dataL.median())
    
    return dataC,dataL

In [6]:
dataset = pd.read_csv("Training_dataset_Original.csv", low_memory=False)
dataset_lead = pd.read_csv("Leaderboard_dataset.csv", low_memory=False)

target = 'default_ind'
IDcol = 'application_key'


In [7]:
dataC, dataL = data_preprocess_spliting(dataset)
dataC_lead, dataL_lead = data_preprocess_spliting(dataset_lead)

featureC_NA_global = get_column(dataC)
featureL_NA_global = get_column(dataL)

dataC, dataL = data_preprocess_NA(dataC,dataL)
dataC_lead, dataL_lead = data_preprocess_NA(dataC_lead,dataL_lead)

In [8]:
print dataL.shape, dataC.shape, dataL_lead.shape, dataC_lead.shape

(24662, 42) (40546, 43) (7723, 41) (12752, 42)


In [9]:
X_dataL = dataL.values[:,1:dataL.shape[1]-2]
Y_dataL = dataL.values[:,dataL.shape[1]-1]
X_dataC = dataC.values[:,1:dataC.shape[1]-2]
Y_dataC = dataC.values[:,dataC.shape[1]-1]


X_keyL_lead = dataL_lead.values[:,0]
X_keyC_lead = dataC_lead.values[:,0]
X_dataL_lead = dataL_lead.values[:,1:dataL_lead.shape[1]-1]
X_dataC_lead = dataC_lead.values[:,1:dataC_lead.shape[1]-1]


In [10]:
print X_dataL.shape, X_dataC.shape, X_dataL_lead.shape, X_dataC_lead.shape

(24662, 39) (40546, 40) (7723, 39) (12752, 40)


In [11]:
#from sklearn.cross_validation import train_test_split
#X_dataL_train, X_dataL_test, Y_dataL_key1, Y_dataL_key2 = train_test_split( X_dataL, Y_dataL, test_size = 0.3, random_state = 18)
#X_dataC_train, X_dataC_test, Y_dataC_key1, Y_dataC_key2 = train_test_split( X_dataC, Y_dataC, test_size = 0.3, random_state = 18)

In [12]:
#Y_dataL_test = Y_dataL_key2[:,1]
#Y_dataC_test = Y_dataC_key2[:,1]
#Y_dataL_train = Y_dataL_key1[:,1]
#Y_dataC_train= Y_dataC_key1[:,1]

def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=2, early_stopping_rounds=5):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=2,
            metrics='auc', early_stopping_rounds=5)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['application_key'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob)
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

#Choose all predictors except target & IDcols
predictors = [x for x in dataL.columns if x not in [target, IDcol]]

xgb1 = XGBClassifier(
 learning_rate =0.2,
 n_estimators=100,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, dataL, predictors)

#Choose all predictors except target & IDcols
predictors = [x for x in dataC.columns if x not in [target, IDcol]]

xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, dataC, predictors)

In [18]:
from xgboost import XGBClassifier
params={
    'booster':'gbtree',
       'objective': 'binary:logistic',
    'n_estimators':1000
       'eval_metric': 'auc',
      'max_depth':4,
      'subsample':0.75,
      'colsample_bytree':0.75,
    'min_child_weight':2,
        'eta': 0.025,
        'seed':0,
        'nthread':8,
        'silent':1
       }
# fit model no training data
model_L = XGBClassifier(**params)
model_L.fit(X_dataL, Y_dataL)
# make predictions for test data
y_prob_L = model_L.predict_proba(X_dataL_lead)
y_pred_L = model_L.predict(X_dataL_lead)

predictions = [round(value) for value in y_pred_L]
# evaluate predictions
#accuracy = accuracy_score(Y_dataL_test, y_pred_L)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Event_Probability_L and DataFrame  creation
event_prob_L = np.amax(y_prob_L, axis=1)
d_L= {"application_key":X_keyL_lead, "y_pred" : y_pred_L,"event_prob" : event_prob_L}
df_L = pd.DataFrame(data=d_L)


SyntaxError: invalid syntax (<ipython-input-18-a6eb9153e730>, line 6)

In [19]:
#XBoost on C
params={
    'booster':'gbtree',
       'objective': 'binary:logistic',
       'eval_metric': 'auc',
        'n_estimators':1000
      'max_depth':4,
      'subsample':0.75,
      'colsample_bytree':0.75,
    'min_child_weight':2,
        'eta': 0.025,
        'seed':0,
        'nthread':8,
        'silent':1
       }
# fit model no training data
model_C = XGBClassifier(**params)
model_C.fit(X_dataC, Y_dataC)
# make predictions for test data
y_prob_C = model_C.predict_proba(X_dataC_lead)
y_pred_C = model_C.predict(X_dataC_lead)

predictions = [round(value) for value in y_pred_C]
# evaluate predictions
#accuracy = accuracy_score(Y_dataC_test, y_pred_C)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Event_Probability_C and DataFrame  creation
event_prob_C = np.amax(y_prob_C, axis=1)
d_C= {"application_key":X_keyC_lead, "y_pred" : y_pred_C,"event_prob" : event_prob_C}
#print X_keyC_lead, y_pred_C, event_prob_C
df_C = pd.DataFrame(data=d_C)

SyntaxError: invalid syntax (<ipython-input-19-f27cef57f5b8>, line 7)

In [20]:
#Concatenating C and L
frames = [df_C, df_L]
Final_result = pd.concat(frames)
Final_result

Unnamed: 0,application_key,event_prob,y_pred
0,350054.0,0.952037,0.0
1,350055.0,0.867250,0.0
2,350062.0,0.679058,0.0
3,350063.0,0.933493,0.0
4,350064.0,0.769867,0.0
5,350065.0,0.704683,1.0
6,350066.0,0.562293,1.0
7,350067.0,0.698252,0.0
8,350070.0,0.897706,0.0
9,350071.0,0.869619,0.0


In [21]:
Final_result.to_csv("dfghj.csv")