In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
import sklearn.ensemble as ske
import seaborn as sns
%matplotlib inline

In [19]:
# Removes a warning in sklearn that will be fixed during an update mid 2018
import warnings

if __name__ == '__main__':
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    le = preprocessing.LabelEncoder()
    le.fit([1, 2, 2, 6])
    le.transform([1, 1, 2, 6])
    le.inverse_transform([0, 0, 1, 2])

In [20]:
df = pd.read_csv('./data/application_train.csv', sep=',', header=0)
print(df.info())
df.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


In [279]:
print("Number of columns with missing data: " + str(len(df.columns[df.isnull().any()].tolist())))

Number of columns with missing data: 67


In [6]:
def replace_nulls(df):
    
    columns = list(df.columns.values)
    
    for col in columns:
        # Replace numeric values with mean
        if (np.issubdtype(df[col], np.number)):
            df[col].fillna(df[col].mean(), inplace = True)
        
        # Replace categorical and boolean values with highest occuring value
        else:
            df[col].fillna(df[col].value_counts().index[0], inplace = True)
    return df

In [281]:
df = replace_nulls(df)
print("Number of columns with missing data: " + str(len(df.columns[df.isnull().any()].tolist())))

Number of columns with missing data: 0


In [7]:
def process_categorical(df):
    
    categorical_columns = ['NAME_CONTRACT_TYPE',
                           'CODE_GENDER',
                           'FLAG_OWN_CAR',
                           'FLAG_OWN_REALTY',
                           'NAME_TYPE_SUITE',
                           'NAME_INCOME_TYPE',
                           'NAME_EDUCATION_TYPE',
                           'NAME_FAMILY_STATUS',
                           'NAME_HOUSING_TYPE',
                           'OCCUPATION_TYPE',
                           'WEEKDAY_APPR_PROCESS_START',
                           'ORGANIZATION_TYPE',
                           'FONDKAPREMONT_MODE',
                           'HOUSETYPE_MODE',
                           'WALLSMATERIAL_MODE',
                           'EMERGENCYSTATE_MODE']
    
    # Significance of categories for target, will later try more with this
    #for col in categorical_columns:
    #   df.groupby(col).mean()['TARGET'].sort_values(ascending=False)


    df[categorical_columns] = df[categorical_columns].apply(preprocessing.LabelEncoder().fit_transform)
    
    return df

In [283]:
df = process_categorical(df)
df.head(1)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,1,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
def preprocess_data(df):
    df = replace_nulls(df)
    df = process_categorical(df)
    df = df.drop('SK_ID_CURR', axis=1)
    
    return df

In [22]:
df = pd.read_csv('./data/application_train.csv', sep=',', header=0)
df = preprocess_data(df)
print("Number of columns with missing data: " + str(len(df.columns[df.isnull().any()].tolist())))
df.head(1)

Number of columns with missing data: 0


Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


In [23]:
def split_data(df):
    df_train = df.sample(frac = 0.9, random_state = 42)
    df_val = df.drop(df_train.index)  
    
    X_train = df_train.drop(['TARGET'], axis=1).values
    y_train = df_train['TARGET'].values
    
    X_val = df_val.drop(['TARGET'], axis=1).values
    y_val = df_val['TARGET'].values
    
    return X_train, y_train, X_val, y_val

In [24]:
X_train, y_train, X_val, y_val = split_data(df)

In [8]:
def train_clf(clf, X_train, y_train, X_val, y_val):
    
    clf.fit (X_train, y_train)
    
    y_act, y_pred = y_val, clf.predict_proba(X_val)
    scores = cross_val_score(clf, X_val, y_val, cv=10, scoring='accuracy')
    print("Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    
    return clf

In [9]:
rf_clf = ske.RandomForestClassifier(n_estimators=50)
rf_clf = train_clf(rf_clf, X_train, y_train, X_val, y_val)

Validation Accuracy: 0.92 (+/- 0.00)


In [27]:
def train_lgbm_clf(clf, X_train, y_train, X_val, y_val):
    
    clf.fit(X_train, y_train, 
            eval_set= [(X_val, y_val)],
            eval_metric='auc', 
            verbose=250, 
            early_stopping_rounds=150
           )
    
    y_act, y_pred = y_val, clf.predict(X_val)
    scores = cross_val_score(clf, X_val, y_val, cv=10, scoring='accuracy')
    print("Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    print()
    print(classification_report(y_act, y_pred))
    
    return clf

In [28]:
lgbm_clf = LGBMClassifier(n_estimators=20000,
                          learning_rate=0.005,
                          num_leaves=70,
                          colsample_bytree=.8,
                          subsample=.9,
                          max_depth=7,
                          reg_alpha=.1,
                          reg_lambda=.1,
                          min_split_gain=.01,
                          min_child_weight=2)

lgbm_clf = train_lgbm_clf(lgbm_clf, X_train, y_train, X_val, y_val)

Training until validation scores don't improve for 150 rounds.
[250]	valid_0's auc: 0.736045
[500]	valid_0's auc: 0.741261
[750]	valid_0's auc: 0.746773
[1000]	valid_0's auc: 0.752141
[1250]	valid_0's auc: 0.755854
[1500]	valid_0's auc: 0.758004
[1750]	valid_0's auc: 0.759296
[2000]	valid_0's auc: 0.760196
[2250]	valid_0's auc: 0.760807
[2500]	valid_0's auc: 0.761488
[2750]	valid_0's auc: 0.761967
[3000]	valid_0's auc: 0.762406
[3250]	valid_0's auc: 0.762676
[3500]	valid_0's auc: 0.762877
[3750]	valid_0's auc: 0.763006
[4000]	valid_0's auc: 0.763226
[4250]	valid_0's auc: 0.763379
[4500]	valid_0's auc: 0.763471
[4750]	valid_0's auc: 0.763575
Early stopping, best iteration is:
[4761]	valid_0's auc: 0.763594
Validation Accuracy: 0.92 (+/- 0.00)

             precision    recall  f1-score   support

          0       0.92      1.00      0.96     28327
          1       0.53      0.02      0.04      2424

avg / total       0.89      0.92      0.89     30751



In [46]:
def predict(df, clf, export_path):
    
    # Extract Ids
    ids = df['SK_ID_CURR'].values
    
    # Make predictions
    df = preprocess_data(df)
    X = df.values
    y = clf.predict_proba(X, num_iteration=clf.best_iteration_)[:, 1]
    
    # Combine ids and predictions
    y = np.column_stack((ids, y))
    
    # Restore pandas df
    df_pred = pd.DataFrame(y)
    df_pred.columns = ["SK_ID_CURR", "TARGET"]
    
    df_pred["SK_ID_CURR"] = df_pred["SK_ID_CURR"].astype('int64')
    
    # Export
    df_pred.to_csv(export_path, sep=',', index=False)
    
    return df_pred

In [47]:
df_test = pd.read_csv('./data/application_test.csv', sep=',', header=0)
#df_pred = predict(df_test, rf_clf, './predictions/rf_pred.csv')
df_pred = predict(df_test, lgbm_clf, './predictions/lgbm_pred.csv')

In [48]:
print(df_pred.sum())

SK_ID_CURR    1.354092e+10
TARGET        3.516757e+03
dtype: float64


In [49]:
df_pred.head(3)

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.022392
1,100005,0.106636
2,100013,0.018612
