In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
np.random.seed(203)
from tqdm import tqdm
import datetime
from collections import Counter
import re

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

import lightgbm as lgb
from sklearn.metrics import precision_score , recall_score , f1_score
from scipy.spatial.distance import mahalanobis

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
data_train = pd.read_csv('/Users/s0c02nj/Desktop/Mortgage Modelling/CAX_MortgageModeling_Train.csv')
data_test =  pd.read_csv('/Users/s0c02nj/Desktop/Mortgage Modelling/CAX_MortgageModeling_Test.csv')
data_sub =   pd.read_csv('/Users/s0c02nj/Desktop/Mortgage Modelling/CAX_MortgageModeling_SubmissionFormat.csv')

In [4]:
data_train = data_train.drop(['Unique_ID','MORTGAGE NUMBER'],axis=1)
data_test =  data_test.drop(['Unique_ID','MORTGAGE NUMBER','RESULT'], axis=1)

In [5]:
#Train data separating the response
y       = data_train['RESULT']
x_train = data_train.drop(['RESULT'],axis=1)


In [6]:
#Test data and train data merged for pre-processing
x_comb = pd.concat([x_train,data_test],sort=False)

In [7]:
cont_cols = ['PROPERTY VALUE','MORTGAGE PAYMENT','GDS','LTV','TDS','AMORTIZATION','MORTGAGE AMOUNT',
             'INCOME','CREDIT SCORE','RATE']

In [8]:
cat_cols = ['PAYMENT FREQUENCY','PROPERTY TYPE','FSA','TERM','AGE RANGE','GENDER','INCOME TYPE',
           'NAICS CODE','MORTGAGE PURPOSE']

In [9]:
log_transformed = ['PROPERTY VALUE','MORTGAGE PAYMENT','LTV','INCOME','CREDIT SCORE','MORTGAGE AMOUNT']

In [10]:
non_log_cols = ['GDS','LTV','TDS','AMORTIZATION','RATE']

In [11]:
for col in tqdm(cat_cols):
    le = LabelEncoder()
    x_comb[col] = le.fit_transform(x_comb[col])

100%|██████████| 9/9 [00:00<00:00, 99.81it/s]


In [12]:
#Label Encoding the y-var
le_y = LabelEncoder()

y_cat = le_y.fit_transform(y)

In [None]:
for col in log_transformed:
    x_comb[col] = np.log1p(x_comb[col])


In [None]:
col_count = []

for i,col in tqdm(enumerate(cont_cols)):
    counter = Counter(x_comb[col])
    col_count.append(str(col)+'count')
    x_comb[str(col)+'count'] = x_comb[col].apply(lambda x:counter[x])

In [None]:
train_x = x_comb[0:45642]
test_x =  x_comb[45642:]

In [None]:
train_x[y_cat==0].shape

In [None]:
train_x[y_cat==1].shape

In [None]:
x1_train, x1_val, y1_train, y1_val = train_test_split(x_smote, y_train_smote,test_size=0.2,random_state=0,
                                                      stratify=y)

In [None]:
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

In [None]:
random_state = 42
np.random.seed(random_state)

lgb_params = {
    "objective" : "binary",
    'metric':{lgb_f1_score},
    "boosting": 'gbdt',
    "max_depth" : 10,
    "num_leaves" : 20,
    "learning_rate" : 0.02,
    "bagging_freq": 5,
    "bagging_fraction" : 0.60,
    "feature_fraction" : 0.1,
    "min_data_in_leaf":  40,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    'n_esimators' : 1000,
    'min_split_gain': .001,
    'reg_alpha': .001,
    'reg_lambda': .001,
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "is_unbalance":'TRUE',
    "seed": random_state
}

In [None]:
trn_data = lgb.Dataset(x1_train, label= y1_train, categorical_feature= cat_cols)
val_data = lgb.Dataset(x1_val,  label=  y1_val,   categorical_feature= cat_cols)

In [None]:
lgb_clf = lgb.train(lgb_params,
                    trn_data,
                    13000,
                    valid_sets = [trn_data, val_data],
                    early_stopping_rounds=3000,
                    feval=lgb_f1_score
                    )

In [None]:
#training's f1: 0.583633	valid_1's f1: 0.478433

In [None]:
### funded : 0,  not funded 1
best_thresh = None
val_score = 0

for thresh in np.linspace(start = 0.4, stop = 0.8, num = 40):
    
    val_preds = lgb_clf.predict(x1_val, num_iteration = lgb_clf.best_iteration)
    ans = [1 if (val > thresh) else 0 for val in val_preds]
    
    score = f1_score(y1_val, ans, average='macro')
    print("Thresh", thresh, "F1 score:", score)
    
    if score > val_score:
        val_score = score
        best_thresh = thresh

        
print("\nThresh", best_thresh, "F1 score:", val_score)

In [None]:
pred = lgb_clf.predict(test_x)

In [None]:
y_pred = ['NOT FUNDED' if x >= 0.59 else 'FUNDED' for x in pred]

In [None]:
data_sub['Result_Predicted'] = y_pred

In [None]:
data_sub.to_csv('/Users/s0c02nj/Desktop/Mortgage Modelling/Submission4_smote.csv',index=False)