In [None]:
import pandas as pd
import numpy as np
import feather
from sklearn.metrics import (auc,accuracy_score, average_precision_score,
                             balanced_accuracy_score,roc_auc_score,
                            roc_curve)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split,RandomizedSearchCV,KFold
import matplotlib.pyplot as plt
import re
import pickle
import lightgbm as lgb
from hyperopt import STATUS_OK
from hyperopt import hp,tpe,Trials,fmin
import csv
from timeit import default_timer as timer
import random
import patsy



In [2]:
df_one_hot = feather.read_dataframe("df_train_one_hot.feather")

In [None]:
df_one_hot.isnull().sum()

## Need to replace all nulls, going to replace with the rest of distirbution of null values in their given column

In [5]:
for col in df_one_hot.columns:
    dist = df_one_hot[col].value_counts(normalize=True,dropna=True)
    missing = df_one_hot[col].isnull()
    df_one_hot.loc[missing,col] = np.random.choice(dist.index,
                                               size=len(df_one_hot[missing]),
                                              p=dist.values)
    

In [6]:
df_one_hot.isna().sum()

SK_ID_CURR                                                 0
TARGET                                                     0
CNT_CHILDREN                                               0
AMT_INCOME_TOTAL                                           0
AMT_CREDIT                                                 0
AMT_ANNUITY                                                0
AMT_GOODS_PRICE                                            0
REGION_POPULATION_RELATIVE                                 0
DAYS_BIRTH                                                 0
DAYS_EMPLOYED                                              0
DAYS_REGISTRATION                                          0
DAYS_ID_PUBLISH                                            0
OWN_CAR_AGE                                                0
FLAG_MOBIL                                                 0
FLAG_EMP_PHONE                                             0
FLAG_WORK_PHONE                                            0
FLAG_CONT_MOBILE        

In [7]:
df_one_hot =df_one_hot[~df_one_hot.isin([np.nan, np.inf, -np.inf]).any(1)]

In [8]:
df_one_hot.shape

(307489, 594)

## Features list generated from feature importances of tree based models, going to use the set of top 20 for RF, XGBoost, and LGBM Boost (42 features) as starting point

In [16]:
with open ("top_feature_importances.pkl", 'rb') as f:
    top = pickle.load(f)

In [17]:
top

['DAYS_EMPLOYED',
 'CODE_GENDER_M',
 'AMT_GOODS_PRICE',
 'DAYS_CREDIT_min',
 'LIVINGAREA_AVG',
 'Income_Credit_Ratio',
 'DAYS_ENTRY_PAYMENT_max',
 'NAME_EDUCATION_TYPE_Higher education',
 'DAYS_CREDIT_UPDATE_max',
 'DAYS_ID_PUBLISH',
 'CREDIT_ACTIVE_Closed_mean',
 'DAYS_CREDIT_ENDDATE_min',
 'NAME_INCOME_TYPE_Working',
 'SK_DPD_DEF_mean_y',
 'DAYS_CREDIT_ENDDATE_mean',
 'AMT_INSTALMENT_max',
 'DAYS_BIRTH',
 'APP_CREDIT_PERC_max',
 'AMT_PAYMENT_min',
 'PAYMENT_PERC_mean',
 'MONTHS_BALANCE_max_y',
 'NAME_CONTRACT_STATUS_Refused_mean_y',
 'DAYS_CREDIT_UPDATE_mean',
 'DAYS_CREDIT_max',
 'DAYS_ENTRY_PAYMENT_sum',
 'NAME_CONTRACT_TYPE_Revolving loans_mean',
 'EXT_SOURCE_2',
 'Annuity_Income_Ratio',
 'CNT_PAYMENT_mean',
 'APP_CREDIT_PERC_mean',
 'DAYS_ENTRY_PAYMENT_mean',
 'AMT_PAYMENT_sum',
 'DAYS_REGISTRATION',
 'AMT_DOWN_PAYMENT_max',
 'EXT_SOURCE_1',
 'EXT_SOURCE_3',
 'AMT_CREDIT',
 'DAYS_DECISION_mean',
 'DAYS_CREDIT_mean',
 'DAYS_CREDIT_ENDDATE_max',
 'FLAG_DOCUMENT_3',
 'AMT_ANNUITY']

In [20]:
df_model_logistic = df_one_hot[['TARGET']+top]

In [21]:
df_model_logistic.shape

(307489, 43)

In [22]:
X = df_model_logistic.drop(['TARGET'],axis=1)
y = df_one_hot['TARGET'].values



In [23]:
X_train, X_test,y_train,y_test = train_test_split(X,y)

In [24]:
del X,y

In [25]:
penalties=['l1','l2','elasticnet']

In [36]:
model = LogisticRegression(penalty='l1',solver='saga',
                               n_jobs=-1,max_iter=200)
model.fit(X_train,y_train)
preds = model.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test,preds)
print(f"Logistic Regression with penalty: {penalty}\nScore:{auc}\n")
    

Logistic Regression with penalty: l1
Score:0.6134042481373325





In [37]:
df_coefs = pd.concat([pd.DataFrame(X_train.columns),pd.DataFrame(np.transpose(model.coef_))], axis = 1)



In [38]:
df_coefs.head()

Unnamed: 0,0,0.1
0,DAYS_EMPLOYED,-9.570122e-07
1,CODE_GENDER_M,-4.912026e-10
2,AMT_GOODS_PRICE,-3.561878e-06
3,DAYS_CREDIT_min,-2.449316e-06
4,LIVINGAREA_AVG,-4.36284e-10


In [39]:
df_coefs.columns = ['Feature', "Coeff."]

In [41]:
df_coefs.sort_values(by="Coeff.", ascending=False)

Unnamed: 0,Feature,Coeff.
11,DAYS_CREDIT_ENDDATE_min,4.349471e-06
36,AMT_CREDIT,2.680843e-06
14,DAYS_CREDIT_ENDDATE_mean,7.17322e-07
15,AMT_INSTALMENT_max,4.423382e-07
13,SK_DPD_DEF_mean_y,2.013062e-09
21,NAME_CONTRACT_STATUS_Refused_mean_y,8.360818e-11
25,NAME_CONTRACT_TYPE_Revolving loans_mean,-2.18676e-10
27,Annuity_Income_Ratio,-2.991159e-10
4,LIVINGAREA_AVG,-4.36284e-10
1,CODE_GENDER_M,-4.912026e-10


## Want to take a look at absolute magnitude of the coefficients to get most important ones for this model/eliminate some of the features that have small absolute coefficients

In [43]:
df_coefs['abs Coeff.'] = np.abs(df_coefs['Coeff.'])

In [50]:
df_order_mag = df_coefs.sort_values(by="abs Coeff.",ascending=False).reset_index(drop=True)

In [51]:
df_order_mag

Unnamed: 0,Feature,Coeff.,abs Coeff.
0,DAYS_BIRTH,-5.294414e-05,5.294414e-05
1,AMT_PAYMENT_min,-2.507343e-05,2.507343e-05
2,AMT_ANNUITY,-1.961057e-05,1.961057e-05
3,DAYS_REGISTRATION,-1.824041e-05,1.824041e-05
4,AMT_DOWN_PAYMENT_max,-1.444086e-05,1.444086e-05
5,DAYS_ID_PUBLISH,-1.131022e-05,1.131022e-05
6,DAYS_CREDIT_max,-7.05853e-06,7.05853e-06
7,DAYS_ENTRY_PAYMENT_sum,-5.051369e-06,5.051369e-06
8,DAYS_ENTRY_PAYMENT_max,-4.961268e-06,4.961268e-06
9,DAYS_CREDIT_mean,-4.7754e-06,4.7754e-06


In [57]:
more_important_columns = list(df_order_mag.iloc[:24,0].values)

In [81]:
with open("most_important_columns.pkl",'wb') as f:
    pickle.dump(more_important_columns,f)

In [60]:
df_less_features = df_one_hot[['TARGET']+more_important_columns]

In [61]:
X = df_less_features.drop(['TARGET'],axis=1)
y = df_less_features['TARGET'].values

In [62]:
X_train, X_test,y_train,y_test = train_test_split(X,y)

In [63]:
del X,y

In [72]:
2*np.bincount(y_train.astype(int))

array([424086,  37146])

In [73]:
penalties=['l1','l2','elasticnet']
class_weight = ['balanced',None]

In [75]:
for pen in penalties:
    for weight in class_weight:
        
        model = LogisticRegression(penalty=pen,solver='saga',
                               n_jobs=-1,max_iter=400,
                              l1_ratio=0.5,class_weight=weight)
        model.fit(X_train,y_train)
        preds = model.predict_proba(X_test)[:,1]
        auc = roc_auc_score(y_test,preds)
        print(f"Logistic Regression with penalty: {pen}\nClass_Weight = {weight}\nScore = {round(auc,4)}\n")
        

  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))


Logistic Regression with penalty: l1
Class_Weight = balanced
Score = 0.62



  "(penalty={})".format(self.penalty))


Logistic Regression with penalty: l1
Class_Weight = None
Score = 0.6243



  "(penalty={})".format(self.penalty))


Logistic Regression with penalty: l2
Class_Weight = balanced
Score = 0.62





Logistic Regression with penalty: l2
Class_Weight = None
Score = 0.6243





Logistic Regression with penalty: elasticnet
Class_Weight = balanced
Score = 0.62

Logistic Regression with penalty: elasticnet
Class_Weight = None
Score = 0.6243





##  I am going to use best Logistic Model, penalty="elasticnet" and no special class weight parameter

In [76]:
model = LogisticRegression(penalty='elasticnet',solver='saga',
                               n_jobs=-1,max_iter=400,
                              l1_ratio=0.5)
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.5, max_iter=400,
                   multi_class='warn', n_jobs=-1, penalty='elasticnet',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [77]:
preds = model.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test,preds)
auc

0.6243445313849596

In [78]:
#with open("Logistic_Reg_Model.pkl", "wb") as f:
 #   pickle.dump(model,f)

In [79]:
!ls *.pkl

Logistic_Reg_Model.pkl	     xgboost_w_out_clean_null.pkl
top_feature_importances.pkl
