In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Importing Data Set
app = pd.read_csv("application_train.csv")

In [4]:
def missing_values(df):
    n_miss_val = df.isnull().sum()
    n_miss_per = 100 * df.isnull().sum() / len(df)
    miss_tbl = pd.concat([n_miss_val,n_miss_per],axis=1).sort_values(1,ascending=False).round(1)
    miss_tbl = miss_tbl[miss_tbl[1] !=0]
    
    miss_tbl = miss_tbl.rename(columns ={0: 'Missing Values',1:'%(Percentage) Missing Values'})
    print("{} columns that have missing values.".format(miss_tbl.shape[0]))
    
    return miss_tbl

In [5]:
#Missing value table using above defined function
missing_values_table = missing_values(app)
missing_values_table.head()

64 columns that have missing values.


Unnamed: 0,Missing Values,%(Percentage) Missing Values
COMMONAREA_MEDI,214865,69.9
COMMONAREA_AVG,214865,69.9
COMMONAREA_MODE,214865,69.9
NONLIVINGAPARTMENTS_MODE,213514,69.4
NONLIVINGAPARTMENTS_AVG,213514,69.4


In [6]:
#Drop features which contain more than 65% missing values.
app.dropna(thresh=app.shape[0]*0.65,how='all',axis=1,inplace=True)
app.shape

(307511, 73)

In [7]:
#MIssing Value Imputation
app['AMT_ANNUITY'].fillna(app['AMT_ANNUITY'].median(),inplace=True)
app['AMT_GOODS_PRICE'].fillna(app['AMT_CREDIT'].median(),inplace=True) 
app['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(0,inplace=True)
app['AMT_REQ_CREDIT_BUREAU_DAY'].fillna(0,inplace=True)
app['AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(0,inplace=True)
app['AMT_REQ_CREDIT_BUREAU_MON'].fillna(0,inplace=True)
app['AMT_REQ_CREDIT_BUREAU_MON'].fillna(0,inplace=True)
app['AMT_REQ_CREDIT_BUREAU_QRT'].fillna(0,inplace=True)
app['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(0,inplace=True)
app['DEF_30_CNT_SOCIAL_CIRCLE'].fillna(app['DEF_30_CNT_SOCIAL_CIRCLE'].median(),inplace=True)
app['DEF_60_CNT_SOCIAL_CIRCLE'].fillna(app['DEF_30_CNT_SOCIAL_CIRCLE'].median(),inplace=True)
app['OBS_30_CNT_SOCIAL_CIRCLE'].fillna(app['OBS_30_CNT_SOCIAL_CIRCLE'].median(),inplace=True)
app['OBS_60_CNT_SOCIAL_CIRCLE'].fillna(app['OBS_30_CNT_SOCIAL_CIRCLE'].median(),inplace=True)
app['EXT_SOURCE_2'].fillna(app['EXT_SOURCE_2'].mean(),inplace=True)
app['EXT_SOURCE_3'].fillna(app['EXT_SOURCE_3'].mean(),inplace=True)
app['NAME_TYPE_SUITE'].fillna("Unaccompanied",inplace=True)
app['OCCUPATION_TYPE'].fillna("Unknown",inplace=True)


In [8]:
app.isna().sum().head(30)

SK_ID_CURR                    0
TARGET                        0
NAME_CONTRACT_TYPE            0
CODE_GENDER                   0
FLAG_OWN_CAR                  0
FLAG_OWN_REALTY               0
CNT_CHILDREN                  0
AMT_INCOME_TOTAL              0
AMT_CREDIT                    0
AMT_ANNUITY                   0
AMT_GOODS_PRICE               0
NAME_TYPE_SUITE               0
NAME_INCOME_TYPE              0
NAME_EDUCATION_TYPE           0
NAME_FAMILY_STATUS            0
NAME_HOUSING_TYPE             0
REGION_POPULATION_RELATIVE    0
DAYS_BIRTH                    0
DAYS_EMPLOYED                 0
DAYS_REGISTRATION             0
DAYS_ID_PUBLISH               0
FLAG_MOBIL                    0
FLAG_EMP_PHONE                0
FLAG_WORK_PHONE               0
FLAG_CONT_MOBILE              0
FLAG_PHONE                    0
FLAG_EMAIL                    0
OCCUPATION_TYPE               0
CNT_FAM_MEMBERS               2
REGION_RATING_CLIENT          0
dtype: int64

In [9]:
app.isna().sum().tail(45)

CNT_FAM_MEMBERS                2
REGION_RATING_CLIENT           0
REGION_RATING_CLIENT_W_CITY    0
WEEKDAY_APPR_PROCESS_START     0
HOUR_APPR_PROCESS_START        0
REG_REGION_NOT_LIVE_REGION     0
REG_REGION_NOT_WORK_REGION     0
LIVE_REGION_NOT_WORK_REGION    0
REG_CITY_NOT_LIVE_CITY         0
REG_CITY_NOT_WORK_CITY         0
LIVE_CITY_NOT_WORK_CITY        0
ORGANIZATION_TYPE              0
EXT_SOURCE_2                   0
EXT_SOURCE_3                   0
OBS_30_CNT_SOCIAL_CIRCLE       0
DEF_30_CNT_SOCIAL_CIRCLE       0
OBS_60_CNT_SOCIAL_CIRCLE       0
DEF_60_CNT_SOCIAL_CIRCLE       0
DAYS_LAST_PHONE_CHANGE         1
FLAG_DOCUMENT_2                0
FLAG_DOCUMENT_3                0
FLAG_DOCUMENT_4                0
FLAG_DOCUMENT_5                0
FLAG_DOCUMENT_6                0
FLAG_DOCUMENT_7                0
FLAG_DOCUMENT_8                0
FLAG_DOCUMENT_9                0
FLAG_DOCUMENT_10               0
FLAG_DOCUMENT_11               0
FLAG_DOCUMENT_12               0
FLAG_DOCUM

In [10]:
app.dropna(inplace= True)

In [11]:
app.shape

(307508, 73)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

### Simple Oversampling

In [13]:
#Balancing Data Set by oversampling.
class_count_0,class_count_1 = app["TARGET"].value_counts()

class_0 = app[app['TARGET'] == 0]
class_1 = app[app['TARGET'] == 1]

class_1_over = class_1.sample(class_count_0, replace=True)
test_over = pd.concat([class_1_over,class_0],axis=0)

app = test_over
y = app.TARGET
y.value_counts()/len(y)*100

1    50.0
0    50.0
Name: TARGET, dtype: float64

In [14]:
class_count_0,class_count_1 = app["TARGET"].value_counts()

In [15]:
class_count_0

282683

In [16]:
class_count_1

282683

In [17]:
#Storing Target variable in y
y = app.TARGET
customerid= app.SK_ID_CURR

In [18]:
app.drop(['TARGET','SK_ID_CURR'],axis=1,inplace=True)

In [19]:
#To Train-Tests split
from sklearn.model_selection import train_test_split
#Feature Scaling for continuous random variable
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [20]:
#Extarct numeric features from data.
Num = app.select_dtypes(include=[np.number])
Num.shape

(565366, 59)

In [21]:
app[Num.columns]= sc.fit_transform(app[Num.columns])
app.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
103777,Cash loans,M,N,Y,0.758582,-0.061442,0.747078,0.726458,0.526288,Family,...,-0.084773,-0.02308,-0.022418,-0.021117,-0.071374,-0.063141,-0.155955,-0.26993,-0.327457,-0.879739
82331,Revolving loans,F,N,N,-0.593458,-0.104308,-0.820098,-0.980078,-0.713013,Unaccompanied,...,-0.084773,-0.02308,-0.022418,-0.021117,-0.071374,-0.063141,-0.155955,-0.26993,-0.327457,0.182965
132888,Cash loans,M,Y,N,-0.593458,-0.104308,-0.173448,-0.040093,-0.11293,Unaccompanied,...,-0.084773,-0.02308,-0.022418,-0.021117,-0.071374,-0.063141,-0.155955,-0.26993,-0.327457,-0.879739
61253,Cash loans,F,N,N,-0.593458,0.067155,0.674109,-0.166527,0.526288,Unaccompanied,...,-0.084773,-0.02308,-0.022418,-0.021117,-0.071374,-0.063141,-0.155955,-0.26993,-0.327457,0.182965
264674,Cash loans,F,N,Y,-0.593458,0.367215,1.78576,1.310639,1.113326,Unaccompanied,...,-0.084773,-0.02308,-0.022418,-0.021117,-0.071374,-0.063141,-0.155955,-0.26993,1.143244,-0.879739


In [22]:
Data=pd.get_dummies(app)
Data.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA
103777,0.758582,-0.061442,0.747078,0.726458,0.526288,-0.110395,0.432126,-0.436116,-0.274677,-1.059826,...,0,0,0,0,0,0,0,0,0,0
82331,-0.593458,-0.104308,-0.820098,-0.980078,-0.713013,-0.910231,-0.690068,-0.414704,0.291358,0.546995,...,0,0,0,0,0,0,0,0,0,0
132888,-0.593458,-0.104308,-0.173448,-0.040093,-0.11293,0.196424,-2.120835,2.351171,-0.172768,-0.795204,...,0,0,0,0,0,0,0,0,0,1
61253,-0.593458,0.067155,0.674109,-0.166527,0.526288,-0.771697,-0.652854,-0.492255,-1.50953,0.6589,...,0,0,0,0,0,0,0,0,0,0
264674,-0.593458,0.367215,1.78576,1.310639,1.113326,-1.063755,0.447843,-0.414992,0.60786,1.012387,...,0,0,0,0,0,0,0,0,0,0


In [23]:
X=Data
Y=y

In [24]:
#Spiltting the data set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((395756, 183), (169610, 183), (395756,), (169610,))

### Simple Logistic Regression

In [25]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression()

In [26]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.68


In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[58547 26555]
 [27131 57377]]


In [28]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.69      0.69     85102
           1       0.68      0.68      0.68     84508

    accuracy                           0.68    169610
   macro avg       0.68      0.68      0.68    169610
weighted avg       0.68      0.68      0.68    169610



### Stepwise Regression

In [None]:
f=['EXT_SOURCE_3','EXT_SOURCE_2','ORGANIZATION_TYPE_Advertising', 'ORGANIZATION_TYPE_Agriculture', 'ORGANIZATION_TYPE_Bank', 'ORGANIZATION_TYPE_Business Entity Type 1', 'ORGANIZATION_TYPE_Business Entity Type 2', 'ORGANIZATION_TYPE_Business Entity Type 3', 'ORGANIZATION_TYPE_Cleaning', 'ORGANIZATION_TYPE_Construction', 'ORGANIZATION_TYPE_Culture', 'ORGANIZATION_TYPE_Electricity', 'ORGANIZATION_TYPE_Emergency', 'ORGANIZATION_TYPE_Government', 'ORGANIZATION_TYPE_Hotel', 'ORGANIZATION_TYPE_Housing', 'ORGANIZATION_TYPE_Industry: type 1', 'ORGANIZATION_TYPE_Industry: type 10', 'ORGANIZATION_TYPE_Industry: type 11', 'ORGANIZATION_TYPE_Industry: type 12', 'ORGANIZATION_TYPE_Industry: type 13', 'ORGANIZATION_TYPE_Industry: type 2', 'ORGANIZATION_TYPE_Industry: type 3', 'ORGANIZATION_TYPE_Industry: type 4', 'ORGANIZATION_TYPE_Industry: type 5', 'ORGANIZATION_TYPE_Industry: type 6', 'ORGANIZATION_TYPE_Industry: type 7', 'ORGANIZATION_TYPE_Industry: type 8', 'ORGANIZATION_TYPE_Industry: type 9', 'ORGANIZATION_TYPE_Insurance', 'ORGANIZATION_TYPE_Kindergarten', 'ORGANIZATION_TYPE_Legal Services', 'ORGANIZATION_TYPE_Medicine', 'ORGANIZATION_TYPE_Military', 'ORGANIZATION_TYPE_Mobile', 'ORGANIZATION_TYPE_Other', 'ORGANIZATION_TYPE_Police', 'ORGANIZATION_TYPE_Postal', 'ORGANIZATION_TYPE_Realtor', 'ORGANIZATION_TYPE_Religion', 'ORGANIZATION_TYPE_Restaurant', 'ORGANIZATION_TYPE_School', 'ORGANIZATION_TYPE_Security', 'ORGANIZATION_TYPE_Security Ministries', 'ORGANIZATION_TYPE_Self-employed', 'ORGANIZATION_TYPE_Services', 'ORGANIZATION_TYPE_Telecom', 'ORGANIZATION_TYPE_Trade: type 1', 'ORGANIZATION_TYPE_Trade: type 2', 'ORGANIZATION_TYPE_Trade: type 3', 'ORGANIZATION_TYPE_Trade: type 4', 'ORGANIZATION_TYPE_Trade: type 5', 'ORGANIZATION_TYPE_Trade: type 6', 'ORGANIZATION_TYPE_Trade: type 7', 'ORGANIZATION_TYPE_Transport: type 1', 'ORGANIZATION_TYPE_Transport: type 2', 'ORGANIZATION_TYPE_Transport: type 3', 'ORGANIZATION_TYPE_Transport: type 4', 'ORGANIZATION_TYPE_University', 'ORGANIZATION_TYPE_XNA','OCCUPATION_TYPE_Accountants', 'OCCUPATION_TYPE_Cleaning staff', 'OCCUPATION_TYPE_Cooking staff', 'OCCUPATION_TYPE_Core staff', 'OCCUPATION_TYPE_Drivers', 'OCCUPATION_TYPE_HR staff', 'OCCUPATION_TYPE_High skill tech staff', 'OCCUPATION_TYPE_IT staff', 'OCCUPATION_TYPE_Laborers', 'OCCUPATION_TYPE_Low-skill Laborers', 'OCCUPATION_TYPE_Managers', 'OCCUPATION_TYPE_Medicine staff', 'OCCUPATION_TYPE_Private service staff', 'OCCUPATION_TYPE_Realty agents', 'OCCUPATION_TYPE_Sales staff', 'OCCUPATION_TYPE_Secretaries', 'OCCUPATION_TYPE_Security staff', 'OCCUPATION_TYPE_Unknown', 'OCCUPATION_TYPE_Waiters/barmen staff','DAYS_EMPLOYED','NAME_EDUCATION_TYPE_Academic degree', 'NAME_EDUCATION_TYPE_Higher education', 'NAME_EDUCATION_TYPE_Incomplete higher', 'NAME_EDUCATION_TYPE_Lower secondary', 'NAME_EDUCATION_TYPE_Secondary / secondary special','NAME_CONTRACT_TYPE_Cash loans', 'NAME_CONTRACT_TYPE_Revolving loans','CODE_GENDER_F', 'CODE_GENDER_M', 'CODE_GENDER_XNA']

In [None]:
X1 = X[f]

In [None]:
X.head()

### Using the features from stepwise regression

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

In [None]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

### Lasso Regularization using Grid Search

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
grid={"C":np.logspace(-3,3,7), "penalty":["l1"]}# l1 lasso l2 ridge
logreg=LogisticRegression(solver = 'liblinear')

In [None]:
logreg_cv=GridSearchCV(logreg,grid,cv=3,scoring='f1')
logreg_cv.fit(X_train, y_train)

In [None]:
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("best score :",logreg_cv.best_score_)

In [None]:
sel= SelectFromModel(LogisticRegression(penalty='l1',C=0.1,solver='liblinear'))
sel.fit(X_train,y_train)
sel.get_support()

In [None]:
y_pred=sel.predict(X_test)

In [None]:
a=sel.estimator_.coef_
a.shape
a

In [None]:
X_train.columns[sel.get_support()]

In [None]:
features=list(X_train.columns[sel.get_support()])
print(features)

In [None]:
X1 = X[features]

In [None]:
X1.head()

### Using the features from lasso regularization

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

In [None]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

### Random Forest using Grid search

In [22]:
X=Data
Y=y

In [41]:
f1 = list(Data.columns)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
skfold=StratifiedKFold(n_splits=15)
model=RandomForestClassifier()
scores=cross_val_score(model,X,y,cv=skfold)
print(np.mean(scores))

0.9193354322989598


In [24]:
from sklearn.feature_selection import SelectFromModel

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [27]:
sel = SelectFromModel(RandomForestClassifier())
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier())

In [63]:
sel.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False,  True, False,  True,  True,
        True,  True, False, False, False, False, False, False,  True,
        True,  True,  True,  True, False,  True, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True, False, False,  True, False,
       False,  True,  True,  True,  True, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True, False,  True, False, False,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,  True, False, False,  True,  True,  True, False,
       False, False,

In [64]:
features=list(X_train.columns[sel.get_support()])
print(features)

IndexError: boolean index did not match indexed array along dimension 0; dimension is 122 but corresponding boolean dimension is 183

In [44]:
X1 = X[features]

In [45]:
X1.shape

(565366, 122)

In [46]:
X1.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA
275888,-0.592525,-0.079785,-0.687883,-0.540891,-0.791004,2.007537,-1.321774,-0.503421,-0.69629,-1.009216,...,0,0,0,0,0,0,0,1,0,0
84061,-0.592525,-0.10265,-0.01809,0.066599,-0.152237,2.007537,0.89329,-0.41748,-0.101302,0.255698,...,0,0,0,0,0,0,0,0,0,0
293094,-0.592525,0.03454,0.249597,-0.579624,0.460458,0.863549,-0.520458,-0.42324,-0.075977,1.104462,...,0,0,0,0,0,0,0,0,0,0
154329,-0.592525,0.377516,-1.295184,-1.250346,-1.23423,-0.098904,0.558522,-0.418727,1.182697,-0.237493,...,0,0,0,0,0,0,0,0,0,0
89966,-0.592525,-0.136947,-0.701037,-0.814344,-0.582427,-0.158212,-0.347294,-0.431267,1.306992,1.519955,...,0,0,0,0,0,0,0,0,0,0


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X1, Y, test_size=0.25, random_state=0)

### Using the features from Random Forest

In [54]:
from sklearn.linear_model import LogisticRegression

In [55]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression()

In [56]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.68


In [57]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[48674 22282]
 [22981 47405]]


In [58]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.69      0.68     70956
           1       0.68      0.67      0.68     70386

    accuracy                           0.68    141342
   macro avg       0.68      0.68      0.68    141342
weighted avg       0.68      0.68      0.68    141342

