In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler,SMOTE
from collections import Counter
import warnings
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings("ignore")

In [None]:
data = pd.read_csv('/content/application_data.csv')
data1 = data.copy()
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# The code you provided aims to reduce the memory usage of a DataFrame 'data' by optimizing the data types of its columns.

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                for int_type in [np.int8, np.int16, np.int32, np.int64]:
                    if c_min > np.iinfo(int_type).min and c_max < np.iinfo(int_type).max:
                        df[col] = df[col].astype(int_type)
                        break
            else:
                for float_type in [np.float16, np.float32, np.float64]:
                    if c_min > np.finfo(float_type).min and c_max < np.finfo(float_type).max:
                        df[col] = df[col].astype(float_type)
                        break
        else:
            df[col] = df[col].astype('object')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")

    return df

data = reduce_mem_usage(data)

Memory usage of dataframe is 24.22 MB
Memory usage after optimization is: 8.69 MB
Decreased by 64.1%


In [None]:
#create instance of label encoder for all the categorical variables
lab = LabelEncoder()

"""
'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
       'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'
"""

#perform label encoding on 'team' column
data['NAME_CONTRACT_TYPE'] = lab.fit_transform(data['NAME_CONTRACT_TYPE'])
data['CODE_GENDER'] = lab.fit_transform(data['CODE_GENDER'])
data['FLAG_OWN_CAR'] = lab.fit_transform(data['FLAG_OWN_CAR'])
data['FLAG_OWN_REALTY'] = lab.fit_transform(data['FLAG_OWN_REALTY'])
data['NAME_TYPE_SUITE'] = lab.fit_transform(data['NAME_TYPE_SUITE'])
data['NAME_INCOME_TYPE'] = lab.fit_transform(data['NAME_INCOME_TYPE'])
data['NAME_EDUCATION_TYPE'] = lab.fit_transform(data['NAME_EDUCATION_TYPE'])
data['NAME_FAMILY_STATUS'] = lab.fit_transform(data['NAME_FAMILY_STATUS'])
data['NAME_HOUSING_TYPE'] = lab.fit_transform(data['NAME_HOUSING_TYPE'])
data['OCCUPATION_TYPE'] = lab.fit_transform(data['OCCUPATION_TYPE'])
data['WEEKDAY_APPR_PROCESS_START'] = lab.fit_transform(data['WEEKDAY_APPR_PROCESS_START'])
data['ORGANIZATION_TYPE'] = lab.fit_transform(data['ORGANIZATION_TYPE'])
data['FONDKAPREMONT_MODE'] = lab.fit_transform(data['FONDKAPREMONT_MODE'])
data['HOUSETYPE_MODE'] = lab.fit_transform(data['HOUSETYPE_MODE'])
data['WALLSMATERIAL_MODE'] = lab.fit_transform(data['WALLSMATERIAL_MODE'])
data['EMERGENCYSTATE_MODE'] = lab.fit_transform(data['EMERGENCYSTATE_MODE'])

In [None]:
def colors(value):
    if (50 < value < 100) or (154000 < value < 250000):
        color = 'red'
    elif value == 1:
        color = 'blue'
    else:
        color = 'green'
    return 'color: %s' % color

def missing(df):
    total = df.isnull().sum().sort_values(ascending=False)
    total = total[total > 0]
    percent = total / len(df) * 100
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percentage']).style.applymap(colors)

missing(data1.select_dtypes('object'))

Unnamed: 0,Total,Percentage
FONDKAPREMONT_MODE,17822,68.490834
WALLSMATERIAL_MODE,13222,50.812805
HOUSETYPE_MODE,13010,49.998078
EMERGENCYSTATE_MODE,12306,47.292571
OCCUPATION_TYPE,8136,31.267054
NAME_TYPE_SUITE,106,0.407363
NAME_FAMILY_STATUS,1,0.003843
NAME_HOUSING_TYPE,1,0.003843
WEEKDAY_APPR_PROCESS_START,1,0.003843
ORGANIZATION_TYPE,1,0.003843


In [None]:
def mode_impute(df,col):
    return df[col].fillna(df[col].mode()[0])
data1['FONDKAPREMONT_MODE'] = mode_impute(data1,'FONDKAPREMONT_MODE')
data1['WALLSMATERIAL_MODE'] = mode_impute(data1,'WALLSMATERIAL_MODE')
data1['HOUSETYPE_MODE'] = mode_impute(data1,'HOUSETYPE_MODE')
data1['EMERGENCYSTATE_MODE'] = mode_impute(data1,'EMERGENCYSTATE_MODE')
data1['OCCUPATION_TYPE'] = mode_impute(data1,'OCCUPATION_TYPE')
data1['NAME_TYPE_SUITE'] = mode_impute(data1,'NAME_TYPE_SUITE')
missing(data1.select_dtypes('object'))

Unnamed: 0,Total,Percentage
NAME_FAMILY_STATUS,1,0.003843
NAME_HOUSING_TYPE,1,0.003843
WEEKDAY_APPR_PROCESS_START,1,0.003843
ORGANIZATION_TYPE,1,0.003843


In [None]:
data1 = data1.select_dtypes('float').interpolate(method ='linear', limit_direction ='forward')
missing(data1.select_dtypes('float'))

Unnamed: 0,Total,Percentage
OWN_CAR_AGE,2,0.007686


In [None]:
data1 = data1.dropna(axis = 1)
missing(data1)

Unnamed: 0,Total,Percentage


In [None]:
data = data.interpolate(method ='linear', limit_direction ='forward')
data = data.dropna(axis = 1)
missing(data)

Unnamed: 0,Total,Percentage


In [None]:
X = data.drop(['TARGET'],axis = 1)
target = data['TARGET']
X_train, X_test, Y_train, Y_test = train_test_split(X, target, test_size= 0.3, random_state = 0)

In [None]:
def ml_model(X_train,X_test, Y_train, Y_test):


  MLA = [LogisticRegression(),KNeighborsClassifier(),DecisionTreeClassifier(),GaussianNB(),RandomForestClassifier(n_estimators=100, random_state=42)]
  MLA_columns = []
  MLA_compare = pd.DataFrame(columns = MLA_columns)
  row_index = 0
  for alg in MLA:
    predicted = alg.fit(X_train, Y_train).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train, Y_train), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, Y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(Y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(Y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(Y_test, predicted),2)
    row_index+=1
  MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)
  return MLA_compare
ml_model(X_train,X_test, Y_train, Y_test)

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
0,LogisticRegression,0.92,0.92,0.0,0.0,0.0
3,GaussianNB,0.92,0.92,0.0,0.0,0.0
4,RandomForestClassifier,1.0,0.92,0.39,0.02,0.04
1,KNeighborsClassifier,0.92,0.91,0.19,0.02,0.04
2,DecisionTreeClassifier,1.0,0.85,0.13,0.16,0.15


In [None]:
from sklearn.feature_selection import SelectKBest,mutual_info_classif
bestfeatures = SelectKBest(score_func=mutual_info_classif, k=10)
fit = bestfeatures.fit(X,target,)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']
print(featureScores.nlargest(10,'Score'))

               Feature     Score
40        EXT_SOURCE_2  0.014743
41        EXT_SOURCE_3  0.012370
8          AMT_ANNUITY  0.008008
23    FLAG_CONT_MOBILE  0.007725
7           AMT_CREDIT  0.007409
11    NAME_INCOME_TYPE  0.007404
9      AMT_GOODS_PRICE  0.005955
20          FLAG_MOBIL  0.005861
87  WALLSMATERIAL_MODE  0.005338
2          CODE_GENDER  0.005301


In [None]:
X = data[['EXT_SOURCE_2','EXT_SOURCE_3','AMT_ANNUITY','FLAG_CONT_MOBILE','AMT_CREDIT','NAME_INCOME_TYPE','AMT_GOODS_PRICE',
         'FLAG_MOBIL','WALLSMATERIAL_MODE','CODE_GENDER']]
X_train, X_test, Y_train, Y_test = train_test_split(X, target, test_size= 0.3, random_state = 0)
Feature_selection = ml_model(X_train,X_test, Y_train, Y_test)
Feature_selection

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
0,LogisticRegression,0.92,0.92,0.0,0.0,0.0
3,GaussianNB,0.92,0.92,0.0,0.0,0.0
4,RandomForestClassifier,1.0,0.92,0.32,0.02,0.04
1,KNeighborsClassifier,0.92,0.91,0.19,0.02,0.04
2,DecisionTreeClassifier,1.0,0.85,0.14,0.16,0.15


In [None]:
print('Before Oversampling:',Counter(Y_train))
oversample = RandomOverSampler(sampling_strategy='minority')
X_train1, Y_train1 = oversample.fit_resample(X_train, Y_train)
print('After Oversampling:',Counter(Y_train1))

Before Oversampling: Counter({0: 16761, 1: 1453})
After Oversampling: Counter({0: 16761, 1: 16761})


In [None]:
oversampling = ml_model(X_train1,X_test, Y_train1, Y_test)
oversampling

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
4,RandomForestClassifier,1.0,0.91,0.25,0.05,0.08
2,DecisionTreeClassifier,1.0,0.86,0.13,0.13,0.13
1,KNeighborsClassifier,0.93,0.75,0.12,0.31,0.17
0,LogisticRegression,0.56,0.5,0.1,0.63,0.17
3,GaussianNB,0.52,0.3,0.09,0.82,0.16


In [None]:
print('before SMOTE:',Counter(Y_train))
sm = SMOTE(sampling_strategy='minority')
X_train2, Y_train2 = sm.fit_resample(X_train, Y_train)
print('After SMOTE:',Counter(Y_train2))

before SMOTE: Counter({0: 16761, 1: 1453})
After SMOTE: Counter({0: 16761, 1: 16761})


In [None]:
Smote = ml_model(X_train2,X_test, Y_train2, Y_test)
Smote

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
4,RandomForestClassifier,1.0,0.82,0.14,0.24,0.18
1,KNeighborsClassifier,0.88,0.75,0.11,0.3,0.16
2,DecisionTreeClassifier,1.0,0.75,0.11,0.31,0.17
0,LogisticRegression,0.56,0.5,0.1,0.62,0.17
3,GaussianNB,0.53,0.31,0.09,0.82,0.16
