State all imports over here

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import validation_curve

Importing the Dataset

In [None]:
# Locations of the two raw CSV files, relative to the notebook's working directory.
application_csv = '../input/credit-card-approval-prediction/application_record.csv'
credit_csv = '../input/credit-card-approval-prediction/credit_record.csv'

# client_info: application_record.csv; client_record: credit_record.csv.
client_info = pd.read_csv(filepath_or_buffer=application_csv)
client_record = pd.read_csv(filepath_or_buffer=credit_csv)

Data Pre-processing

* Dropping Columns
        1. OCCUPATION_TYPE: Lot of Missing Values
        2. CNT_CHILDREN : Highly Correlated to CNT_FAMILY_MEMBERS
        3. DAYS_BIRTH : Highly Correlated to DAYS_EMPLOYED
        4. FLAG_MOBIL : 1 for every data point
        5. FLAG_WORK_PHONE : Similar distribution to FLAG_PHONE
        
* Creating Dummies of Categorical Data
    
* Defining Response Variable, Dropping MONTHS_BALANCE
    
* Sampling the data set
    
* Normalising/ Scaling/ Standardising the data

* DAYS_EMPLOYED > 0 -> Unemployed -> Assign same value to all data points

In [None]:
# Seed shared by every stochastic step in this cell (shuffle, PCA solver) so that
# a fresh run reproduces the same X and y.
RANDOM_STATE = 42


# --- Dropping columns ---
client_info = client_info.drop(
    columns=["OCCUPATION_TYPE", "CNT_CHILDREN", "DAYS_BIRTH", "FLAG_MOBIL", "FLAG_WORK_PHONE"]
)


# --- Creating dummies ---
# Each categorical column is listed exactly once: a name that appears twice in
# `columns` is encoded twice and yields duplicated dummy columns.
categorical_columns = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_EDUCATION_TYPE",
    "NAME_HOUSING_TYPE",
    "NAME_INCOME_TYPE",
    "NAME_FAMILY_STATUS",
]
client_info = pd.get_dummies(client_info, columns=categorical_columns, drop_first=True)


# --- Handling DAYS_EMPLOYED ---
# Every positive value is collapsed to the single value 1; other values are kept.
client_info["DAYS_EMPLOYED"] = np.where(client_info["DAYS_EMPLOYED"] > 0, 1, client_info["DAYS_EMPLOYED"])


# --- Defining the response variable ---
client_record = client_record.drop(columns=["MONTHS_BALANCE"])
# 'X' and 'C' are mapped to -1 so the whole column can be parsed as numbers.
client_record["STATUS"] = np.where(client_record["STATUS"].isin(["X", "C"]), -1, client_record["STATUS"])
client_record["STATUS"] = pd.to_numeric(client_record["STATUS"])
# One row per ID: label 1 when the mean status over that ID's rows is positive, else 0.
client_record = client_record.groupby(["ID"]).mean()
client_record["STATUS"] = np.where(client_record["STATUS"] > 0, 1, 0)


# --- Merging the 2 datasets ---
# After the groupby, ID is the index of client_record; it is turned back into a
# column first so the join key is an ordinary column on both sides.
data = client_record.reset_index().merge(client_info, how="left", on="ID")
# Rows with any missing value after the left join are discarded.
data = data.dropna().set_index("ID").astype("float64")
print(data["STATUS"].value_counts())


# --- Shuffling the rows ---
data = data.sample(frac=1, random_state=RANDOM_STATE)


# --- Creating numpy arrays ---
# pop() removes STATUS from `data`, leaving only the feature columns behind.
y = data.pop("STATUS")
y = y.to_numpy()


# --- Normalising / Scaling / Standardising ---
# NOTE(review): the scaler and the PCA below are fitted on every row, so any
# cross-validation performed later on X sees statistics from its held-out rows.
# Consider fitting them inside each fold (e.g. with a Pipeline) -- TODO confirm.

# Normalise (the variant in use)
X = MinMaxScaler().fit_transform(data)

# Alternatives that were tried and are currently disabled:
#   X = scale(data)
#   X = RobustScaler().fit_transform(data)


# --- Dimensionality reduction ---
pca_em = PCA(n_components=20, random_state=RANDOM_STATE).fit_transform(X)
X = pca_em

# Disabled exploration: scatter plot of the first two PCA components, and a
# 2-component TSNE embedding computed from pca_em.

Simple Logistic Regression function
* inputs : numpy array 
* returns : predicted value

In [None]:
def Mylogistic(X_train,X_test,y_train,y_test):
    """Fit a logistic regression on the training data and predict the test data.

    X_train, y_train : training features and labels passed to ``fit``.
    X_test           : features passed to ``predict``.
    y_test           : accepted but not used inside this function.

    Returns the array of predictions for ``X_test``.
    """
    model = LogisticRegression(max_iter=3000)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

Implementing K folds
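* inputs : {X, y : numpy array, ratio : int (number of folds)}
* returns : nothing (prints the accuracy and confusion matrix of every fold, then plots training and validation scores)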


In [None]:
def split(X,y,ratio):
    """Evaluate logistic regression with K-fold cross-validation and plot a validation curve.

    Parameters
    ----------
    X : numpy array
        Feature matrix, indexed by row.
    y : numpy array
        Labels aligned with the rows of ``X``.
    ratio : int
        Number of folds. Used as ``n_splits`` for the K-fold loop and as ``cv``
        for the validation curve.

    Returns
    -------
    None
        The accuracy and confusion matrix of every fold are printed, and a plot
        of mean training/validation accuracy against ``C`` is shown.
    """
    kf = KFold(n_splits=ratio)

    # An RFE feature-selection step (35 features, logistic-regression estimator)
    # was tried at this point and is currently disabled.

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        ypred = Mylogistic(X_train, X_test, y_train, y_test)
        print(accuracy_score(y_test, ypred))
        print(confusion_matrix(y_test, ypred))

    # validation_curve needs to know which hyper-parameter to sweep and over
    # which values; without param_name/param_range the call raises TypeError.
    c_values = np.logspace(-3, 2, 6)
    train_scores, valid_scores = validation_curve(
        LogisticRegression(max_iter=3000),
        X,
        y,
        param_name="C",
        param_range=c_values,
        scoring="accuracy",
        cv=ratio,
    )

    # Scores have one row per C value and one column per fold; average over folds.
    fig, ax = plt.subplots()
    ax.semilogx(c_values, train_scores.mean(axis=1), marker="o", label="train")
    ax.semilogx(c_values, valid_scores.mean(axis=1), marker="o", label="validation")
    ax.set_xlabel("C")
    ax.set_ylabel("accuracy")
    ax.set_title("Logistic regression validation curve")
    ax.legend()
    plt.show()

    return

# Run the evaluation on the prepared arrays with 10 folds.
split(X, y, ratio=10)