### Importing libraries & functions





In [None]:
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

### Importing dataset

In [None]:
# Mount Google Drive into the runtime so files under /content/drive can be read and written.
# NOTE(review): google.colab is presumably only importable inside Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the credit-scoring workbook from the mounted Drive folder into a DataFrame.
dataset_path = "/content/drive/My Drive/Project1_Credit_Scoring/a_Dataset_CreditScoring.xlsx"
dataset = pd.read_excel(dataset_path)

### Data preparation

In [None]:
# Size of the loaded dataset as a (rows, columns) tuple
dataset.shape

(3000, 30)

In [None]:
# Preview the first few rows of the dataset
dataset.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,TLCnt12,TLCnt24,TLCnt,TLSum,TLMaxSum,TLSatCnt,TLDel60Cnt,TLBadCnt24,TL75UtilCnt,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,0,66,1,1,0,7,1.0,4,125,3,1,3,5,7.0,14701.0,17312.0,10.0,0,0,3.0,4.0,0.8492,0.6667,0,0,1,0.5833,0,0,0.7143
1,0,116,1,1,0,2,1.0,0,252,18,0,0,2,4.0,5471.0,11375.0,9.0,0,0,1.0,2.0,0.481,0.3,0,1,4,0.4,2,1,0.5
2,0,124,0,0,0,1,1.0,4,254,12,0,1,2,6.0,11522.0,13680.0,11.0,1,1,3.0,3.0,0.8423,0.6667,0,1,1,0.5,1,1,0.3333
3,0,128,0,0,0,6,3.0,6,154,3,1,9,11,9.0,8404.0,11490.0,14.0,1,1,4.0,5.0,0.7314,0.7647,0,1,1,0.5294,1,1,1.2222
4,0,143,0,0,0,1,0.0,1,311,17,0,0,1,5.0,24502.0,27715.0,7.0,0,0,2.0,3.0,0.8841,0.625,0,0,1,0.625,0,0,0.2


In [None]:
# Drop the customer ID column so it is not carried into the feature matrix.
# errors='ignore' keeps this cell idempotent: re-running it after 'ID' has
# already been removed no longer raises a KeyError.
dataset = dataset.drop(columns='ID', errors='ignore')
dataset.shape

(3000, 29)

In [None]:
# Count missing values per column before imputation
dataset.isna().sum()

TARGET               0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

In [None]:
# Impute each missing value with the mean of its own column.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split, so test rows influence the imputation — consider
# computing them on the training rows only.
column_means = dataset.mean()
dataset = dataset.fillna(column_means)

In [None]:
# Re-count missing values per column after imputation; every count should now be 0
dataset.isna().sum()

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

In [None]:
# # count of good loans (0) and bad loans (1)
# dataset['TARGET'].value_counts()

In [None]:
# # data summary across 0 & 1
# dataset.groupby('TARGET').mean()

### Train Test Split

In [None]:
# Separate the label from the features by column name instead of by position,
# so the selection does not silently pick the wrong columns if the frame gains,
# loses or reorders columns. Feature column order is preserved.
y = dataset['TARGET'].values
X = dataset.drop(columns='TARGET').values

In [None]:
# Hold out 20% of the rows as the test set (80:20 split); the fixed
# random_state makes the split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardise the features: fit the scaler on the training set only, then
# apply that same fitted transformation to both the training and test sets.
# NOTE(review): X_train / X_test are overwritten, so re-running this cell
# without re-running the split would refit the scaler on already-scaled data.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Export the fitted scaler (normalisation coefficients) for later use in prediction.
# joblib.dump is the last expression, so the cell displays the list of files written.
scaler_path = '/content/drive/My Drive/Project1_Credit_Scoring/f2_Normalisation_CreditScoring'
joblib.dump(sc, scaler_path)

['/content/drive/My Drive/Project1_Credit_Scoring/f2_Normalisation_CreditScoring']

### Risk Model building

In [None]:
# Fit a logistic-regression classifier with default settings on the scaled
# training data, then predict class labels for the scaled test set.
classifier = LogisticRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Export the fitted logistic-regression classifier for later use in prediction.
classifier_path = '/content/drive/My Drive/Project1_Credit_Scoring/f1_Classifier_CreditScoring'
joblib.dump(classifier, classifier_path)

['/content/drive/My Drive/Project1_Credit_Scoring/f1_Classifier_CreditScoring']

### Model performance

In [None]:
# Confusion matrix on the test set (rows: actual class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[475  20]
 [ 85  20]]


In [None]:
# Fraction of test rows whose predicted label matches the actual label
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.825


### Writing output file

In [None]:
# Predicted class probabilities for every test row, one column per class.
# NOTE(review): columns are presumably ordered as classifier.classes_ (0 then 1),
# which is how they are labelled when the output file is built — confirm.
predictions = classifier.predict_proba(X_test)
predictions

array([[0.28630013, 0.71369987],
       [0.9690028 , 0.0309972 ],
       [0.98533771, 0.01466229],
       ...,
       [0.56354729, 0.43645271],
       [0.64359874, 0.35640126],
       [0.92250937, 0.07749063]])

In [None]:
# Assemble the model output for the test set: actual label, predicted
# probability of each class, and the predicted label, one row per test sample.
dfx = pd.DataFrame({
    'Actual Outcome': y_test,
    'prob_0': predictions[:, 0],
    'prob_1': predictions[:, 1],
    'predicted_TARGET': classifier.predict(X_test),
})

# The target file has an .xlsx extension, so write a real Excel workbook.
# The previous to_csv call stored comma-separated text under that name, which
# spreadsheet tools reject as an invalid workbook. The index is still written
# as the first column, as before.
dfx.to_excel("/content/drive/My Drive/Project1_Credit_Scoring/c1_Model_Prediction.xlsx")

dfx.head()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1,0.2863,0.7137,1
1,0,0.969003,0.030997,0
2,0,0.985338,0.014662,0
3,0,0.99109,0.00891,0
4,0,0.676561,0.323439,0


### Coding ends here!