In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on the mounted Drive the current working directory.
%cd /content/drive/MyDrive/Projectnew_Credit_Scoring

/content/drive/MyDrive/Projectnew_Credit_Scoring


In [None]:
# Load the raw credit dataset from an Excel workbook into a DataFrame.
# NOTE(review): this absolute path points at the Colab runtime's /content
# directory rather than the Drive project folder selected with %cd, so the
# workbook presumably has to be uploaded manually each session — confirm the
# intended location.
dataset=pd.read_excel("/content/DATASET PREPARATION.xlsx")

# Data exploration, preparation and modelling

In [None]:
# Show the number of rows and columns in the raw dataset.
dataset.shape

(3000, 30)

In [None]:
# Show the first few rows of the dataset.
dataset.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,0,582,3,3,0,4,0.0,5,117,27,...,3.0,0.9179,0.2083,2,3,7,0.2083,4,4,0.0
1,0,662,15,9,0,3,1.0,3,14,14,...,1.0,0.8,0.0,0,0,0,1.0,12,0,1.0
2,0,805,0,0,0,1,5.0,1,354,7,...,5.0,0.3552,0.6538,0,1,1,0.7308,1,1,0.5263
3,0,1175,8,5,0,6,1.0,10,16,4,...,3.0,0.9127,0.25,1,1,1,0.75,7,1,1.3333
4,0,1373,3,1,0,9,0.0,8,130,52,...,1.0,1.2511,0.0,0,1,4,0.1429,3,1,0.0


In [None]:
# Drop the customer ID column so it is not used as a model feature.
# errors='ignore' keeps this cell idempotent: re-running it after ID has
# already been removed no longer raises KeyError.
dataset = dataset.drop(columns='ID', errors='ignore')
dataset.shape

(3000, 29)

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

TARGET                   0
DerogCnt                 0
CollectCnt               0
BanruptcyInd             0
InqCnt06                 0
InqTimeLast            188
InqFinanceCnt24          0
TLTimeFirst              0
TLTimeLast               0
TLCnt03                  0
TLCnt12                  0
TLCnt24                  0
TLSum (in rupees)        0
TLMaxSum(in rupees)      0
TLCnt                    3
TLSatCnt                 4
TLDel60Cnt               0
TLBadCnt24               0
TL75UtilCnt             99
TL50UtilCnt             99
TLBalHCPct              41
TLSatPct                 4
TLDel3060Cnt24           0
TLDel90Cnt24             0
TLDel60CntAll            0
TLOpenPct                3
TLBadDerogCnt            0
TLDel60Cnt24             0
TLOpen24Pct              3
dtype: int64

In [None]:
# Fill missing values in every column with that column's mean.
# NOTE(review): the means are computed over the full dataset, before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values. Consider fitting the imputation on the training split only.
dataset=dataset.fillna(dataset.mean())

In [None]:
# Re-count missing values per column after the mean imputation
# (every count is expected to be 0).
dataset.isna().sum()

TARGET                 0
DerogCnt               0
CollectCnt             0
BanruptcyInd           0
InqCnt06               0
InqTimeLast            0
InqFinanceCnt24        0
TLTimeFirst            0
TLTimeLast             0
TLCnt03                0
TLCnt12                0
TLCnt24                0
TLSum (in rupees)      0
TLMaxSum(in rupees)    0
TLCnt                  0
TLSatCnt               0
TLDel60Cnt             0
TLBadCnt24             0
TL75UtilCnt            0
TL50UtilCnt            0
TLBalHCPct             0
TLSatPct               0
TLDel3060Cnt24         0
TLDel90Cnt24           0
TLDel60CntAll          0
TLOpenPct              0
TLBadDerogCnt          0
TLDel60Cnt24           0
TLOpen24Pct            0
dtype: int64

In [None]:
# Class balance of TARGET: number of good loans (1) and bad loans (0).
dataset['TARGET'].value_counts()

1    2500
0     500
Name: TARGET, dtype: int64

In [None]:
# Mean of every feature within each TARGET class, for a quick comparison
# of the two groups.
dataset.groupby('TARGET').mean()

Unnamed: 0_level_0,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,TLCnt12,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
TARGET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.968,1.31,0.174,3.938,2.775459,4.882,155.672,12.992,0.228,1.768,...,4.53387,0.748185,0.385173,1.334,1.576,4.014,0.465127,2.554,2.086,0.600978
1,1.3224,0.7664,0.1492,2.9424,3.174638,3.2896,173.002,11.65,0.2844,1.832,...,3.986711,0.628177,0.544963,0.6044,0.6624,2.2236,0.502376,1.18,0.8648,0.556867


In [None]:
# Separate the label vector from the feature matrix.
# Columns are selected by name rather than by position, so this no longer
# depends on TARGET being the first column or on the frame having exactly
# 29 columns.
y = dataset['TARGET'].values
X = dataset.drop(columns='TARGET').values

In [None]:
# Hold out 20% of the rows as a test set (80:20 split).
# The split is stratified on y so both parts keep the same class ratio, and
# the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [None]:
# Standardise the features: learn the scaling statistics from the training
# split only, then apply the same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Export the fitted scaler (normalisation coefficients) for later use in prediction.
# It is saved into the Projectnew_Credit_Scoring folder, the same folder the
# classifier is exported to, so both artefacts can be loaded from one place.
# The path previously pointed at a different project folder
# (Project1_Credit_Scoring).
import joblib
joblib.dump(sc, '/content/drive/My Drive/Projectnew_Credit_Scoring/f2_Normalisation_CreditScoring')

['/content/drive/My Drive/Project1_Credit_Scoring/f2_Normalisation_CreditScoring']

In [None]:
# Fit a logistic-regression model with default settings on the scaled
# training data, then predict class labels for the held-out test rows.
classifier = LogisticRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Export the fitted logistic-regression classifier for later use in prediction.
# joblib is imported by the scaler-export cell earlier in the notebook.
joblib.dump(classifier, '/content/drive/My Drive/Projectnew_Credit_Scoring/f1_Classifier_CreditScoring')

['/content/drive/My Drive/Projectnew_Credit_Scoring/f1_Classifier_CreditScoring']

In [None]:
# Confusion matrix on the test set: rows are the true classes and columns the
# predicted classes, both in sorted label order.
print(confusion_matrix(y_test,y_pred))

[[ 16  84]
 [ 12 488]]


In [None]:
# Print accuracy, precision, recall and F1 on the test set, one value per line.
# Each metric is called with its default arguments.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

0.84
0.8531468531468531
0.976
0.9104477611940298


In [None]:
# Predicted class probabilities for each test row, one column per class.
# Column order presumably follows classifier.classes_ (expected [0, 1] here —
# confirm before relying on the prob_0 / prob_1 labels used later).
predictions = classifier.predict_proba(X_test)
predictions

array([[0.0150251 , 0.9849749 ],
       [0.06190025, 0.93809975],
       [0.07515409, 0.92484591],
       ...,
       [0.0497855 , 0.9502145 ],
       [0.16204709, 0.83795291],
       [0.06345689, 0.93654311]])

In [None]:
# Build the model output table and write it to an Excel file.

# One single-column frame for the true labels, a two-column frame for the
# class probabilities, and one single-column frame for the predicted labels.
df_test_dataset = pd.DataFrame({'Actual Outcome': y_test})
df_prediction_prob = pd.DataFrame(data=predictions, columns=['prob_0', 'prob_1'])
df_prediction_target = pd.DataFrame({'predicted_TARGET': classifier.predict(X_test)})

# Place the three frames side by side; they share the same default row index.
dfx = pd.concat(
    [df_test_dataset, df_prediction_prob, df_prediction_target],
    axis='columns',
)

dfx.to_excel("/content/BLANK.xlsx", index=False)
dfx.head()


Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1,0.015025,0.984975,1
1,1,0.0619,0.9381,1
2,1,0.075154,0.924846,1
3,0,0.111948,0.888052,1
4,1,0.124208,0.875792,1
