<a href="https://colab.research.google.com/github/ramesh0805/Major/blob/main/Copy_of_b_Code_CreditScoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing libraries & functions





In [39]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

### Importing dataset

In [40]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
dataset=pd.read_excel("/content/a_Dataset_CreditScoring.xlsx")

### Data preparation

# New Section

In [42]:
# shows count of rows and columns
dataset.shape

(3000, 30)

In [43]:
#shows first few rows of the code
dataset.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,582,3,3,0,4,0.0,5,117,27,...,3.0,0.9179,0.2083,2,3,7,0.2083,4,4,0.0
1,1,662,15,9,0,3,1.0,3,14,14,...,1.0,0.8,0.0,0,0,0,1.0,12,0,1.0
2,1,805,0,0,0,1,5.0,1,354,7,...,5.0,0.3552,0.6538,0,1,1,0.7308,1,1,0.5263
3,1,1175,8,5,0,6,1.0,10,16,4,...,3.0,0.9127,0.25,1,1,1,0.75,7,1,1.3333
4,1,1373,3,1,0,9,0.0,8,130,52,...,1.0,1.2511,0.0,0,1,4,0.1429,3,1,0.0


In [44]:
#dropping customer ID column from the dataset
dataset=dataset.drop('ID',axis=1)
dataset.shape

(3000, 29)

In [45]:
# explore missing values
dataset.isna().sum()

Unnamed: 0,0
TARGET,0
DerogCnt,0
CollectCnt,0
BanruptcyInd,0
InqCnt06,0
InqTimeLast,188
InqFinanceCnt24,0
TLTimeFirst,0
TLTimeLast,0
TLCnt03,0


In [46]:
# filling missing values with mean
dataset=dataset.fillna(dataset.mean())

In [47]:
# explore missing values post missing value fix
dataset.isna().sum()

Unnamed: 0,0
TARGET,0
DerogCnt,0
CollectCnt,0
BanruptcyInd,0
InqCnt06,0
InqTimeLast,0
InqFinanceCnt24,0
TLTimeFirst,0
TLTimeLast,0
TLCnt03,0


In [48]:
# # count of good loans (0) and bad loans (1)
# dataset['TARGET'].value_counts()

In [49]:
# # data summary across 0 & 1
# dataset.groupby('TARGET').mean()

### Train Test Split

In [50]:
y = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:29].values

In [51]:
# splitting dataset into training and test (in ratio 80:20)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=y)

In [52]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [53]:

# Exporting Normalisation Coefficients for later use in prediction
import joblib
joblib.dump(sc, '/content/f2_Normalisation_CreditScoring')

['/content/f2_Normalisation_CreditScoring']

### Risk Model building

In [54]:
classifier =  LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [55]:
# Exporting Logistic Regression Classifier for later use in prediction

# import joblib
joblib.dump(classifier, '/content/f1_Classifier_CreditScoring')

['/content/f1_Classifier_CreditScoring']

### Model *performance*

In [56]:
print(confusion_matrix(y_test,y_pred))

[[487  13]
 [ 87  13]]


In [57]:
print(accuracy_score(y_test, y_pred))

0.8333333333333334


### Writing output file

In [58]:
predictions = classifier.predict_proba(X_test)
predictions

array([[0.61598311, 0.38401689],
       [0.98854759, 0.01145241],
       [0.87096661, 0.12903339],
       ...,
       [0.9445091 , 0.0554909 ],
       [0.46946796, 0.53053204],
       [0.94009714, 0.05990286]])

In [60]:
# writing model output file

df_prediction_prob = pd.DataFrame(predictions, columns = ['prob_0', 'prob_1'])
df_prediction_target = pd.DataFrame(classifier.predict(X_test), columns = ['predicted_TARGET'])
df_test_dataset = pd.DataFrame(y_test,columns= ['Actual Outcome'])

dfx=pd.concat([df_test_dataset, df_prediction_prob, df_prediction_target], axis=1)

dfx.to_csv("/content/c1_Model_Prediction.xlsx", sep=',', encoding='UTF-8')

dfx.head()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1,0.615983,0.384017,0
1,0,0.988548,0.011452,0
2,1,0.870967,0.129033,0
3,0,0.954084,0.045916,0
4,1,0.726195,0.273805,0


In [63]:

# Assuming 'dfx' is your DataFrame and you have a model that produced the 'prob_0' and 'prob_1' columns.

# 1. Adjust the prediction threshold:
#  Currently, the model might be using a default threshold of 0.5.
#  If you examine the distribution of probabilities for correct and incorrect predictions, you may find an optimal threshold other than 0.5 that yields higher accuracy.

import numpy as np
from sklearn.metrics import accuracy_score

# Define a range of potential thresholds to try
thresholds = np.linspace(0,1, 101)
accuracies = []

for t in thresholds:
  predicted_class = (dfx['prob_1'] > t).astype(int)  # Assign class based on threshold
  accuracy = accuracy_score(dfx['Actual Outcome'], predicted_class)
  accuracies.append(accuracy)

best_threshold = thresholds[np.argmax(accuracies)]
print(f"Best threshold: {best_threshold}")

# Apply the best threshold to make predictions
dfx["new_prediction"] = (dfx['prob_1'] > best_threshold).astype(int)
print(f"Accuracy with the best threshold: {accuracy_score(dfx['Actual Outcome'], dfx['new_prediction'])}")


# 2. Feature Engineering (if applicable):
#  If you have access to more features or if you can create new features from the existing ones (interactions, combinations, transformations),  you might improve the model's accuracy.

# 3. Model Tuning:
#  If the model was trained with limited hyperparameter optimization, fine-tune the hyperparameters for the model. This is highly dependent on the actual model used to produce the probabilities in the first place.
#  For example, if you are using a logistic regression model, you could adjust the regularization strength (C parameter).  You could use GridSearchCV to automatically explore this parameter space and find the optimal values.

# 4. More Data:
#  If you can gather more data to train your model (if you are using a predictive model with probabilities) this will increase the model's accuracy.
#  (In this particular case, the probabilities are already provided).

# 5. Ensemble Methods:
#  If possible, consider combining multiple models (ensemble method) to get a better overall prediction.



#Example using a different metric, but the idea is the same. Find a threshold that maximizes a given metric.
from sklearn.metrics import f1_score

f1s = []
for t in thresholds:
    predicted_class = (dfx['prob_1'] > t).astype(int)
    f1 = f1_score(dfx['Actual Outcome'], predicted_class)
    f1s.append(f1)

best_threshold_f1 = thresholds[np.argmax(f1s)]
print(f"Best threshold (based on F1-score): {best_threshold_f1}")



Best threshold: 0.6900000000000001
Accuracy with the best threshold: 0.8366666666666667
Best threshold (based on F1-score): 0.19


### Coding ends here!