In [72]:
# import libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [8]:
df = pd.read_excel('CreditWorthiness.xlsx')

In [86]:
# import models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [68]:
df.shape

(1000, 21)

In [20]:
df.sample(10)


Unnamed: 0,Cbal,Cdur,Chist,Cpur,Camt,Sbal,Edur,InRate,MSG,Oparties,...,Prop,age,inPlans,Htype,NumCred,JobType,Ndepend,telephone,foreign,creditScore
627,0 <= Rs. < 2000,36,all settled till now,second hand vehicle,28500,"1000 <= Rs. < 5,000",more than 7 years,4,single male,no one,...,Unknown,30,none,free,1,employee with official position,1,no,no,good
951,Rs. < 0,48,all settled till now,Business,42960,Rs. < 1000,less than 1 year,3,divorced or separated or married female,no one,...,life insurance/building society,24,none,pays rent,1,employee with official position,1,no,no,bad
146,no checking account,12,dues not paid earlier,furniture,19230,Rs. < 1000,more than 7 years,4,single male,no one,...,real estate,43,none,own,3,employee with official position,1,yes,no,good
460,no checking account,24,dues not paid earlier,renovation,54950,Rs. < 1000,more than 7 years,3,single male,no one,...,Unknown,44,none,free,2,employee with official position,1,no,no,good
406,Rs. < 0,24,all settled till now,electronics,16470,Rs. < 1000,less than 1 year,4,divorced or separated or married female,no one,...,Other cars etc.,29,none,pays rent,1,resident unskilled,1,yes,no,bad
542,0 <= Rs. < 2000,24,all settled,education,18250,Rs. < 1000,4 to 7 years,4,divorced or separated or married female,no one,...,Unknown,34,bank,free,1,resident unskilled,1,no,no,bad
984,Rs. < 0,48,dues not paid earlier,new vehicle,61310,Rs. < 1000,more than 7 years,4,divorced or separated or married female,no one,...,Unknown,58,stores,free,2,resident unskilled,1,no,no,bad
601,Rs. < 0,12,all settled till now,second hand vehicle,8880,no savings account,1 to 4 years,4,married or widowed male,no one,...,Other cars etc.,23,none,own,1,employee with official position,1,no,no,bad
868,no checking account,30,all settled till now,electronics,18550,no savings account,more than 7 years,4,single male,no one,...,Other cars etc.,58,none,own,1,employee with official position,1,yes,no,good
679,0 <= Rs. < 2000,30,none taken/all settled,Business,42680,"1000 <= Rs. < 5,000",1 to 4 years,4,divorced or separated or married female,no one,...,Other cars etc.,26,none,pays rent,2,resident unskilled,1,no,no,bad


In [21]:
df.describe()

Unnamed: 0,Cdur,Camt,InRate,age,NumCred,Ndepend
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,32592.58,2.973,35.546,1.407,1.155
std,12.058814,28227.36876,1.118715,11.375469,0.577654,0.362086
min,4.0,2380.0,1.0,19.0,1.0,1.0
25%,12.0,13535.0,2.0,27.0,1.0,1.0
50%,18.0,23075.0,3.0,33.0,1.0,1.0
75%,24.0,39602.5,4.0,42.0,2.0,1.0
max,72.0,184120.0,4.0,75.0,4.0,2.0


In [36]:
df.nunique()



Cbal             4
Cdur            33
Chist            4
Cpur            10
Camt           921
Sbal             5
Edur             5
InRate           4
MSG              4
Oparties         3
Rdur             4
Prop             4
age             53
inPlans          3
Htype            3
NumCred          4
JobType          4
Ndepend          2
telephone        2
foreign          2
creditScore      2
dtype: int64

In [59]:
df['Cbal'].unique()



array(['0 <= Rs. < 2000', 'no checking account', ' Rs. < 0', 'Rs. >=2000'],
      dtype=object)

In [112]:
# apply column transformer for encoding and feature scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('tnf1', OrdinalEncoder(categories=[['no checking account', ' Rs. < 0', '0 <= Rs. < 2000', 'Rs. >=2000']]), ['Cbal']),
        ('tnf2', OneHotEncoder(sparse=False, drop='first'), ['Chist', 'Cpur', 'Edur', 'MSG', 'Oparties', 'Rdur', 'Prop', 'inPlans', 'Htype', 'JobType']),
        ('tnf3', OrdinalEncoder(categories=[['no savings account', 'Rs. < 1000', '1000 <= Rs. < 5,000', '5000 <= Rs. < 10,000', 'Rs. >= 10,000']]), ['Sbal']),
        ('tnf4', OneHotEncoder(sparse=False, drop='if_binary'), ['telephone', 'foreign']),
        ('tnf5', StandardScaler(), ['Cdur', 'Camt', 'InRate', 'age', 'NumCred', 'Ndepend'])
    ],
    remainder='passthrough'
)

In [113]:
X = df.drop('creditScore', axis=1)
y = df['creditScore']

In [114]:
X_transformed = preprocessor.fit_transform(X)



In [115]:
X_transformed.shape

(1000, 44)

In [116]:
# tranform the data
ordinal_features = ['Cbal', 'Sbal']
categorical_features = ['Chist', 'Cpur', 'Edur', 'MSG', 'Oparties', 'Rdur', 'Prop', 'inPlans', 'Htype', 'JobType']
binary_features = ['telephone', 'foreign']
numeric_features = [col for col in X.columns if col not in ordinal_features + categorical_features + binary_features]

# Construct the feature names
ordinal_feature_names = ordinal_features
categorical_feature_names = preprocessor.named_transformers_['tnf2'].get_feature_names_out(categorical_features).tolist()
binary_feature_names = preprocessor.named_transformers_['tnf4'].get_feature_names_out(binary_features).tolist()
all_feature_names = ordinal_feature_names + categorical_feature_names + binary_feature_names + numeric_features

# Create DataFrame from the transformed data
X_transformed = pd.DataFrame(X_transformed, columns=all_feature_names)

In [117]:
X_transformed.head()

Unnamed: 0,Cbal,Sbal,Chist_all settled till now,Chist_dues not paid earlier,Chist_none taken/all settled,Cpur_domestic needs,Cpur_education,Cpur_electronics,Cpur_furniture,Cpur_miscellaneous,...,JobType_non resident either unemployed or unskilled,JobType_resident unskilled,telephone_yes,foreign_yes,Cdur,Camt,InRate,age,NumCred,Ndepend
0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,-0.987573,-0.666445,-0.870183,-0.751642,-0.704926,-0.42829
1,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-0.489762,-0.614696,0.918477,1.271265,1.027079,-0.42829
2,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.252574,-0.467248,0.918477,2.238742,-0.704926,-0.42829
3,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,2.248194,3.951952,-0.870183,-0.927547,-0.704926,-0.42829
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.256953,-0.031991,0.918477,-0.839594,-0.704926,-0.42829


In [118]:
# transform the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [119]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_encoded, test_size=0.2, random_state=42)


In [120]:
# apply and evaluate models
# Dictionary to store the models and their names
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}



In [121]:
# Function to train, predict and evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    return accuracy, conf_matrix, class_report


In [122]:

# Evaluate each model and print the results
for name, model in models.items():
    print(f"Evaluating {name}")
    accuracy, conf_matrix, class_report = evaluate_model(model, X_train, X_test, y_train, y_test)
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("\n" + "="*80 + "\n")

Evaluating Logistic Regression
Accuracy: 0.77
Confusion Matrix:
[[ 20  36]
 [ 10 134]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.36      0.47        56
           1       0.79      0.93      0.85       144

    accuracy                           0.77       200
   macro avg       0.73      0.64      0.66       200
weighted avg       0.75      0.77      0.74       200



Evaluating Decision Tree
Accuracy: 0.68
Confusion Matrix:
[[ 24  32]
 [ 32 112]]
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.43      0.43        56
           1       0.78      0.78      0.78       144

    accuracy                           0.68       200
   macro avg       0.60      0.60      0.60       200
weighted avg       0.68      0.68      0.68       200



Evaluating Random Forest
Accuracy: 0.805
Confusion Matrix:
[[ 27  29]
 [ 10 134]]
Classification Report:
              precision

In [100]:
# from the above analysis we see that the accuracy of random forest is the highest