In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

df = pd.read_csv('Resources/loans_full_schema.csv')

In [2]:
column_info = []
object_df = pd.DataFrame()

for column in df.columns:
    if df[column].dtype == 'object':
      name = df[column].name
      object_df[name] = df[column].to_frame()
      column_data_type = df[column].dtype
      unique_values = df[column].nunique()
      sample_values = df[column].sample(5).values  # Get 5 sample values
      non_null = df[column].notnull().sum()
      null_values = df[column].isnull().sum()
      null_percent = df[column].isnull().sum()/len(df)  # percent nulls
      column_info.append({
          'Column Name': column,
          'Data Type': column_data_type,
          'Unique Values': unique_values,
          'Non Null': non_null,
          'Null Values': null_values,
          'Null Percent':  null_percent,
          'Sample Values': sample_values
      })

column_summary = pd.DataFrame(column_info)
num_rows, num_columns = object_df.shape

# Display the summary
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")
print("\nColumn Information:")
print(column_summary)

Number of rows: 10000
Number of columns: 13

Column Information:
                  Column Name Data Type  Unique Values  Non Null  Null Values  \
0                   emp_title    object           4741      9167          833   
1                       state    object             50     10000            0   
2               homeownership    object              3     10000            0   
3             verified_income    object              3     10000            0   
4   verification_income_joint    object              3      1455         8545   
5                loan_purpose    object             12     10000            0   
6            application_type    object              2     10000            0   
7                       grade    object              7     10000            0   
8                   sub_grade    object             32     10000            0   
9                 issue_month    object              3     10000            0   
10                loan_status    object     

In [3]:
object_df['emp_title'].fillna(value="Other",inplace=True)

In [4]:
object_df = object_df.drop('verification_income_joint', axis=1)

In [5]:
object_df.head()

Unnamed: 0,emp_title,state,homeownership,verified_income,loan_purpose,application_type,grade,sub_grade,issue_month,loan_status,initial_listing_status,disbursement_method
0,global config engineer,NJ,MORTGAGE,Verified,moving,individual,C,C3,Mar-18,Current,whole,Cash
1,warehouse office clerk,HI,RENT,Not Verified,debt_consolidation,individual,C,C1,Feb-18,Current,whole,Cash
2,assembly,WI,RENT,Source Verified,other,individual,D,D1,Feb-18,Current,fractional,Cash
3,customer service,PA,RENT,Not Verified,debt_consolidation,individual,A,A3,Jan-18,Current,whole,Cash
4,security supervisor,CA,RENT,Verified,credit_card,joint,C,C3,Mar-18,Current,whole,Cash


In [6]:
object_df.isnull().sum() 

emp_title                 0
state                     0
homeownership             0
verified_income           0
loan_purpose              0
application_type          0
grade                     0
sub_grade                 0
issue_month               0
loan_status               0
initial_listing_status    0
disbursement_method       0
dtype: int64

In [7]:
column_info = []
numerical_df = pd.DataFrame()

for column in df.columns:
    if df[column].dtype == 'int64' or df[column].dtype == 'float64':
      name = df[column].name
      numerical_df[name] = df[column].to_frame()
      column_data_type = df[column].dtype
      unique_values = df[column].nunique()
      sample_values = df[column].sample(5).values  # Get 5 sample values
      non_null = df[column].notnull().sum()
      null_values = df[column].isnull().sum()
      null_percent = df[column].isnull().sum()/len(df)  # percent nulls
      column_info.append({
          'Column Name': column,
          'Data Type': column_data_type,
          'Unique Values': unique_values,
          'Non Null': non_null,
          'Null Values': null_values,
          'Null Percent':  null_percent,
          'Sample Values': sample_values
      })

column_summary = pd.DataFrame(column_info)
num_rows, num_columns = df.shape

# Display the summary
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")
print("\nColumn Information:")
print(column_summary)

Number of rows: 10000
Number of columns: 55

Column Information:
                         Column Name Data Type  Unique Values  Non Null  \
0                         emp_length   float64             11      9183   
1                      annual_income   float64           1463     10000   
2                     debt_to_income   float64           3673      9976   
3                annual_income_joint   float64            596      1495   
4               debt_to_income_joint   float64           1189      1495   
5                          delinq_2y     int64             12     10000   
6           months_since_last_delinq   float64             97      4342   
7               earliest_credit_line     int64             53     10000   
8                 inquiries_last_12m     int64             26     10000   
9                 total_credit_lines     int64             78     10000   
10                 open_credit_lines     int64             45     10000   
11                total_credit_limi

In [8]:
numerical_df.isnull().sum()

emp_length                           817
annual_income                          0
debt_to_income                        24
annual_income_joint                 8505
debt_to_income_joint                8505
delinq_2y                              0
months_since_last_delinq            5658
earliest_credit_line                   0
inquiries_last_12m                     0
total_credit_lines                     0
open_credit_lines                      0
total_credit_limit                     0
total_credit_utilized                  0
num_collections_last_12m               0
num_historical_failed_to_pay           0
months_since_90d_late               7715
current_accounts_delinq                0
total_collection_amount_ever           0
current_installment_accounts           0
accounts_opened_24m                    0
months_since_last_credit_inquiry    1271
num_satisfactory_accounts              0
num_accounts_120d_past_due           318
num_accounts_30d_past_due              0
num_active_debit

In [9]:
numerical_df = numerical_df.drop(columns=['annual_income_joint','debt_to_income_joint','months_since_last_delinq','months_since_90d_late'])
     

In [10]:
numerical_df.isnull().sum()

emp_length                           817
annual_income                          0
debt_to_income                        24
delinq_2y                              0
earliest_credit_line                   0
inquiries_last_12m                     0
total_credit_lines                     0
open_credit_lines                      0
total_credit_limit                     0
total_credit_utilized                  0
num_collections_last_12m               0
num_historical_failed_to_pay           0
current_accounts_delinq                0
total_collection_amount_ever           0
current_installment_accounts           0
accounts_opened_24m                    0
months_since_last_credit_inquiry    1271
num_satisfactory_accounts              0
num_accounts_120d_past_due           318
num_accounts_30d_past_due              0
num_active_debit_accounts              0
total_debit_limit                      0
num_total_cc_accounts                  0
num_open_cc_accounts                   0
num_cc_carrying_

In [11]:
numerical_df = numerical_df.fillna(numerical_df.median())

In [12]:
numerical_df.isnull().sum()
     

emp_length                          0
annual_income                       0
debt_to_income                      0
delinq_2y                           0
earliest_credit_line                0
inquiries_last_12m                  0
total_credit_lines                  0
open_credit_lines                   0
total_credit_limit                  0
total_credit_utilized               0
num_collections_last_12m            0
num_historical_failed_to_pay        0
current_accounts_delinq             0
total_collection_amount_ever        0
current_installment_accounts        0
accounts_opened_24m                 0
months_since_last_credit_inquiry    0
num_satisfactory_accounts           0
num_accounts_120d_past_due          0
num_accounts_30d_past_due           0
num_active_debit_accounts           0
total_debit_limit                   0
num_total_cc_accounts               0
num_open_cc_accounts                0
num_cc_carrying_balance             0
num_mort_accounts                   0
account_neve

In [13]:
numerical_df.head()
     

Unnamed: 0,emp_length,annual_income,debt_to_income,delinq_2y,earliest_credit_line,inquiries_last_12m,total_credit_lines,open_credit_lines,total_credit_limit,total_credit_utilized,...,public_record_bankrupt,loan_amount,term,interest_rate,installment,balance,paid_total,paid_principal,paid_interest,paid_late_fees
0,3.0,90000.0,18.01,0,2001,6,28,10,70795,38767,...,0,28000,60,14.07,652.53,27015.86,1999.33,984.14,1015.19,0.0
1,10.0,40000.0,5.04,0,1996,1,30,14,28800,4321,...,1,5000,36,12.61,167.54,4651.37,499.12,348.63,150.49,0.0
2,3.0,40000.0,21.15,0,2006,4,31,10,24193,16000,...,0,2000,36,17.09,71.4,1824.63,281.8,175.37,106.43,0.0
3,1.0,30000.0,10.16,0,2007,0,4,4,25400,4997,...,0,21600,36,6.72,664.19,18853.26,3312.89,2746.74,566.15,0.0
4,10.0,35000.0,57.96,0,2008,7,22,16,69839,52722,...,0,23000,36,14.07,786.87,21430.15,2324.65,1569.85,754.8,0.0


In [14]:
numerical_df.shape

(10000, 38)

In [15]:
object_df.shape

(10000, 12)

In [16]:
df_concat = pd.concat([object_df,numerical_df],axis=1)

In [17]:
df_concat

Unnamed: 0,emp_title,state,homeownership,verified_income,loan_purpose,application_type,grade,sub_grade,issue_month,loan_status,...,public_record_bankrupt,loan_amount,term,interest_rate,installment,balance,paid_total,paid_principal,paid_interest,paid_late_fees
0,global config engineer,NJ,MORTGAGE,Verified,moving,individual,C,C3,Mar-18,Current,...,0,28000,60,14.07,652.53,27015.86,1999.33,984.14,1015.19,0.0
1,warehouse office clerk,HI,RENT,Not Verified,debt_consolidation,individual,C,C1,Feb-18,Current,...,1,5000,36,12.61,167.54,4651.37,499.12,348.63,150.49,0.0
2,assembly,WI,RENT,Source Verified,other,individual,D,D1,Feb-18,Current,...,0,2000,36,17.09,71.40,1824.63,281.80,175.37,106.43,0.0
3,customer service,PA,RENT,Not Verified,debt_consolidation,individual,A,A3,Jan-18,Current,...,0,21600,36,6.72,664.19,18853.26,3312.89,2746.74,566.15,0.0
4,security supervisor,CA,RENT,Verified,credit_card,joint,C,C3,Mar-18,Current,...,0,23000,36,14.07,786.87,21430.15,2324.65,1569.85,754.80,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,owner,TX,RENT,Source Verified,other,individual,A,A4,Jan-18,Current,...,1,24000,36,7.35,744.90,21586.34,2969.80,2413.66,556.14,0.0
9996,director,PA,MORTGAGE,Verified,debt_consolidation,individual,D,D3,Feb-18,Current,...,0,10000,36,19.03,366.72,9147.44,1456.31,852.56,603.75,0.0
9997,toolmaker,CT,MORTGAGE,Verified,debt_consolidation,joint,E,E2,Feb-18,Current,...,0,30000,36,23.88,1175.10,27617.65,4620.80,2382.35,2238.45,0.0
9998,manager,WI,MORTGAGE,Source Verified,other,individual,A,A1,Feb-18,Current,...,0,24000,36,5.32,722.76,21518.12,2873.31,2481.88,391.43,0.0


In [18]:
def categorize_risk(status):
    if status in ['Fully Paid', 'Current']:
        return 0
    elif status in ['In grace period', 'Late(31-120days)', 'Late(16-30days)']:
        return 1
    else:
        return 1

# Create new column 'Risk_Category' based on Loan_Status
df_concat['Risk_Category'] = df_concat['loan_status'].apply(categorize_risk)




In [19]:
df_concat.columns
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 51 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   emp_title                         10000 non-null  object 
 1   state                             10000 non-null  object 
 2   homeownership                     10000 non-null  object 
 3   verified_income                   10000 non-null  object 
 4   loan_purpose                      10000 non-null  object 
 5   application_type                  10000 non-null  object 
 6   grade                             10000 non-null  object 
 7   sub_grade                         10000 non-null  object 
 8   issue_month                       10000 non-null  object 
 9   loan_status                       10000 non-null  object 
 10  initial_listing_status            10000 non-null  object 
 11  disbursement_method               10000 non-null  object 
 12  emp_l

In [20]:
#df_concat.drop('loan_status', inplace=True, axis=1)


# Identify categorical variables in df_concat
categorical_variables = list(df_concat.dtypes[df_concat.dtypes == "object"].index)
categorical_numericals = list(df_concat.dtypes[df_concat.dtypes == "int64"].index)
display (categorical_variables)
display (categorical_numericals)

['emp_title',
 'state',
 'homeownership',
 'verified_income',
 'loan_purpose',
 'application_type',
 'grade',
 'sub_grade',
 'issue_month',
 'loan_status',
 'initial_listing_status',
 'disbursement_method']

['delinq_2y',
 'earliest_credit_line',
 'inquiries_last_12m',
 'total_credit_lines',
 'open_credit_lines',
 'total_credit_limit',
 'total_credit_utilized',
 'num_collections_last_12m',
 'num_historical_failed_to_pay',
 'current_accounts_delinq',
 'total_collection_amount_ever',
 'current_installment_accounts',
 'accounts_opened_24m',
 'num_satisfactory_accounts',
 'num_accounts_30d_past_due',
 'num_active_debit_accounts',
 'total_debit_limit',
 'num_total_cc_accounts',
 'num_open_cc_accounts',
 'num_cc_carrying_balance',
 'num_mort_accounts',
 'tax_liens',
 'public_record_bankrupt',
 'loan_amount',
 'term',
 'Risk_Category']

In [21]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse_output=False)
encoded_data = enc.fit_transform(df_concat[categorical_variables])

In [22]:
encoded_data

array([[0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [23]:
feature_names = enc.get_feature_names_out(categorical_variables)

# Create a new DataFrame with encoded data and appropriate column names
encoded_df = pd.DataFrame(
    encoded_data,
    columns=feature_names
)


In [24]:
encoded_df

Unnamed: 0,emp_title_ maintenance,emp_title_ admin,emp_title_ combo psc/hub,emp_title_ coordinator,emp_title_ director of engineering,emp_title_ district manager,emp_title_ freight worker,emp_title_ lead hydraulic mechanic,emp_title_ machine operator,emp_title_ quality control technician,...,loan_status_Charged Off,loan_status_Current,loan_status_Fully Paid,loan_status_In Grace Period,loan_status_Late (16-30 days),loan_status_Late (31-120 days),initial_listing_status_fractional,initial_listing_status_whole,disbursement_method_Cash,disbursement_method_DirectPay
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical features
scaled_numerical = scaler.fit_transform(numerical_df)

# Create a DataFrame for scaled numerical features
scaled_numerical_df = pd.DataFrame(scaled_numerical, columns=numerical_df.columns)

# Concatenate scaled numerical features with encoded features
df_scaled = pd.concat([encoded_df, scaled_numerical_df], axis=1)

In [26]:
df_scaled

Unnamed: 0,emp_title_ maintenance,emp_title_ admin,emp_title_ combo psc/hub,emp_title_ coordinator,emp_title_ director of engineering,emp_title_ district manager,emp_title_ freight worker,emp_title_ lead hydraulic mechanic,emp_title_ machine operator,emp_title_ quality control technician,...,public_record_bankrupt,loan_amount,term,interest_rate,installment,balance,paid_total,paid_principal,paid_interest,paid_late_fees
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.367190,1.129752,1.516684,0.328439,0.598041,1.260223,-0.125038,-0.234361,0.803250,-0.065908
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.598804,-1.102945,-0.659333,0.036489,-1.046902,-0.984292,-0.504067,-0.397975,-0.868306,-0.065908
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.367190,-1.394166,-0.659333,0.932336,-1.372981,-1.267985,-0.558973,-0.442581,-0.953479,-0.065908
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.367190,0.508480,-0.659333,-1.141310,0.637589,0.441019,0.206834,0.219425,-0.064791,-0.065908
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.367190,0.644383,-0.659333,0.328439,1.053683,0.699638,-0.042846,-0.083569,0.299889,-0.065908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.598804,0.741457,-0.659333,-1.015331,0.911333,0.715313,0.120152,0.133672,-0.084142,-0.065908
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.367190,-0.617576,-0.659333,1.320269,-0.371342,-0.533063,-0.262233,-0.268237,0.007893,-0.065908
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.367190,1.323900,-0.659333,2.290104,2.370445,1.320619,0.537278,0.125611,3.167942,-0.065908
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.367190,0.741457,-0.659333,-1.421262,0.836241,0.708466,0.095774,0.151236,-0.402544,-0.065908


In [27]:
df_scaled['Risk_Category'] = df_concat['Risk_Category'] 

In [28]:
# compute class weight

from sklearn.utils import compute_class_weight

X = df_scaled.drop(columns ='Risk_Category')
y = df_scaled['Risk_Category']

#split data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#compute class weights
class_weights = compute_class_weight('balanced', classes= np.unique(y_train), y=y_train)

# Create a dictionary to store the class weights
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# Print the computed class weights
print("Class Weights:", class_weight_dict)




Class Weights: {0: 0.509090909090909, 1: 28.0}


## Use scaled dataframe to train Machine Learning models. RandomForestClassifier

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report


# RandomForest Classifier with class weights
rf_classifier = RandomForestClassifier(random_state=42, class_weight= {0:0.0138, 1:(1-0.0138)})
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)

# Evaluating RandomForest Classifier
print("RandomForest Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:")
print(classification_report(y_test, rf_predictions, zero_division=1))

# Initialize AdaBoostClassifier
adb_classifier = AdaBoostClassifier(random_state=42)

# Train AdaBoostClassifier
adb_classifier.fit(X_train, y_train)

# Make predictions on the test set
adb_predictions = adb_classifier.predict(X_test)

# Evaluate AdaBoostClassifier
adb_accuracy = accuracy_score(y_test, adb_predictions)
print("\nAdaBoost Accuracy:", adb_accuracy)
print("AdaBoost Classification Report:")
print(classification_report(y_test, adb_predictions, zero_division=1))


RandomForest Classifier Performance:
Accuracy: 0.9826666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2947
           1       1.00      0.02      0.04        53

    accuracy                           0.98      3000
   macro avg       0.99      0.51      0.51      3000
weighted avg       0.98      0.98      0.97      3000


AdaBoost Accuracy: 1.0
AdaBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2947
           1       1.00      1.00      1.00        53

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



In [30]:
# SVM 

from sklearn.svm import SVC
svm_clf = SVC(class_weight={0: class_weights[0], 1: class_weights[1]}, random_state=42)
# Train the SVM classifier
svm_clf.fit(X_train, y_train)
# Predictions
y_pred_svm = svm_clf.predict(X_test)


In [31]:
svm_testing_report = classification_report(y_test, y_pred_svm)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(svm_testing_report)


Accuracy: 0.9966666666666667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2947
           1       0.96      0.85      0.90        53

    accuracy                           1.00      3000
   macro avg       0.98      0.92      0.95      3000
weighted avg       1.00      1.00      1.00      3000



## Next Steps:

1. tuning
2. model validation
3. Neuro Network


Presentation
Slide 1: Title Slide

Title: Machine Learning Classification Project
Subtitle: Predicting Risk Categories
Your Name
Date
Slide 2: Introduction

Brief overview of the project
Objective: Predicting risk categories using machine learning algorithms
Slide 3: Dataset Overview

Description of the dataset used
Number of features and samples
Target variable: Risk Category
Slide 4: Data Preprocessing

Steps involved in data preprocessing
Handling missing values
Encoding categorical variables
Scaling numerical features
Slide 5: Exploratory Data Analysis (EDA)

Visualizations of key features
Distribution of target variable
Correlation analysis
Slide 6: Model Selection

Description of machine learning algorithms used
Random Forest Classifier
AdaBoost Classifier
SVM Classifier
Slide 7: Model Training

Splitting the data into training and testing sets
Training each model on the training set
Brief description of hyperparameters used
Slide 8: Model Evaluation

Evaluation metrics used: Accuracy, Precision, Recall, F1-Score
Performance comparison of each model
Confusion matrices for each model
Slide 9: Results and Conclusion

Summary of the findings
Best performing model
Insights gained from the analysis
Future directions for improvement
Slide 10: Thank You

Acknowledgment
