In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
df = pd.read_csv('../../Data/loan.csv', low_memory = False)

In [4]:
df['target'] = np.where(df['loan_status'].isin(['Default','Charged Off','Does not meet the credit policy. Status:Charged Off']),1,0)
df.drop(['loan_status'], axis = 1, inplace = True)
df2 = df.copy()

In [5]:
X = df2.drop(['target'], axis = 1)
y = df2['target']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state=42)

In [8]:
class DropNullFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,df):
        return self
    def transform(self,df):
        for col in df.columns:
            if df[col].isnull().mean() > 0.3:
                df.drop([col], axis = 1, inplace = True)
        print("Null features dropped")
        return df

In [9]:
class DropFeatures(BaseEstimator,TransformerMixin):
    def __init__(self,feature_to_drop = ['id','member_id','url','zip_code','emp_title','issue_d','addr_state','title','policy_code', 
                                        'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'collections_12_mths_ex_med',
                                        'acc_now_delinq', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'emp_length', 'earliest_cr_line']):
        self.feature_to_drop = feature_to_drop
    def fit(self,df):
        return self
    def transform(self,df):
        if (set(self.feature_to_drop).issubset(df.columns)):
            df.drop(self.feature_to_drop,axis=1,inplace=True)
            print("Irrelevant features dropped")
            return df
        else:
            print("One or more features are not in the dataframe 1")
            return df

In [10]:
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self,feat_with_outliers = ['int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
                                           'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
                                           'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'last_pymnt_amnt', 'tot_coll_amt', 
                                            'tot_cur_bal', 'total_rev_hi_lim']):
        self.feat_with_outliers = feat_with_outliers
    def fit(self,df):
        return self
    def transform(self,df):
        if (set(self.feat_with_outliers).issubset(df.columns)):
            # 25% quantile
            Q1 = df[self.feat_with_outliers].quantile(.25)
            # 75% quantile
            Q3 = df[self.feat_with_outliers].quantile(.75)
            IQR = Q3 - Q1
            # keep the data within 3 IQR
            df = df[~((df[self.feat_with_outliers] < (Q1 - 3 * IQR)) |(df[self.feat_with_outliers] > (Q3 + 3 * IQR))).any(axis=1)]
            print("Outliers removed")
            return df
        else:
            print("One or more features are not in the dataframe 2")
            print(df.columns)
            return df

In [11]:
class SkewnessHandler(BaseEstimator, TransformerMixin):
    def __init__(self,feat_with_skewness= ['annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
                                           'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
                                           'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'last_pymnt_amnt', 'tot_coll_amt', 
                                            'tot_cur_bal', 'total_rev_hi_lim']):
        self.feat_with_skewness = feat_with_skewness
    def fit(self,df):
        return self
    def transform(self,df):
        if (set(self.feat_with_skewness).issubset(df.columns)):
            # Handle skewness with cubic root transformation
            df[self.feat_with_skewness] = np.cbrt(df[self.feat_with_skewness])
            print("Skewness handled")
            return df
        else:
            print("One or more features are not in the dataframe 3")
            print(df.columns)
            return df

In [12]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns = ['term', 'grade', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 
                                 'initial_list_status', 'application_type']):
        self.columns = columns
        self.encoders = {}
    def fit(self, df, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(df[col])
            self.encoders[col] = le
        return self
    def transform(self, df):
        df_copy = df.copy()
        for col, encoder in self.encoders.items():
            df_copy[col] = encoder.transform(df[col])
        print("Lable encoding done")
        return df_copy

In [13]:
class CustomStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns = ['int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 
                                  'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 
                                  'total_rec_prncp', 'total_rec_int', 'last_pymnt_amnt', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']):
        self.columns = columns
        self.scalers = {}

    def fit(self, df, y=None):
        for col in self.columns:
            scaler = StandardScaler()
            scaler.fit(df[[col]])
            self.scalers[col] = scaler
        return self

    def transform(self, df):
        df_copy = df.copy()
        for col, scaler in self.scalers.items():
            df_copy[col] = scaler.transform(df[[col]])
        print("Scaling done")
        return df_copy

In [14]:
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 
                                  'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 
                                  'total_rec_prncp', 'total_rec_int', 'last_pymnt_amnt', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']):
        self.columns = columns
        self.imputers = {}
        
    def fit(self, df, y=None):
        for col in self.columns:
            imputer = SimpleImputer()
            imputer.fit(df[[col]])
            self.imputers[col] = imputer
        return self

    def transform(self, df):
        df_copy = df.copy()
        for col, imputer in self.imputers.items():
            df_copy[col] = imputer.transform(df[[col]])
        print("Imuting done")
        return df_copy

In [15]:
from imblearn.over_sampling import ADASYN
class ADASYNTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, target_column, random_state=None):
        self.target_column = target_column
        self.random_state = 87
        self.adasyn = ADASYN(random_state=random_state)
        
    def fit(self, X, y=None):
        # Determine the class to resample
        self.classes_, self.class_counts_ = np.unique(X[self.target_column], return_counts=True)
        self.class_to_resample_ = self.classes_[np.argmin(self.class_counts_)]
        return self
    
    def transform(self, X, y=None):
        # Extract features and target variable
        features = X.drop(columns=[self.target_column])
        target = X[self.target_column]
        
        # Apply ADASYN
        X_resampled, y_resampled = self.adasyn.fit_resample(features, target)
        
        # Combine resampled features and target into a DataFrame
        df_resampled = pd.DataFrame(X_resampled, columns=features.columns)
        df_resampled[self.target_column] = y_resampled
        print("Adaptive Synthetic Sampling done")
        return df_resampled


In [20]:
def full_pipeline_1(df):
    pipeline = Pipeline([
        ('drop_null_features', DropNullFeatures()),
        ('drop_features', DropFeatures()),
        ('missing_value_imputer', CustomImputer()),
        ('outlier_removal', OutlierRemover()),
        ('skewness_handler', SkewnessHandler()),
        ('label_encoder', CustomLabelEncoder()),
        ('standard_scaler', CustomStandardScaler()),
        # ('oversample', ADASYNTransformer('target'))
    ])
    df_train_new = pipeline.fit_transform(df_train)
    return df_train_new

In [21]:
df_train = pd.concat([X_train, y_train], axis = 1)

In [22]:
data_train = full_pipeline_1(df_train)

Null features dropped
Irrelevant features dropped
Imuting done
Outliers removed
Skewness handled
Lable encoding done
Scaling done


In [23]:
data_train.target.mean()

0.058193280183789844

In [14]:
# def full_pipeline_2(df):
#     pipeline = Pipeline([
#         ('drop_null_features', DropNullFeatures()),
#         ('drop_features', DropFeatures()),
#         ('missing_value_imputer', CustomImputer()),
#         ('outlier_removal', OutlierRemover()),
#         ('skewness_handler', SkewnessHandler()),
#         ('label_encoder', CustomLabelEncoder()),
#         # ('standard_scaler', CustomStandardScaler())
#     ])
    
#     df_new = pipeline.fit_transform(df)
#     return df_new

In [15]:
# data_no_outliers = full_pipeline_2(df2)

Null features dropped
Irrelevant features dropped
Imuting done
Outliers removed
Skewness handled
Lable encoding done
Scaling done


In [16]:
# data_no_outliers.shape

(365825, 34)

In [14]:
# X = data_outliers.drop(['target'], axis = 1)
# y = data_outliers['target']
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 43)

In [75]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
k = 5
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10)
kf = KFold(n_splits=k, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=kf)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))

Cross-validation scores: [0.98292285 0.98084837 0.98171771 0.98022894 0.98089307]
Mean CV score: 0.9813221906820144


In [15]:
# df_test = pd.concat([X_test, y_test], axis =1)

# def balance_subset(df, target_column):
#     positive_cases = df[df[target_column] == 1]
#     negative_cases = df[df[target_column] == 0]

#     num_samples_per_class = min(len(positive_cases), len(negative_cases))

#     balanced_subset = pd.concat([
#         positive_cases.sample(n=num_samples_per_class, replace=True, random_state=42),
#         negative_cases.sample(n=num_samples_per_class, replace=True, random_state=42)
#     ])

#     return balanced_subset

# balanced_subset_df = balance_subset(df_test, 'target')

In [16]:
# balanced_subset_df.shape

In [17]:
# X_balanced = balanced_subset_df.drop(['target'], axis = 1)
# y_balanced = balanced_subset_df['target']

In [24]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10)
X_train = data_train.drop(['target'], axis = 1)
y_train = data_train['target']
rf_classifier.fit(X_train,y_train)

In [26]:
rf_classifier.score(X_train, y_train)

0.9974496424029429

In [28]:
df_test = pd.concat([X_test, y_test], axis = 1)

In [29]:
data_test = full_pipeline_1(df_test)

Null features dropped
One or more features are not in the dataframe 1
Imuting done
Outliers removed
Skewness handled
Lable encoding done
Scaling done


In [34]:
X_test = data_test.drop(['target'], axis = 1)
y_test = data_test['target']

In [35]:
# rf_classifier.fit(X_train,y_train)
y_pred = rf_classifier.predict(X_test)

In [36]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print(accuracy_score(y_test,y_pred))

print(confusion_matrix(y_test,y_pred))

print(classification_report(y_test, y_pred))

0.9974496424029429
[[275486      0]
 [   746  16276]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    275486
           1       1.00      0.96      0.98     17022

    accuracy                           1.00    292508
   macro avg       1.00      0.98      0.99    292508
weighted avg       1.00      1.00      1.00    292508



In [37]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test,y_pred))

0.978087181294795


In [34]:
import graphviz
from sklearn.tree import export_graphviz

individual_tree = rf_classifier.estimators_[0]
dot_data = export_graphviz(individual_tree, out_file=None,
                                filled=True, rounded=True,  
                                special_characters=True)  
graph = graphviz.Source(dot_data)  
graph.render("individual_tree")  

'individual_tree.pdf'