In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import roc_curve, auc, RocCurveDisplay, accuracy_score, f1_score,recall_score,precision_score
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
n=pd.read_csv('lending_club_loan_two.csv')
print(n.head(10))

   loan_amnt        term  int_rate  installment grade sub_grade  \
0    10000.0   36 months     11.44       329.48     B        B4   
1     8000.0   36 months     11.99       265.68     B        B5   
2    15600.0   36 months     10.49       506.97     B        B3   
3     7200.0   36 months      6.49       220.65     A        A2   
4    24375.0   60 months     17.27       609.33     C        C5   
5    20000.0   36 months     13.33       677.07     C        C3   
6    18000.0   36 months      5.32       542.07     A        A1   
7    13000.0   36 months     11.14       426.47     B        B2   
8    18900.0   60 months     10.99       410.84     B        B3   
9    26300.0   36 months     16.29       928.40     C        C5   

                       emp_title emp_length home_ownership  annual_inc  ...  \
0                      Marketing  10+ years           RENT    117000.0  ...   
1                Credit analyst     4 years       MORTGAGE     65000.0  ...   
2                   Stati

In [None]:
##### Feature Engineering #####

In [5]:
n['loan_status'].unique()

array(['Fully Paid', 'Charged Off'], dtype=object)

In [7]:
X=n.drop(['loan_status'],axis=1)
y=n['loan_status']

In [9]:
print(y)

0          Fully Paid
1          Fully Paid
2          Fully Paid
3          Fully Paid
4         Charged Off
             ...     
396025     Fully Paid
396026     Fully Paid
396027     Fully Paid
396028     Fully Paid
396029     Fully Paid
Name: loan_status, Length: 396030, dtype: object


In [11]:
n['term'].unique()

array([' 36 months', ' 60 months'], dtype=object)

In [13]:
n.isnull().sum()

loan_amnt                   0
term                        0
int_rate                    0
installment                 0
grade                       0
sub_grade                   0
emp_title               22927
emp_length              18301
home_ownership              0
annual_inc                  0
verification_status         0
issue_d                     0
loan_status                 0
purpose                     0
title                    1756
dti                         0
earliest_cr_line            0
open_acc                    0
pub_rec                     0
revol_bal                   0
revol_util                276
total_acc                   0
initial_list_status         0
application_type            0
mort_acc                37795
pub_rec_bankruptcies      535
address                     0
dtype: int64

In [14]:
df1=n.dropna()
df1.isnull().sum()

loan_amnt               0
term                    0
int_rate                0
installment             0
grade                   0
sub_grade               0
emp_title               0
emp_length              0
home_ownership          0
annual_inc              0
verification_status     0
issue_d                 0
loan_status             0
purpose                 0
title                   0
dti                     0
earliest_cr_line        0
open_acc                0
pub_rec                 0
revol_bal               0
revol_util              0
total_acc               0
initial_list_status     0
application_type        0
mort_acc                0
pub_rec_bankruptcies    0
address                 0
dtype: int64

In [16]:
df1['term'].unique()

array([' 36 months', ' 60 months'], dtype=object)

In [18]:
df1['term']=df1['term'].apply(lambda x: int(x.strip().split(' ')[0]))

In [20]:
df1['term']

0         36
1         36
2         36
3         36
4         60
          ..
396024    36
396025    60
396026    36
396027    36
396028    60
Name: term, Length: 335867, dtype: int64

In [23]:
print(df1)

        loan_amnt  term  int_rate  installment grade sub_grade  \
0         10000.0    36     11.44       329.48     B        B4   
1          8000.0    36     11.99       265.68     B        B5   
2         15600.0    36     10.49       506.97     B        B3   
3          7200.0    36      6.49       220.65     A        A2   
4         24375.0    60     17.27       609.33     C        C5   
...           ...   ...       ...          ...   ...       ...   
396024     6000.0    36     13.11       202.49     B        B4   
396025    10000.0    60     10.99       217.38     B        B4   
396026    21000.0    36     12.29       700.42     C        C1   
396027     5000.0    36      9.99       161.32     B        B1   
396028    21000.0    60     15.31       503.02     C        C2   

                      emp_title emp_length home_ownership  annual_inc  ...  \
0                     Marketing  10+ years           RENT    117000.0  ...   
1               Credit analyst     4 years       MO

In [25]:
df1['emp_length'].unique()

array(['10+ years', '4 years', '< 1 year', '6 years', '9 years',
       '2 years', '3 years', '8 years', '7 years', '5 years', '1 year'],
      dtype=object)

In [27]:
def fun1(x):
    try:
        x=x.strip()

        x=x.replace('+','').replace('years','').replace('<','').replace('year','').strip()

        return x
    except ValueError:
        print('Error')

In [29]:
df1['emp_length']=df1['emp_length'].apply(fun1)

In [31]:
df1['emp_length'].head(20)

0     10
1      4
2      1
3      6
4      9
5     10
6      2
7     10
8     10
9      3
10     2
11     8
12     7
13    10
14     9
15     8
16    10
17    10
18    10
19     7
Name: emp_length, dtype: object

In [33]:
df1.isnull().sum()

loan_amnt               0
term                    0
int_rate                0
installment             0
grade                   0
sub_grade               0
emp_title               0
emp_length              0
home_ownership          0
annual_inc              0
verification_status     0
issue_d                 0
loan_status             0
purpose                 0
title                   0
dti                     0
earliest_cr_line        0
open_acc                0
pub_rec                 0
revol_bal               0
revol_util              0
total_acc               0
initial_list_status     0
application_type        0
mort_acc                0
pub_rec_bankruptcies    0
address                 0
dtype: int64

In [34]:
df1.columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'purpose', 'title',
       'dti', 'earliest_cr_line', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'application_type',
       'mort_acc', 'pub_rec_bankruptcies', 'address'],
      dtype='object')

In [37]:
df1['issue_d']=pd.to_datetime(df1['issue_d'],format='%b-%Y')

In [39]:
print(df1['issue_d'])

0        2015-01-01
1        2015-01-01
2        2015-01-01
3        2014-11-01
4        2013-04-01
            ...    
396024   2013-03-01
396025   2015-10-01
396026   2015-02-01
396027   2013-10-01
396028   2012-08-01
Name: issue_d, Length: 335867, dtype: datetime64[ns]


In [41]:
df1['issue_d']

0        2015-01-01
1        2015-01-01
2        2015-01-01
3        2014-11-01
4        2013-04-01
            ...    
396024   2013-03-01
396025   2015-10-01
396026   2015-02-01
396027   2013-10-01
396028   2012-08-01
Name: issue_d, Length: 335867, dtype: datetime64[ns]

In [43]:
df1['earliest_cr_line']=pd.to_datetime(df1['earliest_cr_line'],format='%b-%Y')

In [45]:
df1['earliest_cr_line']

0        1990-06-01
1        2004-07-01
2        2007-08-01
3        2006-09-01
4        1999-03-01
            ...    
396024   1991-11-01
396025   2004-11-01
396026   2006-02-01
396027   1997-03-01
396028   1990-11-01
Name: earliest_cr_line, Length: 335867, dtype: datetime64[ns]

In [47]:
df1.columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'purpose', 'title',
       'dti', 'earliest_cr_line', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'application_type',
       'mort_acc', 'pub_rec_bankruptcies', 'address'],
      dtype='object')

In [49]:
df2=df1.drop(['title','sub_grade','verification_status','initial_list_status','revol_util','open_acc','pub_rec','mort_acc','address','issue_d','earliest_cr_line'],axis=1)

In [51]:
print(df2)

        loan_amnt  term  int_rate  installment grade                emp_title  \
0         10000.0    36     11.44       329.48     B                Marketing   
1          8000.0    36     11.99       265.68     B          Credit analyst    
2         15600.0    36     10.49       506.97     B             Statistician   
3          7200.0    36      6.49       220.65     A          Client Advocate   
4         24375.0    60     17.27       609.33     C  Destiny Management Inc.   
...           ...   ...       ...          ...   ...                      ...   
396024     6000.0    36     13.11       202.49     B  Michael's Arts & Crafts   
396025    10000.0    60     10.99       217.38     B         licensed bankere   
396026    21000.0    36     12.29       700.42     C                    Agent   
396027     5000.0    36      9.99       161.32     B             City Carrier   
396028    21000.0    60     15.31       503.02     C     Gracon Services, Inc   

       emp_length home_owne

In [53]:
######   PLK   ######    CONVERTING CATEGORICAL DATA INTO NUMERICAL DATA    ######   PLK ######

In [55]:
le=LabelEncoder()
df2['emp_title']=le.fit_transform(df2['emp_title'])
df2['home_ownership']=le.fit_transform(df2['home_ownership'])
df2['grade']=le.fit_transform(df2['grade'])
df2['loan_status']=le.fit_transform(df2['loan_status'])
df2['purpose']=le.fit_transform(df2['purpose'])
df2['application_type']=le.fit_transform(df2['application_type'])

In [56]:
print(df2)

        loan_amnt  term  int_rate  installment  grade  emp_title emp_length  \
0         10000.0    36     11.44       329.48      1      70215         10   
1          8000.0    36     11.99       265.68      1      28866          4   
2         15600.0    36     10.49       506.97      1     110940          1   
3          7200.0    36      6.49       220.65      0      23896          6   
4         24375.0    60     17.27       609.33      2      33212          9   
...           ...   ...       ...          ...    ...        ...        ...   
396024     6000.0    36     13.11       202.49      1      73210          5   
396025    10000.0    60     10.99       217.38      1     139501          2   
396026    21000.0    36     12.29       700.42      2       5084          5   
396027     5000.0    36      9.99       161.32      1      22540         10   
396028    21000.0    60     15.31       503.02      2      49459         10   

        home_ownership  annual_inc  loan_status  pu

In [59]:
X=df2.drop(['loan_status'],axis=1)
y=df2['loan_status']

In [61]:
print(y)

0         1
1         1
2         1
3         1
4         0
         ..
396024    1
396025    1
396026    1
396027    1
396028    1
Name: loan_status, Length: 335867, dtype: int32


In [77]:
x1=0
x2=0
for i in y:
    if i==0:
        x1+=1
    else:
        x2+=1
print(f'This is no.of 0 values:{x1}')
print(f'This is no.of 1 values:{x2}')

This is no.of 0 values:66312
This is no.of 1 values:269555


In [72]:
######   PLK   ######    SPLITING DATA INTO TRAIN & TEST    ######   PLK ######   

In [73]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [74]:
#print(X_train)

In [75]:
result={}

models={
    'linear':LogisticRegression(max_iter=500),
    'random':RandomForestClassifier(),
    'tree':DecisionTreeClassifier(),
    'svm':SVC(kernel='rbf',C=1,gamma='scale'),
    'neighbors':KNeighborsClassifier(),   
    'xgboost': xgb.XGBClassifier()

}



param_grids = {
        'linear': {
            'C': [0.1, 1, 10],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear']
        },
    'random': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },
    'tree': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'svm': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto']
    },
    'neighbors': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    },
    'xgboost': {
            'n_estimators': [50, 100],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        }
}


In [None]:

smote=SMOTE(random_state=42)   

CV=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

for model_name,model in models.items():
    pipeline=Pipeline([
    ('imputer',SimpleImputer()),
    ('scaler',StandardScaler()),
    ('classification',model)
    ])

    param_grid = {f'classification__{key}': value for key, value in param_grids[model_name].items()}
    print(f'training the current model is {model_name} is running...')

    grid_params=GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=CV,
        scoring='accuracy'
    )
    grid_params.fit(X_train, y_train)
    result[model_name] = grid_params

    for model_name, grid_params in result.items():
        best_params = grid_params.best_params_
        best_score = grid_params.best_score_
        print(f"Model: {model_name}")
        print(f"Best Parameters: {best_params}")
        print(f"Best Score: {best_score:.4f}")
        print("-" * 40)
    

training the current model is linear is running...


###### 