In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


In [42]:
train_set=pd.read_csv('train_jan.csv')
test_set=pd.read_csv('test_jan.csv')

In [43]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Loan_ID                  164309 non-null  int64  
 1   Loan_Amount_Requested    164309 non-null  object 
 2   Length_Employed          156938 non-null  object 
 3   Home_Owner               138960 non-null  object 
 4   Annual_Income            139207 non-null  float64
 5   Income_Verified          164309 non-null  object 
 6   Purpose_Of_Loan          164309 non-null  object 
 7   Debt_To_Income           164309 non-null  float64
 8   Inquiries_Last_6Mo       164309 non-null  int64  
 9   Months_Since_Deliquency  75930 non-null   float64
 10  Number_Open_Accounts     164309 non-null  int64  
 11  Total_Accounts           164309 non-null  int64  
 12  Gender                   164309 non-null  object 
 13  Interest_Rate            164309 non-null  int64  
dtypes: f

In [44]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109541 entries, 0 to 109540
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Loan_ID                  109541 non-null  int64  
 1   Loan_Amount_Requested    109541 non-null  object 
 2   Length_Employed          104605 non-null  object 
 3   Home_Owner               92830 non-null   object 
 4   Annual_Income            92643 non-null   float64
 5   Income_Verified          109541 non-null  object 
 6   Purpose_Of_Loan          109541 non-null  object 
 7   Debt_To_Income           109541 non-null  float64
 8   Inquiries_Last_6Mo       109541 non-null  int64  
 9   Months_Since_Deliquency  50682 non-null   float64
 10  Number_Open_Accounts     109541 non-null  int64  
 11  Total_Accounts           109541 non-null  int64  
 12  Gender                   109541 non-null  object 
dtypes: float64(3), int64(4), object(6)
memory usage: 10.9+ MB


In [45]:
test_set_loan=test_set.iloc[:,0]

In [46]:
train_set.isnull().sum()


Loan_ID                        0
Loan_Amount_Requested          0
Length_Employed             7371
Home_Owner                 25349
Annual_Income              25102
Income_Verified                0
Purpose_Of_Loan                0
Debt_To_Income                 0
Inquiries_Last_6Mo             0
Months_Since_Deliquency    88379
Number_Open_Accounts           0
Total_Accounts                 0
Gender                         0
Interest_Rate                  0
dtype: int64

In [47]:
train_set.head(20)


Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
1,10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
3,10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1
5,10000006,4500,2 years,Rent,30000.0,VERIFIED - income source,credit_card,10.88,1,,12,15,Male,3
6,10000007,18075,4 years,Rent,85000.0,VERIFIED - income,debt_consolidation,5.65,0,,9,12,Male,3
7,10000008,15000,< 1 year,Rent,115000.0,VERIFIED - income,debt_consolidation,7.97,1,16.0,25,31,Female,2
8,10000009,6300,10+ years,Rent,,not verified,debt_consolidation,16.12,3,,8,9,Female,3
9,10000010,30000,10+ years,Own,163000.0,VERIFIED - income,debt_consolidation,11.13,0,,24,43,Male,1


In [48]:
train_set=train_set.drop('Loan_ID',axis=1)
test_set=test_set.drop('Loan_ID',axis=1)

In [49]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [50]:
train_set['Length_Employed'].fillna(method='pad',inplace=True)
test_set['Length_Employed'].fillna(method='pad',inplace=True)

In [51]:
train_set['Home_Owner'].fillna(method='pad',inplace=True)
test_set['Home_Owner'].fillna(method='pad',inplace=True)

In [52]:
train_set.isnull().sum()

Loan_Amount_Requested          0
Length_Employed                0
Home_Owner                     0
Annual_Income              25102
Income_Verified                0
Purpose_Of_Loan                0
Debt_To_Income                 0
Inquiries_Last_6Mo             0
Months_Since_Deliquency    88379
Number_Open_Accounts           0
Total_Accounts                 0
Gender                         0
Interest_Rate                  0
dtype: int64

In [53]:
train_set['Annual_Income'].fillna(train_set['Annual_Income'].median(),inplace=True)
test_set['Annual_Income'].fillna(test_set['Annual_Income'].median(),inplace=True)

In [54]:
train_set.isnull().sum()

Loan_Amount_Requested          0
Length_Employed                0
Home_Owner                     0
Annual_Income                  0
Income_Verified                0
Purpose_Of_Loan                0
Debt_To_Income                 0
Inquiries_Last_6Mo             0
Months_Since_Deliquency    88379
Number_Open_Accounts           0
Total_Accounts                 0
Gender                         0
Interest_Rate                  0
dtype: int64

In [55]:
train_set['Gender']=le.fit_transform(train_set['Gender'])
test_set['Gender']=le.fit_transform(test_set['Gender'])


In [56]:
train_set.head(20)

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,0,1
1,30000,4 years,Mortgage,63000.0,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,0,3
2,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,1,3
3,16000,< 1 year,Mortgage,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,1,3
4,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,0,1
5,4500,2 years,Rent,30000.0,VERIFIED - income source,credit_card,10.88,1,,12,15,1,3
6,18075,4 years,Rent,85000.0,VERIFIED - income,debt_consolidation,5.65,0,,9,12,1,3
7,15000,< 1 year,Rent,115000.0,VERIFIED - income,debt_consolidation,7.97,1,16.0,25,31,0,2
8,6300,10+ years,Rent,63000.0,not verified,debt_consolidation,16.12,3,,8,9,0,3
9,30000,10+ years,Own,163000.0,VERIFIED - income,debt_consolidation,11.13,0,,24,43,1,1


In [57]:
train_set_income=pd.get_dummies(train_set['Income_Verified'],drop_first=True)
test_set_income=pd.get_dummies(test_set['Income_Verified'],drop_first=True)

In [58]:
train_set=pd.concat([train_set,train_set_income],axis=1)
test_set=pd.concat([test_set,test_set_income],axis=1)

In [59]:
test_set=test_set.drop('Income_Verified',axis=1)
train_set=train_set.drop('Income_Verified',axis=1)

In [62]:
train_set_home=pd.get_dummies(train_set['Home_Owner'],drop_first=True)
test_set_home=pd.get_dummies(test_set['Home_Owner'],drop_first=True)
train_set_purpose=pd.get_dummies(train_set['Purpose_Of_Loan'],drop_first=True)
test_set_purpose=pd.get_dummies(test_set['Purpose_Of_Loan'],drop_first=True)

In [63]:
train_set=pd.concat([train_set,train_set_home],axis=1)
test_set=pd.concat([test_set,test_set_home],axis=1)

In [64]:
train_set=pd.concat([train_set,train_set_purpose],axis=1)
test_set=pd.concat([test_set,test_set_purpose],axis=1)

In [65]:
test_set=test_set.drop('Home_Owner',axis=1)
train_set=train_set.drop('Home_Owner',axis=1)
test_set=test_set.drop('Purpose_Of_Loan',axis=1)
train_set=train_set.drop('Purpose_Of_Loan',axis=1)

In [66]:
train_set.describe()

Unnamed: 0,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate,VERIFIED - income source,not verified,...,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding
count,164309.0,164309.0,164309.0,75930.0,164309.0,164309.0,164309.0,164309.0,164309.0,164309.0,...,164309.0,164309.0,164309.0,164309.0,164309.0,164309.0,164309.0,164309.0,164309.0,164309.0
mean,71752.84,17.207189,0.781698,34.229356,11.193818,25.067665,0.713144,2.158951,0.322654,0.315704,...,0.056412,0.004705,0.020906,0.009379,0.005928,0.050795,0.000749,0.014558,0.005094,0.005039
std,55698.55,7.845083,1.034747,21.76118,4.991813,11.583067,0.452295,0.738364,0.467493,0.464797,...,0.230716,0.068428,0.143069,0.096389,0.076764,0.219579,0.02735,0.119775,0.071191,0.070809
min,4000.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48600.0,11.37,0.0,16.0,8.0,17.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,63000.0,16.84,0.0,31.0,10.0,23.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,82000.0,22.78,1.0,50.0,14.0,32.0,1.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7500000.0,39.99,8.0,180.0,76.0,156.0,1.0,3.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [67]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 33 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Loan_Amount_Requested     164309 non-null  object 
 1   Length_Employed           164309 non-null  object 
 2   Annual_Income             164309 non-null  float64
 3   Debt_To_Income            164309 non-null  float64
 4   Inquiries_Last_6Mo        164309 non-null  int64  
 5   Months_Since_Deliquency   75930 non-null   float64
 6   Number_Open_Accounts      164309 non-null  int64  
 7   Total_Accounts            164309 non-null  int64  
 8   Gender                    164309 non-null  int32  
 9   Interest_Rate             164309 non-null  int64  
 10  VERIFIED - income source  164309 non-null  uint8  
 11  not verified              164309 non-null  uint8  
 12  None                      164309 non-null  uint8  
 13  Other                     164309 non-null  u

In [68]:
train_set['deliq'] = train_set['Months_Since_Deliquency'].apply(lambda x: 1 if x>=0.0 else 0)

In [26]:
test_set['deliq'] = test_set['Months_Since_Deliquency'].apply(lambda x: 0 if x>=0.0 else 0)

In [27]:
train_set['Months_Since_Deliquency'] = train_set['Months_Since_Deliquency'].apply(lambda x: x if x>=0.0 else 0)
test_set['Months_Since_Deliquency'] = test_set['Months_Since_Deliquency'].apply(lambda x: x if x>=0.0 else 0)

In [69]:
train_set_length=pd.get_dummies(train_set['Length_Employed'],drop_first=True)
test_set_length=pd.get_dummies(test_set['Length_Employed'],drop_first=True)
train_set=pd.concat([train_set,train_set_length],axis=1)
test_set=pd.concat([test_set,test_set_length],axis=1)
test_set=test_set.drop('Length_Employed',axis=1)
train_set=train_set.drop('Length_Employed',axis=1)

In [70]:
train_set_Y=train_set['Interest_Rate']

In [71]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 43 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Loan_Amount_Requested     164309 non-null  object 
 1   Annual_Income             164309 non-null  float64
 2   Debt_To_Income            164309 non-null  float64
 3   Inquiries_Last_6Mo        164309 non-null  int64  
 4   Months_Since_Deliquency   75930 non-null   float64
 5   Number_Open_Accounts      164309 non-null  int64  
 6   Total_Accounts            164309 non-null  int64  
 7   Gender                    164309 non-null  int32  
 8   Interest_Rate             164309 non-null  int64  
 9   VERIFIED - income source  164309 non-null  uint8  
 10  not verified              164309 non-null  uint8  
 11  None                      164309 non-null  uint8  
 12  Other                     164309 non-null  uint8  
 13  Own                       164309 non-null  u

In [72]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164309 entries, 0 to 164308
Data columns (total 43 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Loan_Amount_Requested     164309 non-null  object 
 1   Annual_Income             164309 non-null  float64
 2   Debt_To_Income            164309 non-null  float64
 3   Inquiries_Last_6Mo        164309 non-null  int64  
 4   Months_Since_Deliquency   75930 non-null   float64
 5   Number_Open_Accounts      164309 non-null  int64  
 6   Total_Accounts            164309 non-null  int64  
 7   Gender                    164309 non-null  int32  
 8   Interest_Rate             164309 non-null  int64  
 9   VERIFIED - income source  164309 non-null  uint8  
 10  not verified              164309 non-null  uint8  
 11  None                      164309 non-null  uint8  
 12  Other                     164309 non-null  uint8  
 13  Own                       164309 non-null  u

In [73]:
train_set=train_set.drop('Interest_Rate',axis=1)

In [74]:
train_set['Loan_Amount_Requested']=train_set['Loan_Amount_Requested'].str.replace(',','').astype(int)
test_set['Loan_Amount_Requested']=test_set['Loan_Amount_Requested'].str.replace(',','').astype(int)


In [75]:
train_set_Y=np.array(train_set_Y)
train_set_Y=train_set_Y.ravel()


In [77]:
train_set.columns

Index(['Loan_Amount_Requested', 'Annual_Income', 'Debt_To_Income',
       'Inquiries_Last_6Mo', 'Months_Since_Deliquency', 'Number_Open_Accounts',
       'Total_Accounts', 'Gender', 'VERIFIED - income source', 'not verified',
       'None', 'Other', 'Own', 'Rent', 'None', 'Other', 'Own', 'Rent',
       'credit_card', 'debt_consolidation', 'educational', 'home_improvement',
       'house', 'major_purchase', 'medical', 'moving', 'other',
       'renewable_energy', 'small_business', 'vacation', 'wedding', 'deliq',
       '10+ years', '2 years', '3 years', '4 years', '5 years', '6 years',
       '7 years', '8 years', '9 years', '< 1 year'],
      dtype='object')

In [76]:

model2=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=5,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
model2.fit(train_set,train_set_Y)
train_set_pred_xg=model2.predict(train_set)

test_set_pred_xg=model2.predict(test_set)

ValueError: feature_names must be unique

In [40]:
print(f1_score(train_set_Y, train_set_pred_xg, average='weighted'))
data1={'Loan_ID':test_set_loan,'Interest_Rate':test_set_pred_xg}
data1=pd.DataFrame(data1)
data1.to_csv('xg_jan.csv',index=False)

0.5425195978230782


In [None]:
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}
model2=XGBClassifier()
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
random_search=RandomizedSearchCV(model2,param_distributions=params,n_iter=5,scoring='f1_weighted',n_jobs=-1,cv=5,verbose=3)
random_search.fit(train_set,train_set_Y)
random_search.best_estimator_