In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
#from imblearn.datasets import make_imbalance

In [2]:
# Read the training and test CSV files, then preview the training rows.
data, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_fNxu4vz.csv', 'test_fjtUOL8.csv')
)
data.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
1,10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
3,10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1


In [3]:
# Stack train on top of test so missing values can be handled for both at
# once, then replace the duplicated row labels with a fresh 0..n-1 index
# (the old labels end up in an 'index' column, which is discarded).
data3 = (
    pd.concat((data, test), axis=0)
    .reset_index()
    .drop(columns='index')
)

In [4]:
# Replace missing categorical values with the most frequent class of each column.
most_frequent_class = {
    'Length_Employed': '10+ years',
    'Home_Owner': 'Mortgage',
}
for column_name, fill_value in most_frequent_class.items():
    data3[column_name] = data3[column_name].fillna(fill_value)

In [5]:
# The income column is skewed, so replace it with its natural logarithm
# and look at the summary statistics of the transformed values.
raw_income = data3['Annual_Income']
data3['Annual_Income'] = np.log(raw_income)
data3['Annual_Income'].describe()

count    231850.000000
mean         11.058225
std           0.520884
min           8.006368
25%          10.714418
50%          11.050890
75%          11.396392
max          15.830414
Name: Annual_Income, dtype: float64

In [6]:
# Fill missing (log) incomes with the column median.
# The median is computed from the data (NaNs are skipped by default) rather
# than pasted from the describe() output, so it stays correct if the input
# files or the preceding transform change.
income_median = data3['Annual_Income'].median()
data3['Annual_Income'] = data3['Annual_Income'].fillna(income_median)

In [7]:
# Square-root transform the delinquency column; rows that are still null
# afterwards are set to 0, on the assumption that a missing value means
# there was no delinquency.
data3['Months_Since_Deliquency'] = (
    np.sqrt(data3['Months_Since_Deliquency']).fillna(0)
)

In [8]:
# Check column dtypes and non-null counts after the imputation steps.
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273850 entries, 0 to 273849
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Loan_ID                  273850 non-null  int64  
 1   Loan_Amount_Requested    273850 non-null  object 
 2   Length_Employed          273850 non-null  object 
 3   Home_Owner               273850 non-null  object 
 4   Annual_Income            273850 non-null  float64
 5   Income_Verified          273850 non-null  object 
 6   Purpose_Of_Loan          273850 non-null  object 
 7   Debt_To_Income           273850 non-null  float64
 8   Inquiries_Last_6Mo       273850 non-null  int64  
 9   Months_Since_Deliquency  273850 non-null  float64
 10  Number_Open_Accounts     273850 non-null  int64  
 11  Total_Accounts           273850 non-null  int64  
 12  Gender                   273850 non-null  object 
 13  Interest_Rate            164309 non-null  float64
dtypes: f

In [9]:
# Loan_Amount_Requested is stored as text with thousands separators;
# strip the commas and convert the column to integers.
data3['Loan_Amount_Requested'] = (
    data3['Loan_Amount_Requested']
    .str.replace(',', '')
    .astype('int64')
)


In [10]:
# Integer-encode each categorical column with its own, freshly fitted LabelEncoder.
for column_name in ['Length_Employed', 'Home_Owner', 'Income_Verified',
                    'Purpose_Of_Loan', 'Gender']:
    data3[column_name] = LabelEncoder().fit_transform(data3[column_name])

In [11]:
# Remove the Loan_ID column from the feature frame and preview the result.
data3 = data3.drop('Loan_ID', axis=1)
data3.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,7000,10,4,11.127263,2,0,18.37,0,0.0,9,14,0,1.0
1,30000,4,0,11.05089,0,2,14.93,0,4.123106,12,24,0,3.0
2,24725,7,0,11.232767,1,2,15.88,0,0.0,12,16,1,3.0
3,16000,10,0,10.93596,1,2,14.34,3,0.0,16,22,1,3.0
4,17000,8,3,11.472103,1,2,22.17,1,0.0,19,30,0,1.0


In [12]:
# Standardise the numeric features to zero mean and unit variance.
# NOTE(review): the mean/std are computed on the combined train+test frame.
numeric_columns = ['Loan_Amount_Requested', 'Annual_Income', 'Debt_To_Income',
                   'Inquiries_Last_6Mo', 'Months_Since_Deliquency',
                   'Number_Open_Accounts', 'Total_Accounts']
for header in numeric_columns:
    column = data3[header]
    data3[header] = (column - np.mean(column)) / np.std(column)

# Store the label-encoded categorical columns as int64.
categorical_columns = ['Length_Employed', 'Home_Owner', 'Income_Verified',
                       'Purpose_Of_Loan', 'Gender']
for header in categorical_columns:
    data3[header] = data3[header].astype('int64')

# Undo the earlier concatenation: the first len(data) rows came from the
# training file, the rest from the test file. Taking the boundary from
# `data` avoids hard-coding the training row count.
n_train = len(data)
train = data3.iloc[:n_train]
test = data3.iloc[n_train:]

# Target for the training rows (the test rows have no Interest_Rate).
y = train['Interest_Rate'].copy()

train = train.drop(columns='Interest_Rate')
test = test.drop(columns='Interest_Rate')

In [13]:
# Class distribution of the training target.
y.value_counts()

2.0    70580
3.0    59923
1.0    33806
Name: Interest_Rate, dtype: int64

# Oversampling with SMOTE to deal with class imbalance

In [14]:
from imblearn.over_sampling import SMOTE, ADASYN

# Oversample every class up to 70,600 rows. `sampling_strategy` is passed by
# keyword because recent imbalanced-learn releases only accept the
# constructor arguments as keywords.
# NOTE(review): resampling is done before the train/validation split, so
# synthetic rows interpolated from validation samples can end up in the
# training fold; the validation score may therefore be optimistic.
X_resampled, y_resampled = SMOTE(
    sampling_strategy={1: 70600, 2: 70600, 3: 70600},
    random_state=40,
).fit_resample(train, y)
print(sorted(Counter(y_resampled).items()))

[(1.0, 70600), (2.0, 70600), (3.0, 70600)]


In [15]:
import lightgbm as lgb
from sklearn.metrics import f1_score

In [16]:
# Hold out 20% of the resampled rows for validation. The labels are shifted
# down by one (y_resampled - 1) so that they start at 0.
xtrain, xval, ytrain, yval = train_test_split(
    X_resampled,
    y_resampled - 1,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Inspect the shifted (zero-based) training labels.
ytrain

187831    0.0
178009    0.0
125255    2.0
47300     2.0
43774     1.0
         ... 
119879    0.0
103694    1.0
131932    2.0
146867    2.0
121958    2.0
Name: Interest_Rate, Length: 169440, dtype: float64

In [18]:
# Wrap the train/validation splits in LightGBM datasets; the validation set
# is built with the training set as its reference.
lgb_train = lgb.Dataset(xtrain, ytrain)
lgb_eval = lgb.Dataset(xval, yval, reference=lgb_train)

# Model configuration: 3-class gradient-boosted trees scored by multi-class log loss.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'num_class': 3,
    'max_depth': -1,
}

print('Starting training...')
# Train for at most 5000 rounds, stopping once the validation log loss has
# not improved for 5 consecutive rounds. Early stopping is requested through
# the callback API because the `early_stopping_rounds=` keyword of
# `lgb.train` was removed in LightGBM 4.0. To get periodic metric logging on
# recent versions, add `lgb.log_evaluation(period=N)` to the callback list.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)
             






Starting training...
[1]	valid_0's multi_logloss: 1.09135
Training until validation scores don't improve for 5 rounds
[2]	valid_0's multi_logloss: 1.08421
[3]	valid_0's multi_logloss: 1.07758
[4]	valid_0's multi_logloss: 1.07157
[5]	valid_0's multi_logloss: 1.06554
[6]	valid_0's multi_logloss: 1.05972
[7]	valid_0's multi_logloss: 1.0544
[8]	valid_0's multi_logloss: 1.04921
[9]	valid_0's multi_logloss: 1.04458
[10]	valid_0's multi_logloss: 1.0399
[11]	valid_0's multi_logloss: 1.0355
[12]	valid_0's multi_logloss: 1.0312
[13]	valid_0's multi_logloss: 1.02726
[14]	valid_0's multi_logloss: 1.02343
[15]	valid_0's multi_logloss: 1.01984
[16]	valid_0's multi_logloss: 1.01621
[17]	valid_0's multi_logloss: 1.01289
[18]	valid_0's multi_logloss: 1.00959
[19]	valid_0's multi_logloss: 1.00647
[20]	valid_0's multi_logloss: 1.00363
[21]	valid_0's multi_logloss: 1.00089
[22]	valid_0's multi_logloss: 0.998237
[23]	valid_0's multi_logloss: 0.995839
[24]	valid_0's multi_logloss: 0.993381
[25]	valid_0's mu

[210]	valid_0's multi_logloss: 0.851436
[211]	valid_0's multi_logloss: 0.851262
[212]	valid_0's multi_logloss: 0.851101
[213]	valid_0's multi_logloss: 0.850993
[214]	valid_0's multi_logloss: 0.850794
[215]	valid_0's multi_logloss: 0.850644
[216]	valid_0's multi_logloss: 0.850458
[217]	valid_0's multi_logloss: 0.850171
[218]	valid_0's multi_logloss: 0.849991
[219]	valid_0's multi_logloss: 0.849729
[220]	valid_0's multi_logloss: 0.849453
[221]	valid_0's multi_logloss: 0.849329
[222]	valid_0's multi_logloss: 0.849165
[223]	valid_0's multi_logloss: 0.84901
[224]	valid_0's multi_logloss: 0.848899
[225]	valid_0's multi_logloss: 0.848798
[226]	valid_0's multi_logloss: 0.848628
[227]	valid_0's multi_logloss: 0.848485
[228]	valid_0's multi_logloss: 0.848374
[229]	valid_0's multi_logloss: 0.8482
[230]	valid_0's multi_logloss: 0.848105
[231]	valid_0's multi_logloss: 0.847775
[232]	valid_0's multi_logloss: 0.847481
[233]	valid_0's multi_logloss: 0.847244
[234]	valid_0's multi_logloss: 0.846928
[23

[422]	valid_0's multi_logloss: 0.825564
[423]	valid_0's multi_logloss: 0.825473
[424]	valid_0's multi_logloss: 0.825426
[425]	valid_0's multi_logloss: 0.825388
[426]	valid_0's multi_logloss: 0.825306
[427]	valid_0's multi_logloss: 0.825276
[428]	valid_0's multi_logloss: 0.825208
[429]	valid_0's multi_logloss: 0.82512
[430]	valid_0's multi_logloss: 0.825043
[431]	valid_0's multi_logloss: 0.824841
[432]	valid_0's multi_logloss: 0.824671
[433]	valid_0's multi_logloss: 0.824555
[434]	valid_0's multi_logloss: 0.824449
[435]	valid_0's multi_logloss: 0.824341
[436]	valid_0's multi_logloss: 0.824324
[437]	valid_0's multi_logloss: 0.824298
[438]	valid_0's multi_logloss: 0.824281
[439]	valid_0's multi_logloss: 0.824258
[440]	valid_0's multi_logloss: 0.824227
[441]	valid_0's multi_logloss: 0.824163
[442]	valid_0's multi_logloss: 0.824094
[443]	valid_0's multi_logloss: 0.824055
[444]	valid_0's multi_logloss: 0.823984
[445]	valid_0's multi_logloss: 0.82396
[446]	valid_0's multi_logloss: 0.823821
[4

[630]	valid_0's multi_logloss: 0.814029
[631]	valid_0's multi_logloss: 0.814006
[632]	valid_0's multi_logloss: 0.813967
[633]	valid_0's multi_logloss: 0.813926
[634]	valid_0's multi_logloss: 0.813884
[635]	valid_0's multi_logloss: 0.813858
[636]	valid_0's multi_logloss: 0.813841
[637]	valid_0's multi_logloss: 0.813824
[638]	valid_0's multi_logloss: 0.813797
[639]	valid_0's multi_logloss: 0.813778
[640]	valid_0's multi_logloss: 0.813743
[641]	valid_0's multi_logloss: 0.813724
[642]	valid_0's multi_logloss: 0.813704
[643]	valid_0's multi_logloss: 0.813669
[644]	valid_0's multi_logloss: 0.813639
[645]	valid_0's multi_logloss: 0.813611
[646]	valid_0's multi_logloss: 0.813593
[647]	valid_0's multi_logloss: 0.813574
[648]	valid_0's multi_logloss: 0.813565
[649]	valid_0's multi_logloss: 0.81356
[650]	valid_0's multi_logloss: 0.813549
[651]	valid_0's multi_logloss: 0.813481
[652]	valid_0's multi_logloss: 0.813404
[653]	valid_0's multi_logloss: 0.813321
[654]	valid_0's multi_logloss: 0.813269
[

[837]	valid_0's multi_logloss: 0.808817
[838]	valid_0's multi_logloss: 0.808805
[839]	valid_0's multi_logloss: 0.808793
[840]	valid_0's multi_logloss: 0.808789
[841]	valid_0's multi_logloss: 0.808737
[842]	valid_0's multi_logloss: 0.808692
[843]	valid_0's multi_logloss: 0.808644
[844]	valid_0's multi_logloss: 0.808617
[845]	valid_0's multi_logloss: 0.808579
[846]	valid_0's multi_logloss: 0.808541
[847]	valid_0's multi_logloss: 0.808532
[848]	valid_0's multi_logloss: 0.808495
[849]	valid_0's multi_logloss: 0.808469
[850]	valid_0's multi_logloss: 0.808466
[851]	valid_0's multi_logloss: 0.808441
[852]	valid_0's multi_logloss: 0.808425
[853]	valid_0's multi_logloss: 0.808423
[854]	valid_0's multi_logloss: 0.808424
[855]	valid_0's multi_logloss: 0.808417
[856]	valid_0's multi_logloss: 0.808412
[857]	valid_0's multi_logloss: 0.808407
[858]	valid_0's multi_logloss: 0.808406
[859]	valid_0's multi_logloss: 0.808397
[860]	valid_0's multi_logloss: 0.808388
[861]	valid_0's multi_logloss: 0.808351


In [19]:
# Per-class scores for the validation rows, using the best iteration found
# by early stopping. The former `axis=1` argument was dropped: it is not a
# `Booster.predict` parameter and was only forwarded as an unknown
# prediction option.
y_pred = gbm.predict(xval, num_iteration=gbm.best_iteration)


In [20]:
# Collapse the per-class scores to the index of the highest-scoring class.
y_pred = y_pred.argmax(axis=1)
y_pred

array([1, 2, 2, ..., 1, 0, 0], dtype=int64)

In [21]:
# Weighted-average F1 score on the held-out validation split.
f1_score(y_true=yval, y_pred=y_pred, average='weighted')

0.6120142782304377

In [22]:
# Score the test rows with the best early-stopped iteration. The former
# `axis=1` argument was dropped because `Booster.predict` has no such parameter.
ypredict = gbm.predict(test, num_iteration=gbm.best_iteration)
# Pick the highest-scoring class and shift back from the zero-based labels
# used for training to the original labels, which start at 1.
ypredict = np.argmax(ypredict, axis=1) + 1
ypredict

array([2, 1, 3, ..., 2, 3, 2], dtype=int64)

In [23]:
# Fill the sample submission with the predicted classes and write it out.
# NOTE(review): predictions are assigned by position, which assumes the
# sample submission lists loans in the same order as the test file — confirm.
submission = pd.read_csv('sample_submission_HSqiq1Q.csv')
submission['Interest_Rate'] = ypredict
submission = submission.set_index('Loan_ID', drop=True)
# `to_csv` returns None when given a path, so its result is not kept.
submission.to_csv('sol1.csv')