In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sklearn

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score,train_test_split
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [None]:
# Read the competition train split, test split and sample submission in one pass.
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/jantahack/train_fNxu4vz.csv",
        "/kaggle/input/jantahack/test_fjtUOL8.csv",
        "/kaggle/input/jantahack/sample_submission_HSqiq1Q.csv",
    )
)

In [None]:
# Work on copies so the raw frames stay untouched, then stack train on top of
# test and renumber the rows 0..n-1.
train_df, test_df = train.copy(), test.copy()
joined_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

In [None]:
## Basic Pre-processed Data
joined_df = pd.read_csv("../input/preprocessed/preprocess/combined_basic.csv")


## Lets separate train and test and Label Encode and dummy encode and try which models best
# Rows before this position are treated as train, the remainder as test.
n_train_rows = 164309

# Build new frames with .drop(columns=...) instead of dropping in place on
# slices of joined_df, which triggers chained-assignment warnings.
train_df = joined_df.iloc[:n_train_rows].drop(columns="Loan_ID")
test_df = joined_df.iloc[n_train_rows:].drop(columns=["Loan_ID", "Interest_Rate"])
print(train_df.shape,test_df.shape)

# `columns=` keyword: DataFrame.drop no longer accepts the axis positionally
# (pandas >= 2.0 raises TypeError for drop("Interest_Rate", 1)).
X_train,y_train = train_df.drop(columns="Interest_Rate"),train_df.loc[:,"Interest_Rate"]
print(X_train.shape,y_train.shape)

# Hyper-parameter tuning

In [None]:
## XGboost baseline model and its params
# Collect the baseline settings in one mapping and unpack it into the estimator.
baseline_params = dict(
    objective='multi:softprob',
    learning_rate=0.300000012,
    n_estimators=100,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    scale_pos_weight=None,
    random_state=21,
    seed=21,
    n_jobs=-1,
    verbosity=3,
)
xgb_obj = XGBClassifier(**baseline_params)

## Tuning max_depth and min_child_weight
- These parameters affect XGBoost the most because they control the tree structure, so they are tuned first. All other parameters are left at their defaults.


## Fine-tuning the above findings

In [None]:
# Candidate values for the two tree-structure parameters being searched.
params = dict(
    max_depth=[5, 6, 7],
    min_child_weight=[6, 8, 10, 12, 14],
)

# max_depth / min_child_weight given here are overridden by each grid candidate.
model = XGBClassifier(
    objective='multi:softprob',
    learning_rate=0.300000012,
    n_estimators=100,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    verbosity=3,
    seed=21,
)

gsearch = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='f1_weighted',
    cv=4,
    n_jobs=-1,
    return_train_score=True,
    verbose=7,
)
gsearch.fit(X_train, y_train)
print(f'Best scor = {gsearch.best_score_} and fixing the params as {gsearch.best_params_}')

# Tabulate mean train/test scores per candidate, best cross-validated score first.
params = gsearch.cv_results_["params"]
test_f1 = gsearch.cv_results_["mean_test_score"]
train_f1 = gsearch.cv_results_["mean_train_score"]
df = (
    pd.DataFrame({"xaxis": params, "train": train_f1, "test": test_f1})
    .sort_values(by="test", ascending=False)
    .reset_index(drop=True)
)
df.head(10)

## Best score = 0.5315695946969097 and fixing the params as {'max_depth': 6, 'min_child_weight': 14}


## Tuning Gamma

In [None]:
# Candidate gamma values: 0.0, 0.1, 0.2, 0.3, 0.4
params = {
 'gamma':[i/10.0 for i in range(0,5)]
}

# max_depth / min_child_weight are fixed to the values chosen in the previous search.
xgb_obj = XGBClassifier(
                learning_rate=0.300000012,
                n_estimators=100,
                subsample=1,
                colsample_bytree=1,
                objective='multi:softprob', 
                random_state=21, n_jobs=-1,
                scale_pos_weight=None, 
                verbosity=3,
                max_depth=6,
                min_child_weight =14,
                seed=21
              )

# n_iter equals the grid size, so every gamma is evaluated exactly once
# (the default n_iter=10 exceeds the 5 candidates and only produces a warning).
rscv = RandomizedSearchCV(xgb_obj,params,n_iter=len(params['gamma']),random_state=21,cv=4,verbose=4,n_jobs=-1,return_train_score=True,scoring='f1_weighted')
rscv.fit(X_train,y_train)

# Read each candidate's gamma from cv_results_ rather than rebuilding the grid:
# candidates are sampled, so their order in cv_results_ is not guaranteed to
# follow the order the grid was written in.
x = [candidate["gamma"] for candidate in rscv.cv_results_["params"]]
test_f1= rscv.cv_results_["mean_test_score"]
train_f1 = rscv.cv_results_["mean_train_score"]
print(f'Best scor = {rscv.best_score_} and fixing the params as {rscv.best_params_}')

# Sort by gamma so the line plot runs left to right.
df = pd.DataFrame({"xaxis":x,"train":train_f1,"test":test_f1})
df = df.sort_values(by="xaxis").reset_index(drop=True)

# Plot with matplotlib (imported at the top of the notebook); the previous
# version called `go.Figure()` but plotly was never imported.
fig, ax = plt.subplots()
ax.plot(df["xaxis"], df["train"], marker="o", label="train_f1")
ax.plot(df["xaxis"], df["test"], marker="o", label="test_f1")
ax.set_xlabel("gamma")
ax.set_ylabel("mean weighted F1 (4-fold CV)")
ax.set_title("Weighted F1 vs gamma")
ax.legend()
plt.show()

## Best score = 0.531569862841758 and fixing the params as {'gamma': 0.3}

## Tuning subsample and colsample_bytree


In [None]:
# Candidate row- and column-sampling fractions: 0.6, 0.7, 0.8, 0.9 for each.
params = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
# subsample / colsample_bytree given here are overridden by each grid candidate.
# Fixed: a comma was missing between `verbosity=3` and `scale_pos_weight=1`,
# which made the whole cell a SyntaxError.
model = XGBClassifier( 
              colsample_bytree=0.8, gamma=0,learning_rate=0.300000012, max_depth=6,
              min_child_weight=14,n_estimators=100, n_jobs=-1,objective='multi:softprob', 
              subsample=0.8,verbosity=3, scale_pos_weight=1,seed=21)


gsearch = GridSearchCV(estimator = model, param_grid = params, scoring='f1_weighted',n_jobs=-1, cv=4,return_train_score=True,verbose=7)
gsearch.fit(X_train,y_train)

print(f'Best scor = {gsearch.best_score_} and fixing the params as {gsearch.best_params_}')
# Tabulate mean train/test scores per candidate, best cross-validated score first.
params = gsearch.cv_results_["params"]
test_f1= gsearch.cv_results_["mean_test_score"]
train_f1 = gsearch.cv_results_["mean_train_score"]
df = pd.DataFrame({"xaxis":params,"train":train_f1,"test":test_f1})
df = df.sort_values(by="test",ascending=False).reset_index(drop=True)
df[:10]


## After finding a good value, the search is repeated over more precise values around it, i.e. fine-tuning

# Here, we found 0.8 as the optimum value for both subsample and colsample_bytree. 


## Tuning learning rate

In [None]:
# Joint search space for the learning rate and the number of boosting rounds.
params = dict(
    learning_rate=[0.150000006, 0.05, 0.5, 0.1, 0.2],
    n_estimators=[200, 600, 60, 300, 150],
)

# Tree-structure parameters and gamma are fixed to the previously selected values.
xgb_obj = XGBClassifier(
    objective='multi:softprob',
    max_depth=6,
    min_child_weight=14,
    gamma=0.3,
    subsample=1,
    colsample_bytree=1,
    scale_pos_weight=None,
    random_state=21,
    seed=21,
    n_jobs=-1,
    verbosity=3,
)

rscv = RandomizedSearchCV(
    estimator=xgb_obj,
    param_distributions=params,
    scoring='f1_weighted',
    cv=4,
    random_state=21,
    n_jobs=-1,
    return_train_score=True,
    verbose=4,
)
rscv.fit(X_train, y_train)

# lr = 0.2
# n_estimator=150

# Binning Preprocessing

In [None]:
train = pd.read_csv("data-loan-cat/train_fNxu4vz.csv")
test = pd.read_csv("data-loan-cat/test_fjtUOL8.csv")
subm = pd.read_csv("data-loan-cat/sample_submission_HSqiq1Q.csv")


# Stack train on top of test so both receive identical preprocessing.
train_df,test_df = train.copy(),test.copy()
joined_df = pd.concat([train_df,test_df],axis=0)
joined_df.reset_index(drop=True,inplace=True)
print(f'OG Train shape ={train_df.shape}\nOG Test shape ={test_df.shape}\nOG Join shape ={joined_df.shape}')

## Converting to int
# Strip every non-digit character before casting. regex=True is required:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave the separators in place and make astype(int) fail. The raw string
# avoids the invalid '\D' escape sequence.
joined_df["Loan_Amount_Requested"]= joined_df["Loan_Amount_Requested"].str.replace(r'\D+','',regex=True).astype(int)


## Numerical and cat cols
numerical = [col for col in joined_df.columns if joined_df[col].dtype!='object']
categorical = [col for col in joined_df.columns if joined_df[col].dtype =='object']
# print(*numerical,sep=" ")
# print("--------------------------------------------\n")
# print(*categorical,sep=" ")


## Imputing missing values
# Mode for the categorical columns, median for the numeric ones. Assign the
# result back instead of calling fillna(inplace=True) on a column selection,
# which is chained assignment and is not guaranteed to update joined_df.
joined_df["Length_Employed"] = joined_df["Length_Employed"].fillna(joined_df["Length_Employed"].mode()[0])
joined_df["Home_Owner"] = joined_df["Home_Owner"].fillna(joined_df["Home_Owner"].mode()[0])
joined_df["Annual_Income"] = joined_df["Annual_Income"].fillna(joined_df["Annual_Income"].median())
joined_df["Months_Since_Deliquency"] = joined_df["Months_Since_Deliquency"].fillna(joined_df["Months_Since_Deliquency"].median())


## Dropping Id
joined_df.drop("Loan_ID",axis=1,inplace=True)
print(f'Shape after dropping loan id {joined_df.shape}')

## Binning length employed
# Keys are whitespace-stripped and the column is stripped before mapping, so a
# stray trailing space can no longer turn a label into NaN. '1 year' had no
# entry before and is binned with the other short tenures.
# NOTE(review): assumes the raw labels are '< 1 year', '1 year', ..., '10+ years'
# -- confirm against the value_counts() of the raw column.
mapping = {'< 1 year':"less_than4",
           '1 year':"less_than4",
           '2 years':"less_than4",
           '3 years':"less_than4",
           '4 years':"between_4to8",
           '5 years':"between_4to8",
           '6 years' : "between_4to8",
           '7 years':"between_4to8",
           '8 years':'greater_than8',
           "9 years":'greater_than8',
           "10+ years":'greater_than8'
    
}

length_employed_raw = joined_df['Length_Employed'].str.strip()
unmapped = sorted(set(length_employed_raw.unique()) - set(mapping))
if unmapped:
    # Surface labels that would otherwise silently become NaN (all-zero dummies).
    print(f'WARNING: unmapped Length_Employed labels become NaN: {unmapped}')
joined_df['Length_Employed'] = length_employed_raw.map(mapping)
print(joined_df["Length_Employed"].value_counts())


## Binning home owner
# 'None' is folded into 'Other'; keys are whitespace-stripped as above.
mapping = {'Mortgage':"Mortgage",
           'Rent':"Rent",
           'Own':"Own",
           'Other':"Other",
           'None':"Other"
           }

home_owner_raw = joined_df['Home_Owner'].str.strip()
unmapped = sorted(set(home_owner_raw.unique()) - set(mapping))
if unmapped:
    print(f'WARNING: unmapped Home_Owner labels become NaN: {unmapped}')
joined_df['Home_Owner'] = home_owner_raw.map(mapping)
print(joined_df["Home_Owner"].value_counts())

## One Hot Encode
# Keep the non-object columns as they are and append dummy columns for each
# categorical feature (first level dropped as the baseline).
joined_df = pd.concat([ 
            joined_df.select_dtypes(exclude='object'),
            pd.get_dummies(joined_df['Length_Employed'],drop_first = True),
            pd.get_dummies(joined_df['Home_Owner'],drop_first = True),
            pd.get_dummies(joined_df['Income_Verified'],drop_first = True),
            pd.get_dummies(joined_df['Purpose_Of_Loan'],drop_first = True),
            pd.get_dummies(joined_df['Gender'],drop_first = True)
            
            ],axis=1)

print(f'Shape after One HOt encode {joined_df.shape}')
joined_df.to_csv("combined_only_binning.csv",index=False)
print("Only binned saved !!")

## Polynomial Features

In [None]:
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

def create_ploynomial_interactions(df):
    """Expand ``df`` with all pairwise interaction (product) features.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame. Its columns are kept and one product column is
        added for every pair of columns, named ``"<left>_<right>"``.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The expanded frame with every all-zero column removed, carrying the
        same index as ``df``, and the full list of generated column names
        (including names of columns that were removed for being all zero).
    """
    print(f'Old shape of the df {df.shape}\n')

    # str() so that non-string column labels can still be joined into names.
    base_names = [str(col) for col in df.columns]
    combos = list(combinations(base_names, 2))
    print(f'combination  = {len(combos)}\n')

    # Originals first, then pairs in combinations() order; this is the layout
    # the names are assumed to share with the transformer output below.
    colnames = base_names+['_'.join(x) for x in combos]
    print(f'Total columns now = {len(colnames)}\n')

    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    values = poly.fit_transform(df)
    print(f'New shape of the df after creating new polynomial features = {values.shape}\n')

    # Keep the caller's index so the result can be re-joined with the target
    # column by index without misalignment.
    expanded = pd.DataFrame(values, columns=colnames, index=df.index)

    ## Dropping the columns which contains all zero as their values i.e column filled with zeroes
    # Positional boolean mask: only the all-zero columns themselves are removed,
    # even if another column happens to share the same name.
    all_zero = (expanded == 0).all().to_numpy()
    expanded = expanded.loc[:, ~all_zero]
    print(f'Final shape of the df after removing zero columns = {expanded.shape}\n')

    return expanded,colnames

# Separate the features from the target before generating interaction terms.
# `columns=` keyword: DataFrame.drop no longer accepts the axis positionally
# (pandas >= 2.0 raises TypeError for drop("Interest_Rate", 1)).
X = joined_df.drop(columns="Interest_Rate")
t = X.copy()
y = joined_df.loc[:,"Interest_Rate"]
X,old_cols= create_ploynomial_interactions(X)
print(X.shape)

# Re-attach the target (aligned on index) and persist the expanded dataset.
joined_df = pd.concat([X,y],axis=1)
print(f'Final join shape {joined_df.shape}')
joined_df.to_csv("binning_poly.csv",index=False)


In [None]:
## Let's separate train and test 
# Rows before this position are treated as train, the remainder as test.
n_train_rows = 164309
train_df = joined_df[:n_train_rows]

# Build the test frame as a new object (fresh 0-based index, target removed)
# instead of mutating a slice of joined_df in place.
test_df = joined_df[n_train_rows:].reset_index(drop=True).drop(columns=["Interest_Rate"])
print(f'Final Train shape {train_df.shape} \nFinal Test Shape {test_df.shape}')

## Polynomial Dataset
# `columns=` keyword: DataFrame.drop no longer accepts the axis positionally
# (pandas >= 2.0 raises TypeError for drop("Interest_Rate", 1)).
X_train,y_train = train_df.drop(columns="Interest_Rate"),train_df.loc[:,"Interest_Rate"]
print(f'Training shape {X_train.shape,y_train.shape}')

In [None]:
## Only Binned Dataset
# NOTE: `columns` (the selected feature-name list) is assigned in the cell that
# follows this one; on a fresh kernel that cell has to be run first.

only_binned = pd.read_csv("combined_only_binning.csv")
# Rows before this position are treated as train, the remainder as test.
n_train_rows = 164309
only_binned_train = only_binned[:n_train_rows]

# New frame with a fresh 0-based index and without the target, rather than
# in-place edits on a slice of only_binned.
only_binned_test = only_binned[n_train_rows:].reset_index(drop=True).drop(columns="Interest_Rate")

# `columns=` keyword: DataFrame.drop no longer accepts the axis positionally
# (pandas >= 2.0 raises TypeError for drop("Interest_Rate", 1)).
ob_Xtrain , ob_ytrain = only_binned_train.drop(columns="Interest_Rate"),only_binned_train.loc[:,"Interest_Rate"]
final_Xtrain = pd.concat([ob_Xtrain,X_train[columns]],axis=1)
final_Xtest = pd.concat([only_binned_test,test_df[columns]],axis=1)

# Expected widths before de-duplication; uses len(columns) instead of a
# hard-coded 100 so the check stays correct when the list changes.
print(ob_Xtrain.shape[1]+len(columns),only_binned_test.shape[1]+len(columns))
print(final_Xtrain.shape,final_Xtest.shape)



## Modelling
## feature_names must be unique Due to duplicate columns
# Keep only the first occurrence of every column name.
final_Xtrain = final_Xtrain.loc[:,~final_Xtrain.columns.duplicated()]
final_Xtest = final_Xtest.loc[:,~final_Xtest.columns.duplicated()]


In [None]:
# Feature names used to select columns from the polynomial train/test frames
# (X_train[columns], test_df[columns]). The Top-N cells below take the first
# N entries (columns[:60], columns[:50], columns[:40]), so the order matters.
# NOTE(review): presumably ordered by feature importance -- confirm the source
# of this ranking.
columns = ['Loan_Amount_Requested_Annual_Income',
 'Loan_Amount_Requested_Months_Since_Deliquency',
 'Loan_Amount_Requested_Inquiries_Last_6Mo',
 'Loan_Amount_Requested_Debt_To_Income',
 'Loan_Amount_Requested_Number_Open_Accounts',
 'vacation',
 'Loan_Amount_Requested_greater_than8',
 'Loan_Amount_Requested_Own',
 'Loan_Amount_Requested_house',
 'Loan_Amount_Requested_less_than4',
 'Loan_Amount_Requested_Other',
 'Loan_Amount_Requested_Total_Accounts',
 'major_purchase',
 'Months_Since_Deliquency',
 'Loan_Amount_Requested_debt_consolidation',
 'Loan_Amount_Requested',
 'Loan_Amount_Requested_credit_card',
 'Loan_Amount_Requested_Male',
 'Annual_Income_moving',
 'Loan_Amount_Requested_home_improvement',
 'Loan_Amount_Requested_educational',
 'Loan_Amount_Requested_not verified',
 'Annual_Income_Total_Accounts',
 'Loan_Amount_Requested_VERIFIED - income source',
 'Loan_Amount_Requested_medical',
 'Loan_Amount_Requested_moving',
 'Annual_Income_Other',
 'Loan_Amount_Requested_renewable_energy',
 'Annual_Income_Debt_To_Income',
 'Annual_Income_less_than4',
 'Loan_Amount_Requested_vacation',
 'Annual_Income_greater_than8',
 'Annual_Income_not verified',
 'Annual_Income_Number_Open_Accounts',
 'Annual_Income_Months_Since_Deliquency',
 'Loan_Amount_Requested_small_business',
 'Annual_Income_Own',
 'Annual_Income_Inquiries_Last_6Mo',
 'Loan_Amount_Requested_other',
 'Loan_Amount_Requested_major_purchase',
 'Annual_Income_educational',
 'Annual_Income_home_improvement',
 'Annual_Income_VERIFIED - income source',
 'Debt_To_Income',
 'home_improvement',
 'Annual_Income_house',
 'Annual_Income_credit_card',
 'Annual_Income',
 'Annual_Income_debt_consolidation',
 'wedding',
 'Annual_Income_medical',
 'educational',
 'house',
 'Total_Accounts',
 'medical',
 'Number_Open_Accounts',
 'Loan_Amount_Requested_wedding',
 'less_than4',
 'not verified',
 'Inquiries_Last_6Mo',
 'Other',
 'Annual_Income_major_purchase',
 'renewable_energy',
 'greater_than8',
 'moving',
 'Own',
 'Male',
 'debt_consolidation',
 'credit_card',
 'VERIFIED - income source',
 'other',
 'small_business']

In [None]:
# Classifier configured with the values selected in the tuning cells
# (learning_rate=0.2, n_estimators=150, max_depth=6, min_child_weight=14,
# gamma=0.3). random_state=21 is added so this estimator is seeded like
# every other XGBClassifier in the notebook.
xgb = XGBClassifier(
                learning_rate=0.2,
                n_estimators=150,
                subsample=1,
                colsample_bytree=1,
                objective='multi:softprob', 
                n_jobs=-1,
                scale_pos_weight=None, 
                verbosity=3,
                max_depth=6,
                min_child_weight =14,
                gamma = 0.3,
                random_state=21
               
)

In [None]:
def cross_val_evaluate(model,X,y,cv,scoring,verbose,model_name):
    """Cross-validate ``model`` and print the per-fold and mean scores.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    X, y : features and target passed to ``cross_val_score``.
    cv : forwarded unchanged to ``cross_val_score``.
    scoring : scoring name or callable forwarded to ``cross_val_score``.
    verbose : verbosity level forwarded to ``cross_val_score``.
    model_name : label printed in the report header.

    Returns
    -------
    None
        The scores are only printed.
    """
    weighted_f1s = cross_val_score(model,X,y,cv=cv,scoring=scoring,verbose=verbose,n_jobs=-1)
    # Average over the scores actually returned instead of dividing by `cv`,
    # so `cv` no longer has to be an integer fold count.
    mean_weighted_f1 = round(np.mean(weighted_f1s),5)
    print(f" -----------------------{model_name}-------------------------------")
    print(f" weightedF1 for folds = {weighted_f1s}\n And Mean weighted_f1 on cv = {mean_weighted_f1}\n\n")

# Top 60

In [None]:
## Top 60
# First 60 names of the selected-feature list.
top60 = list(columns[:60])

# Binned features side by side with the selected polynomial features; where a
# column name occurs more than once only its first occurrence is kept.
final_Xtrain = pd.concat([ob_Xtrain, X_train[top60]], axis=1).loc[
    :, lambda frame: ~frame.columns.duplicated()
]
final_Xtest = pd.concat([only_binned_test, test_df[top60]], axis=1).loc[
    :, lambda frame: ~frame.columns.duplicated()
]

# Default-parameter XGBoost scored with 3-fold weighted F1.
xgb60 = XGBClassifier(random_state=21, verbosity=3)
cross_val_evaluate(
    xgb60, final_Xtrain, y_train,
    cv=3, scoring='f1_weighted', verbose=4, model_name="XGB",
)


# Top 50

In [None]:
## Top 50
# First 50 names of the selected-feature list.
top50 = list(columns[:50])

# Binned features side by side with the selected polynomial features; where a
# column name occurs more than once only its first occurrence is kept.
final_Xtrain = pd.concat([ob_Xtrain, X_train[top50]], axis=1).loc[
    :, lambda frame: ~frame.columns.duplicated()
]
final_Xtest = pd.concat([only_binned_test, test_df[top50]], axis=1).loc[
    :, lambda frame: ~frame.columns.duplicated()
]

# Default-parameter XGBoost scored with 3-fold weighted F1.
xgb50 = XGBClassifier(random_state=21, verbosity=3)
cross_val_evaluate(
    xgb50, final_Xtrain, y_train,
    cv=3, scoring='f1_weighted', verbose=4, model_name="XGB",
)

# Top 40

In [None]:
## Top 40
# First 40 names of the selected-feature list.
top40 = list(columns[:40])

# Binned features side by side with the selected polynomial features; where a
# column name occurs more than once only its first occurrence is kept.
final_Xtrain = pd.concat([ob_Xtrain, X_train[top40]], axis=1).loc[
    :, lambda frame: ~frame.columns.duplicated()
]
final_Xtest = pd.concat([only_binned_test, test_df[top40]], axis=1).loc[
    :, lambda frame: ~frame.columns.duplicated()
]

# Default-parameter XGBoost scored with 3-fold weighted F1.
xgb40 = XGBClassifier(random_state=21, verbosity=3)
cross_val_evaluate(
    xgb40, final_Xtrain, y_train,
    cv=3, scoring='f1_weighted', verbose=4, model_name="XGB",
)