In [None]:
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

import datetime

In [None]:
# All inputs are CSV files read from the current working directory.

# Data dictionary shipped alongside the tables.
data_dict = pd.read_csv('data_dict.csv')

# Main train / test tables.
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Bureau train / test tables.
train_bureau = pd.read_csv('train_bureau.csv')
test_bureau = pd.read_csv('test_bureau.csv')

In [None]:
# (rows, columns) of the main train and test tables.
train_data.shape, test_data.shape

In [None]:
# Preview the first three rows of the training table.
train_data.head(3)

In [None]:
# Column dtypes and non-null counts of the training table.
train_data.info()

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
train_data.describe().T

In [None]:
# Give the misspelled / space-containing columns clean identifiers.
column_renames = {
    "InstlmentMode": 'InstalmentMode',
    "MaturityDAte": "MaturityDate",
    "Top-up Month": "TopUpMonth",
}
train_data = train_data.rename(columns=column_renames)

In [None]:
# Parse the two date columns, then derive the loan length in whole days.
date_columns = ['DisbursalDate', 'MaturityDate']
train_data[date_columns] = train_data[date_columns].apply(pd.to_datetime)

loan_span = train_data['MaturityDate'] - train_data['DisbursalDate']
train_data['LoanDuration'] = loan_span.dt.days

In [None]:
# Correlation between the numeric features of the dataset.
# The numeric columns are selected explicitly: train_data still holds string and
# datetime columns at this point, and from pandas 2.0 onwards DataFrame.corr()
# raises on them instead of silently dropping them.
numeric_features = train_data.select_dtypes(include=['number', 'bool'])
corrmat = numeric_features.corr()
plt.figure(figsize=(10,6))
cmap = sns.diverging_palette(0, 230, 70, 60, as_cmap=True)
sns.heatmap(corrmat, annot=True, cmap=cmap)

In [None]:
def info_data(data):
    """Summarise every column of ``data`` in a single table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with three columns: ``"Null"`` (number
        of missing values), ``"Unique Count"`` (number of distinct non-null
        values) and ``"Data type"`` (the column dtype).
    """
    # nunique() replaces describe(include='all', datetime_is_numeric=True).loc['unique']:
    #  * the datetime_is_numeric keyword was removed in pandas 2.0 (TypeError);
    #  * describe() only has a 'unique' row when the frame contains non-numeric
    #    columns, so an all-numeric frame raised KeyError;
    #  * numeric columns now get a real count instead of NaN.
    return pd.DataFrame({
        "Null": data.isnull().sum(),
        "Unique Count": data.nunique(),
        "Data type": data.dtypes,
    })

# Per-column null counts, unique counts and dtypes of the training table.
info_data(train_data)

In [None]:
def plot_missing_data(df):
    """Draw a horizontal bar chart of the percentage of missing values per column.

    Only columns with at least one missing value are shown, sorted from the
    highest to the lowest percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure. When ``df`` has no
        missing values a short message is printed and no figure is created.
    """
    # isna().mean() is the fraction of missing rows per column (sum / row count).
    null_pct = df.isna().mean() * 100
    null_pct = null_pct[null_pct > 0].sort_values(ascending=False)
    if null_pct.empty:
        # Nothing to plot: do not hand seaborn an empty series.
        print('No missing values in dataframe')
        return
    plt.figure(figsize=(8,6))
    sns.barplot(y=null_pct.index, x=null_pct, orient='h')
    plt.title('% Na values in dataframe by columns')
    
# Chart the percentage of missing values for the columns of the training table that have any.
plot_missing_data(train_data)

In [None]:
# Columns removed from the training table before modelling.
# Re-running this cell raises KeyError because the columns are already gone.
columns_to_drop = [
    "Area", "City", "ZiPCODE", "SEX", "AssetID", "State",
    'AuthDate', 'DisbursalDate', 'MaturityDate',
]
train_data = train_data.drop(columns=columns_to_drop)

In [None]:
# Re-check null counts, unique counts and dtypes after dropping columns.
info_data(train_data)

In [None]:
# Replace the TopUpMonth labels with integer codes 0-6.
# Keys must match the raw labels exactly (note the leading space in ' > 48 Months');
# values not listed here are left unchanged by replace().
top_up_codes = {
    'No Top-up Service': 0,
    '12-18 Months': 1,
    '18-24 Months': 2,
    '24-30 Months': 3,
    '30-36 Months': 4,
    '36-48 Months': 5,
    ' > 48 Months': 6,
}
train_data['TopUpMonth'] = train_data['TopUpMonth'].replace(top_up_codes)

In [None]:
# Strip separators from the PaymentMode labels: spaces are first turned into
# underscores, then every underscore is removed.
train_data['PaymentMode'] = (
    train_data['PaymentMode']
    .replace(' ', '_', regex=True)
    .replace('_', '', regex=True)
)

In [None]:
# Distinct PaymentMode values currently in the training table.
train_data['PaymentMode'].unique()

In [None]:
# Preview the first two rows of the training table.
train_data.head(2)

In [None]:
# Per-column null counts, unique counts and dtypes of the training table.
info_data(train_data)

In [None]:
# Relative frequency of each AGE value (value_counts skips missing values by default).
train_data.AGE.value_counts(normalize=True)

In [None]:
# Relative frequency of each MonthlyIncome value (value_counts skips missing values by default).
train_data.MonthlyIncome.value_counts(normalize=True)

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
train_data.describe().T

In [None]:
from sklearn.impute import KNNImputer

# Fill the missing AGE and MonthlyIncome values.
# NOTE(review): each column is imputed on its own, so a row with a missing value has
# no other feature to measure neighbour distance with; per the scikit-learn
# documentation KNNImputer then uses the column average. Pass several columns
# together if a true nearest-neighbour fill is intended.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns an (n_rows, 1) array; it is flattened and assigned
# positionally. Wrapping it in pd.DataFrame(...) aligned the values on a fresh
# 0..n-1 index and silently produced NaN whenever train_data's index was not the
# default one.
train_data["AGE"] = imputer.fit_transform(train_data[["AGE"]]).ravel()
train_data["MonthlyIncome"] = imputer.fit_transform(train_data[["MonthlyIncome"]]).ravel()

In [None]:
# Fill the missing LoanDuration values with the KNNImputer created in the previous cell.
# The (n_rows, 1) result is flattened and assigned positionally; wrapping it in
# pd.DataFrame(...) aligned on a fresh 0..n-1 index and silently produced NaN
# whenever train_data's index was not the default one.
train_data["LoanDuration"] = imputer.fit_transform(train_data[["LoanDuration"]]).ravel()

In [None]:
# Preview the first two rows after imputation.
train_data.head(2)

In [None]:
# Per-column null counts, unique counts and dtypes of the training table.
info_data(train_data)

In [None]:
# (rows, columns) of the training table at this point.
train_data.shape

In [None]:
# Preview the first five rows of the training table.
train_data.head()

--------------
--------------

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# One-hot encode the remaining non-numeric columns.
train_data = pd.get_dummies(train_data)

# Features are every column except the target (TopUpMonth) and the row identifier.
# (The bare `train_data` / `X.head()` statements that used to sit in this cell were
# removed: only the last expression of a cell is displayed, so they showed nothing.)
X = train_data.drop(['TopUpMonth','ID'], axis = 1)
y = train_data['TopUpMonth']

print("Data For Training model\n\n","Input data: ", X.shape, "\n", "Output data: ",y.shape)

# Hold out 30% of the rows for evaluation; random_state keeps the split reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# TopUpMonth classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
print("Training Data :","X_train:", X_train.shape, "--- y_train:", y_train.shape,"\nTesting Data  :" " X_test:",X_test.shape, " --- y_test:",y_test.shape)

# Baseline: XGBoost classifier with default hyper-parameters.
model_xgbc = XGBClassifier()
model_xgbc.fit(X_train, y_train)

# Evaluate on the held-out rows.
yhat_xgbc = model_xgbc.predict(X_test)
accuracy = accuracy_score(y_test, yhat_xgbc)
print("---------------------------------------------")
print('Accuracy: %.2f' % (accuracy*100))
print("---------------------------------------------")
print(classification_report(y_test,yhat_xgbc))

In [None]:

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

def compute_score(clf, X, y, scoring='accuracy', cv=5):
    """Return the mean cross-validated score of ``clf`` on ``(X, y)``.

    Parameters
    ----------
    clf : estimator
        Passed to ``cross_val_score`` as the estimator.
    X, y :
        Features and targets, forwarded unchanged to ``cross_val_score``.
    scoring : str, default 'accuracy'
        Metric forwarded to ``cross_val_score``.
    cv : int or cross-validation splitter, default 5
        Forwarded as ``cv``; the default keeps the previously hard-coded 5 folds.

    Returns
    -------
    The mean of the per-fold scores.
    """
    fold_scores = cross_val_score(clf, X, y, cv=cv, scoring=scoring)
    return np.mean(fold_scores)

def recover_train_test_target():
    """Split the module-level ``combined`` frame back into train and test parts.

    Reads the globals ``combined`` and ``data_train``; takes no arguments.

    Returns
    -------
    tuple
        ``(train, test, targets)``: the first 614 rows of ``combined``, the
        remaining rows, and ``data_train['Loan_Status']`` mapped 'Y' -> 1 and
        'N' -> 0.
    """
    # NOTE(review): 614 is presumably the number of training rows inside
    # `combined` — confirm against the code that builds it.
    n_head_rows = 614
    label_codes = {'Y':1,'N':0}

    encoded_targets = data_train['Loan_Status'].map(label_codes)
    head_part = combined.iloc[:n_head_rows]
    tail_part = combined.iloc[n_head_rows:]
    return head_part, tail_part, encoded_targets

# NOTE(review): this cell works on `combined` / `data_train` (via
# recover_train_test_target), on 'Loan_Status' / 'Loan_ID' columns and on
# 'test.csv'. None of these are created earlier in this notebook — it looks like
# it was written for a different dataset; confirm before running.
train, test, targets = recover_train_test_target()


# Fit a random forest only to obtain per-feature importances.
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(train, targets)

# Importance table: one row per column of `train`, highest importance first.
features = pd.DataFrame()
features['Feature'] = train.columns
features['Importance'] = clf.feature_importances_
features.sort_values(by=['Importance'], ascending=False, inplace=True)
features.set_index('Feature', inplace=True)

features.plot(kind='bar', figsize=(20, 10))

# Keep only the columns selected by SelectFromModel on the fitted forest.
# NOTE(review): train_reduced / test_reduced are computed but never used below —
# the final model is fitted on the full `train`. The bare `.shape` expressions are
# not displayed because they are not the last expression of the cell.
model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(train)
train_reduced.shape


test_reduced = model.transform(test)
test_reduced.shape

# Hyper-parameters of the final random forest.
parameters = {'bootstrap': False,
              'min_samples_leaf': 3,
              'n_estimators': 50,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6}

# `model` is rebound here from the feature selector to the final classifier.
model = RandomForestClassifier(**parameters)
model.fit(train, targets)


# Cross-validated accuracy; the value is discarded (not assigned, and not the
# last expression of the cell).
compute_score(model, train, targets, scoring='accuracy')


# Predict on `test`, map 1 -> 'Y' and anything else -> 'N', and write the result
# next to the Loan_ID column read from 'test.csv'.
output = model.predict(test).astype(int)
df_output = pd.DataFrame()
aux = pd.read_csv('test.csv')
df_output['Loan_ID'] = aux['Loan_ID']
df_output['Loan_Status'] = np.vectorize(lambda s: 'Y' if s==1 else 'N')(output)
df_output[['Loan_ID','Loan_Status']].to_csv('output.csv',index=False)

In [None]:


# NOTE(review): this cell expects `train` to contain a 'loan_default' column and
# `test` a 'UniqueID' column, and it uses `lgb` (presumably `import lightgbm as lgb`).
# None of these are set up earlier in this notebook — confirm the intended inputs
# and add the lightgbm import to the imports cell before running.
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

X=train.drop(['loan_default'],axis=1)
Y=train['loan_default']

# Hold out 33% of the rows as a fixed validation set.
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.33, random_state=42)


random_state=42
lgb_params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    # Key spelling fixed (was "min_sum_heassian_in_leaf"): LightGBM does not know
    # the misspelled name, so the value was never applied.
    "min_sum_hessian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state
}


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
# .copy() so that adding the prediction column below writes into an independent
# frame rather than a selection taken from `test`.
predictions = test[['UniqueID']].copy()
val_aucs = []

# NOTE(review): trn_idx / val_idx are never used — every fold (and every one of the
# N repeats inside it) trains on the full X_train and validates on X_val with the
# same seeds. Decide whether the folds should index into X_train or whether a
# single training run is what is wanted.
for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)):

    N = 5
    p_valid,yp = 0,0
    for i in range(N):
        trn_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val)

        evals_result = {}
        # NOTE(review): early_stopping_rounds / verbose_eval / evals_result are
        # keyword arguments of older LightGBM releases; newer releases expect
        # callbacks instead — confirm against the installed version.
        lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        100000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=3000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )
        p_valid += lgb_clf.predict(X_val)
        # NOTE(review): `test` is passed as-is (including 'UniqueID'); verify that
        # its columns match the columns of X.
        yp += lgb_clf.predict(test)
    # p_valid is a sum over the N repeats; AUC only depends on the ranking, so no
    # division is needed here.
    val_score = roc_auc_score(y_val, p_valid)
    val_aucs.append(val_score)


# yp holds the sum of the N predictions of the last fold only; dividing by N
# (instead of a literal 5) gives their mean.
predictions['loan_status']=yp/N

predictions.to_csv('submission.csv',index=False)

