In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide pandas options.
# chained_assignment=None turns off pandas' SettingWithCopyWarning everywhere.
pd.options.mode.chained_assignment = None
# Never truncate columns or rows when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load the raw tables from the input directory.
application_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")
application_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
application_describe = pd.read_csv("../input/home-credit-default-risk/HomeCredit_columns_description.csv")

# Keep the test ids aside: the test frame is later reduced to the model features,
# but the ids are still needed when the submission file is written.
test_sk_id_curr = application_test["SK_ID_CURR"]

# Label each shape with the frame it belongs to.
print("Application test: ",application_test.shape)
print("Application train: ",application_train.shape)

In [None]:
# Preview the first rows of the training table.
application_train.head()

In [None]:
# Preview the first rows of the test table.
application_test.head()

In [None]:
# Show only the description rows whose "Table" entry is the application train/test file.
is_application_row = application_describe["Table"].eq("application_{train|test}.csv")
application_describe.loc[is_application_row]

In [None]:
# Drop the row identifier from the training frame.
# errors='ignore' keeps the cell re-runnable: a second execution is a no-op
# instead of raising KeyError because the column is already gone.
application_train = application_train.drop(columns=['SK_ID_CURR'], errors='ignore')

In [None]:
# Summary statistics for the training table.
application_train.describe()

In [None]:
# Summary statistics for the test table.
application_test.describe()

In [None]:
# Percentage of missing entries per training column, largest first.
train_null_counts = application_train.isna().sum().sort_values(ascending=False)
missing_values = 100 * train_null_counts / len(application_train)
# Display only the columns that actually have gaps.
missing_values.loc[missing_values.ne(0)]

In [None]:
# Percentage of missing entries per test column, largest first.
test_null_counts = application_test.isna().sum().sort_values(ascending=False)
missing_values = 100 * test_null_counts / len(application_test)
# Display only the columns that actually have gaps.
missing_values.loc[missing_values.ne(0)]

In [None]:
def _correlation_colour(value):
    """Return the CSS background for a single correlation coefficient.

    Bands on the absolute value: above 0.5 is red, above 0.1 is yellow,
    above 0.01 is green, anything else (including NaN) is left unstyled.
    The checks cascade, so values sitting exactly on 0.5 or 0.1 fall into
    the next band down instead of ending up with no colour at all.
    """
    magnitude = abs(value)
    if magnitude > 0.5:
        return "background: red"
    if magnitude > 0.1:
        return "background: yellow"
    if magnitude > 0.01:
        return "background: green"
    return ""


# Pairwise correlations between the non-object columns of the training frame,
# colour-coded row by row according to their strength.
corr_table = application_train.select_dtypes(exclude=[object]).corr()
corr_table.style.apply(lambda row: [_correlation_colour(v) for v in row], axis=1)

In [None]:
# Columns fed to the model; the order here is the column order of the reduced frames.
features_for_model = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
    'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE', 'NAME_EDUCATION_TYPE',
    'DAYS_BIRTH', 'NAME_FAMILY_STATUS', 'DAYS_EMPLOYED',
    'REGION_RATING_CLIENT', 'CNT_FAM_MEMBERS', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
    'EXT_SOURCE_2', 'EXT_SOURCE_3',
]

# Pull the label out first: TARGET is not in the feature list, so it is no longer
# part of the training frame once that frame is narrowed down below.
application_y = application_train.loc[:, "TARGET"]
application_train = application_train.loc[:, features_for_model]
application_test = application_test.loc[:, features_for_model]


In [None]:
# Impute missing values.
# Numeric columns are filled with the mean computed on the TRAINING frame, and the
# same value is used for the test frame, so both frames go through an identical
# transformation. Non-numeric columns get the '.' placeholder.
numeric_fill_values = {
    name: column.mean()
    for name, column in application_train.items()
    if column.dtype.kind in 'biufc'
}


def _impute(frame):
    """Fill gaps column by column: training mean for numeric columns, '.' otherwise."""
    return frame.apply(
        lambda column: column.fillna(numeric_fill_values[column.name])
        if column.name in numeric_fill_values
        else column.fillna('.')
    )


application_train = _impute(application_train)
application_test = _impute(application_test)

In [None]:
# Histogram of the target values.
sns.histplot(application_y)

In [None]:
def plot_hist(label,size=(5,5)):
  """Draw three histograms of one feature column, each in its own figure.

  Figure 1: the full training column with the rows where ``application_y == 0``
  overlaid in green. Figure 2: the full training column with the rows where
  ``application_y == 1`` overlaid in red. Figure 3: the same column taken from
  the test frame.

  Reads the notebook-level ``application_train``, ``application_y`` and
  ``application_test`` objects.

  Args:
    label: name of the column to plot; it is looked up in both frames.
    size: ``(width, height)`` passed to ``plt.figure`` for every figure.
  """
  train_column = application_train[label]

  # Same panel twice, differing only in the highlighted target value and its colour.
  for target_value, colour in ((0, "green"), (1, "red")):
    plt.figure(figsize=size)
    sns.histplot(train_column, discrete=True)
    sns.histplot(train_column[application_y == target_value], color=colour, discrete=True)
    plt.title(f"{label}: train, TARGET == {target_value} highlighted")
    plt.show()

  plt.figure(figsize=size)
  sns.histplot(application_test[label], discrete=True)
  plt.title(f"{label}: test")
  # Show explicitly so the last figure is flushed like the first two.
  plt.show()


In [None]:
# CODE_GENDER histograms at the default figure size.
plot_hist(label="CODE_GENDER")

In [None]:
# FLAG_OWN_CAR histograms at the default figure size.
plot_hist(label="FLAG_OWN_CAR")

In [None]:
# FLAG_OWN_REALTY histograms at the default figure size.
plot_hist(label="FLAG_OWN_REALTY")

In [None]:
# NAME_FAMILY_STATUS histograms on a 16x6 figure.
plot_hist(label="NAME_FAMILY_STATUS", size=(16, 6))

In [None]:
# NAME_CONTRACT_TYPE histograms at the default figure size.
plot_hist(label="NAME_CONTRACT_TYPE")

In [None]:
# NAME_INCOME_TYPE histograms on a 16x10 figure.
plot_hist(label="NAME_INCOME_TYPE", size=(16, 10))

In [None]:
# CNT_CHILDREN histograms on a 10x10 figure.
plot_hist(label="CNT_CHILDREN", size=(10, 10))

In [None]:
# NAME_EDUCATION_TYPE histograms on a 10x10 figure.
plot_hist(label="NAME_EDUCATION_TYPE", size=(10, 10))

In [None]:
# NAME_HOUSING_TYPE histograms on a 10x10 figure.
plot_hist(label="NAME_HOUSING_TYPE", size=(10, 10))

In [None]:
# Pairwise relationship grid over the training frame.
# NOTE(review): this is called on the whole frame with no sampling; presumably slow
# on the full table — confirm and consider plotting a sample instead.
sns.pairplot(application_train)

In [None]:
# Pairwise relationship grid over the test frame.
# NOTE(review): this is called on the whole frame with no sampling; presumably slow
# on the full table — confirm and consider plotting a sample instead.
sns.pairplot(application_test)

In [None]:
# Column dtypes of the training frame, to see which columns are still non-numeric.
application_train.dtypes

In [None]:
# Number of distinct values in each object-typed column of the training frame.
application_train.select_dtypes(include=[object]).apply(pd.Series.nunique, axis = 0)

In [None]:
# Encode NAME_EDUCATION_TYPE as integer codes.
# The category list comes from the training frame and is reused for the test frame,
# so a given level maps to the same code in both. A test value that never occurs in
# training is encoded as -1.
education_dtype = pd.CategoricalDtype(
    application_train["NAME_EDUCATION_TYPE"].astype('category').cat.categories
)
application_train["NAME_EDUCATION_TYPE"] = application_train["NAME_EDUCATION_TYPE"].astype(education_dtype).cat.codes
application_test["NAME_EDUCATION_TYPE"] = application_test["NAME_EDUCATION_TYPE"].astype(education_dtype).cat.codes

# One-hot encode the remaining non-numeric columns of each frame.
application_train = pd.get_dummies(application_train)
application_test = pd.get_dummies(application_test)

# Give the training frame exactly the test frame's columns, in the same order.
# Dummy columns that exist only in training are dropped; a dummy column that only
# the test frame produced is added as all zeros instead of raising KeyError.
application_train = application_train.reindex(columns=application_test.columns, fill_value=0)

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler 
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# One seed for the split, the resampler and the model so a re-run reproduces the numbers.
RANDOM_STATE = 42

# Split BEFORE resampling. Oversampling duplicates minority rows; doing it first puts
# copies of the same row on both sides of the split and inflates the hold-out score.
# stratify keeps the class ratio of the hold-out set equal to that of the full data.
train_x_raw, test_x, train_y_raw, test_y = train_test_split(
    application_train,
    application_y,
    test_size=0.2,
    stratify=application_y,
    random_state=RANDOM_STATE,
)

# Balance the classes on the training split only; the hold-out set stays untouched.
oversampler = RandomOverSampler(random_state=RANDOM_STATE)
train_x, train_y = oversampler.fit_resample(train_x_raw, train_y_raw)

clf = LGBMClassifier(random_state=RANDOM_STATE)
clf.fit(train_x, train_y)

# Probability of class 1 for every row; the train figures are computed on the
# resampled rows the model was fitted on.
train_prediction = clf.predict_proba(train_x)[:, 1]
test_prediction = clf.predict_proba(test_x)[:, 1]

train_auc = roc_auc_score(train_y, train_prediction)
test_auc = roc_auc_score(test_y, test_prediction)

train_fpr, train_tpr, train_threshold = roc_curve(train_y, train_prediction)
test_fpr, test_tpr, test_threshold = roc_curve(test_y, test_prediction)

plt.figure(figsize=(8,6))
# Diagonal = ROC curve of a random classifier, drawn for reference.
plt.plot([0, 1], [0, 1],'r--')
plt.plot(test_fpr, test_tpr, marker='.', label=f'test {test_auc}')
plt.plot(train_fpr, train_tpr, marker='.', label=f'train {train_auc}')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# Each label is printed with its own score.
print(f'ROC AUC train {train_auc}')
print(f'ROC AUC test {test_auc}')

In [None]:
# Score the test frame and write one (SK_ID_CURR, TARGET) row per test id to
# ./submission.csv, where TARGET is the second predict_proba column.
positive_class_proba = clf.predict_proba(application_test)[:, 1]
submission = pd.DataFrame(
    {
        'SK_ID_CURR': test_sk_id_curr,
        'TARGET': positive_class_proba,
    }
)
submission.to_csv('./submission.csv', index=False)