# **IMPORTING MODULES**

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Silence pandas' SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Remove the display limits so frames and Series are shown in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

data_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")
data_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. ignore_index=True gives the combined frame unique row labels
# instead of repeating the labels of each input frame.
data = pd.concat([data_train, data_test], ignore_index=True)



# **РОЗМІР ДАНИХ**

In [None]:
# Report the (rows, columns) shape of each split.
print(f"Розмір даних train {data_train.shape}")
print(f"Розмір даних test {data_test.shape}")

# Preview the combined frame; the trailing expression is rendered as the cell output.
print('Перші 5 рядків даних:')
data.head()

# **ПЕРЕВІРКА НА ПРОПУЩЕНІ ЗНАЧЕННЯ**

In [None]:
# Count missing values per column (largest first), then express them as a
# percentage of the rows in the combined frame.
missing_counts = data.isnull().sum().sort_values(ascending=False)
print(missing_counts / len(data) * 100)

# **КОРЕЛЯЦІЯ З ТАРГЕТОМ**

In [None]:
# Correlation of every numeric column with TARGET, sorted ascending.
# Non-numeric columns are dropped explicitly: since pandas 2.0 DataFrame.corr()
# no longer skips them silently and raises on string columns.
numeric_train = data_train.select_dtypes(include=["number", "bool"])
correlations = numeric_train.corr()['TARGET'].sort_values()
print(correlations)

# **ЗМЕНШУЄМО КІЛЬКІСТЬ ПОТРІБНИХ ДАНИХ**

In [None]:

# Columns kept for the analysis; TARGET is the label and exists only in the
# training file, so it is removed from the list before the test frame is subset.
features = [
    'TARGET',
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_GOODS_PRICE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'CNT_FAM_MEMBERS',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3'
]

# Later cells assign new column values into these subsets, so take explicit
# copies: the subsets are then independent frames and the assignments do not
# rely on SettingWithCopyWarning being suppressed.
reduced_train_data = data_train[features].copy()
features.remove("TARGET")
reduced_test_data = data_test[features].copy()

# **ПЕРЕВІРКА НА ПРОПУЩЕНІ ЗНАЧЕННЯ**

In [None]:
# Percentage of missing values per column of the training subset, largest first.
# The share is taken over this frame's own rows (isnull().mean()); dividing by
# len(data), the combined train+test row count, understated every percentage.
print(reduced_train_data.isnull().mean().sort_values(ascending=False) * 100)

In [None]:
# Percentage of missing values per column of the test subset, largest first.
# The share is taken over this frame's own rows (isnull().mean()); dividing by
# len(data), the combined train+test row count, understated every percentage.
print(reduced_test_data.isnull().mean().sort_values(ascending=False) * 100)

# **ЗАПОВНЕННЯ ПРОПУСКІВ СЕРЕДНІМИ ЗНАЧЕННЯМИ**

In [None]:
# Fill missing values with column means computed on the TRAINING subset only,
# and apply those same means to both splits. Previously the test subset was
# filled with its own means and only for two of the four columns, so the two
# splits were preprocessed differently.
impute_columns = ['EXT_SOURCE_3', 'EXT_SOURCE_2', 'AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS']
train_means = reduced_train_data[impute_columns].mean()
reduced_train_data[impute_columns] = reduced_train_data[impute_columns].fillna(train_means)
reduced_test_data[impute_columns] = reduced_test_data[impute_columns].fillna(train_means)


# **ДИВИМОСЬ ТИПИ ДАНИХ І ШУКАЄМО КАТЕГОРІАЛЬНІ ДАНІ**

In [None]:
# Render the per-column dtypes of the combined frame into one message, then print it.
dtype_report = f"Типи даних {data.dtypes}"
print(dtype_report)


In [None]:
# Number of distinct values in each object-typed column of the combined frame.
object_columns = data.select_dtypes(include=[object])
object_columns.apply(pd.Series.nunique, axis=0)


# **ПЕРЕВІРКА НА ДУПЛІКАТИ**

In [None]:
# Flag rows that repeat an earlier row in full, then show only those rows.
duplicate_mask = data.duplicated()
data[duplicate_mask]

# **ОПИС ДАНИХ І МЕДІАНА**

In [None]:
# Summary statistics of the test subset, rendered as the cell output.
reduced_test_data.describe()


In [None]:
# Summary statistics of the training subset, rendered as the cell output.
reduced_train_data.describe()

In [None]:
# Median of every int64/float64 column of the training subset.
numeric_train_subset = reduced_train_data.select_dtypes(include=["int64", "float64"])
numeric_train_subset.median()

In [None]:
# Median of every int64/float64 column of the test subset.
numeric_test_subset = reduced_test_data.select_dtypes(include=["int64", "float64"])
numeric_test_subset.median()

# **ПЕРЕТВОРЮЄМО ДАТУ НАРОДЖЕННЯ НА ДОДАТНІ ЗНАЧЕННЯ**

In [None]:
# Replace DAYS_BIRTH in both subsets with the absolute value of the raw column,
# always reading from the untouched source frames.
reduced_train_data['DAYS_BIRTH'] = data_train['DAYS_BIRTH'].abs()
reduced_test_data['DAYS_BIRTH'] = data_test['DAYS_BIRTH'].abs()


# **РОЗПОДІЛИ ДЕЯКИХ КРИТЕРІЇВ**

In [None]:
# Histogram of the TARGET column of the training subset.
sns.histplot(reduced_train_data['TARGET'])

In [None]:
# Histogram of CODE_GENDER over the combined (train + test) frame.
sns.histplot(data['CODE_GENDER'])

In [None]:
# Histogram of NAME_EDUCATION_TYPE in the training subset.
sns.histplot(reduced_train_data['NAME_EDUCATION_TYPE'])

In [None]:
# One histogram per column of the training subset, drawn on a shared grid.
train_hist_options = {
    "figsize": (16, 20),
    "color": 'g',
    "bins": 30,
    "xlabelsize": 8,
    "ylabelsize": 8,
}
reduced_train_data.hist(**train_hist_options)

In [None]:
# Scale DAYS_BIRTH by 1/365 (days -> approximate years) before plotting its histogram.
age_in_years = reduced_test_data["DAYS_BIRTH"] / 365
sns.histplot(age_in_years, kde=False)

In [None]:
# One histogram per column of the test subset, drawn on a shared grid.
test_hist_options = {
    "figsize": (16, 20),
    "color": 'g',
    "bins": 30,
    "xlabelsize": 8,
    "ylabelsize": 8,
}
reduced_test_data.hist(**test_hist_options)

# **ПЕРЕТВОРЕННЯ ДЕЯКИХ КРИТЕРІЇВ В КАТЕГОРІЇ**

In [None]:
categorial_features = ["NAME_EDUCATION_TYPE"]

# Derive the label -> integer code mapping from the training subset only and
# apply that same mapping to both subsets. Encoding each subset independently
# assigns different codes to the same label whenever the two subsets do not
# contain exactly the same set of labels. Labels absent from the training
# categories (and missing values) are encoded as -1.
education_categories = pd.Categorical(reduced_train_data["NAME_EDUCATION_TYPE"]).categories
reduced_train_data["NAME_EDUCATION_TYPE"] = pd.Categorical(
    reduced_train_data["NAME_EDUCATION_TYPE"], categories=education_categories
).codes
reduced_test_data["NAME_EDUCATION_TYPE"] = pd.Categorical(
    reduced_test_data["NAME_EDUCATION_TYPE"], categories=education_categories
).codes



In [None]:
# One-hot encode the remaining non-numeric columns of both subsets; each subset
# is encoded on its own, so the resulting column sets may differ.
reduced_train_data, reduced_test_data = (
    pd.get_dummies(frame) for frame in (reduced_train_data, reduced_test_data)
)
reduced_train_data.head()

In [None]:
# Show the dtype of every column of the training frame.
reduced_train_data.dtypes

# **PAIRPLOT**

In [None]:
# Pairwise scatter/histogram grid over the int64/float64 columns of the training frame.
train_pairplot_columns = reduced_train_data.select_dtypes(include=["int64", "float64"])
sns.pairplot(train_pairplot_columns)

In [None]:
# Pairwise scatter/histogram grid over the int64/float64 columns of the test frame.
test_pairplot_columns = reduced_test_data.select_dtypes(include=["int64", "float64"])
sns.pairplot(test_pairplot_columns)

In [None]:
# Keep only the training columns that also exist in the test frame, in the test
# frame's column order (this also leaves TARGET out of the feature matrix).
shared_columns = reduced_test_data.columns
final_train_data = reduced_train_data.loc[:, shared_columns]

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd 
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Silence pandas' SettingWithCopyWarning.
pd.options.mode.chained_assignment = None  # default='warn'

# Reload the raw files so this cell does not depend on the state left by earlier cells.
data_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")
data_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")

# Columns used by the model; TARGET is the label and exists only in the
# training file, so it is removed before the test frame is subset.
features = [
    'TARGET',
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_GOODS_PRICE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'CNT_FAM_MEMBERS',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3'
]

# Explicit copies: the subsets are modified below and must be independent frames.
reduced_train_data = data_train[features].copy()
features.remove("TARGET")
reduced_test_data = data_test[features].copy()

# Impute with means computed on the training subset only and apply the same
# values to both subsets, so train and test are preprocessed identically.
# (Previously the test subset used its own means and only two of the columns.)
impute_columns = ['EXT_SOURCE_3', 'EXT_SOURCE_2', 'AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS']
train_means = reduced_train_data[impute_columns].mean()
reduced_train_data[impute_columns] = reduced_train_data[impute_columns].fillna(train_means)
reduced_test_data[impute_columns] = reduced_test_data[impute_columns].fillna(train_means)

# Store DAYS_BIRTH as absolute values.
reduced_train_data['DAYS_BIRTH'] = data_train['DAYS_BIRTH'].abs()
reduced_test_data['DAYS_BIRTH'] = data_test['DAYS_BIRTH'].abs()

categorial_features = ["NAME_EDUCATION_TYPE"]

# Integer-encode NAME_EDUCATION_TYPE with ONE mapping taken from the training
# subset; independent encodings could give the same label different codes in
# train and test. Labels unseen in train (and missing values) become -1.
education_categories = pd.Categorical(reduced_train_data["NAME_EDUCATION_TYPE"]).categories
reduced_train_data["NAME_EDUCATION_TYPE"] = pd.Categorical(
    reduced_train_data["NAME_EDUCATION_TYPE"], categories=education_categories
).codes
reduced_test_data["NAME_EDUCATION_TYPE"] = pd.Categorical(
    reduced_test_data["NAME_EDUCATION_TYPE"], categories=education_categories
).codes

# One-hot encode the remaining non-numeric columns.
reduced_train_data = pd.get_dummies(reduced_train_data)
reduced_test_data = pd.get_dummies(reduced_test_data)

# Feature matrix: the training columns that also exist in the test frame, in the
# test frame's order (TARGET is therefore excluded).
final_train_data = reduced_train_data[reduced_test_data.columns]

# Fixed seed so the hold-out split and the fitted model are reproducible on re-run.
RANDOM_STATE = 42

# Hold out 20% of the rows for validation; stratify keeps the TARGET class
# ratio the same in both parts.
train_x, test_x, train_y, test_y = train_test_split(
    final_train_data,
    reduced_train_data["TARGET"],
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=reduced_train_data["TARGET"],
)

clf = LGBMClassifier(random_state=RANDOM_STATE)
clf.fit(train_x, train_y)

# Accuracy of the predicted labels on the fitted part and on the held-out part.
train_preds = clf.predict(train_x)
print(f"Train Accuracy: {accuracy_score(train_y, train_preds)}")
test_preds = clf.predict(test_x)
print(f"Test Accuracy: {accuracy_score(test_y, test_preds)}")


# Score the prepared test frame and keep the second predict_proba column as the
# submitted TARGET value, keyed by the applicant id from the raw test file.
probabilities = clf.predict_proba(reduced_test_data)
submission = pd.DataFrame({
    'SK_ID_CURR': data_test['SK_ID_CURR'],
    'TARGET': probabilities[:, 1],
})
submission.to_csv('./sample_submission.csv', index=False)
# Use the second predict_proba column as the score for each part of the split.
test_preds = clf.predict_proba(test_x)[:, 1]
train_preds = clf.predict_proba(train_x)[:, 1]

# Area under the ROC curve for both parts.
tr_auc = roc_auc_score(train_y, train_preds)
lr_auc = roc_auc_score(test_y, test_preds)
print('train:ROC AUC=%.3f' % (tr_auc))
print('test:ROC AUC=%.3f' % (lr_auc))

# ROC curve points (thresholds are not needed for the plot).
lr_fpr, lr_tpr, _ = roc_curve(test_y, test_preds)
tr_fpr, tr_tpr, _ = roc_curve(train_y, train_preds)

# Draw both curves on the same axes: held-out part first, fitted part second.
plt.plot(lr_fpr, lr_tpr, marker='.', label='test')
plt.plot(tr_fpr, tr_tpr, marker='.', label='train')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()