In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from numpy import mean
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [None]:
# Show at most 30 rows whenever pandas renders a frame or series.
pd.options.display.max_rows = 30

# Read the train and test application tables from the input directory.
d_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
d_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

# Last expression of the cell, so the training frame is rendered as its output.
d_train

In [None]:
# Print pandas' built-in structural summary of the training frame.
d_train.info()

In [None]:
# Number of missing entries in each column of the training frame.
d_train.isnull().sum()

In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a short summary (total number of columns in ``df`` and how many of
    them have at least one missing value) and returns the per-column details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``'Missing Values'`` (count)
        and ``'% of Total Values'`` (percentage of rows, rounded to one
        decimal), sorted by percentage in descending order.
    """
    # Scan the frame for nulls once; both statistics derive from the counts.
    mis_val = df.isnull().sum()

    # max(..., 1) keeps a zero-row frame from producing 0/0 = NaN percentages,
    # which would make every column look as if it had missing data.
    mis_val_percent = 100 * mis_val / max(len(df), 1)

    mis_val_table = pd.DataFrame({
        'Missing Values': mis_val,
        '% of Total Values': mis_val_percent,
    })

    # Filter on the exact count (before rounding) so that columns with a tiny
    # share of missing values are still listed.
    mis_val_table_ren_columns = (
        mis_val_table[mis_val_table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns

In [None]:
# Missing-value summary for the training frame (the returned table is the cell output).
missing_values_table(d_train)

In [None]:
# Missing-value summary for the test frame (the returned table is the cell output).
missing_values_table(d_test)

In [None]:
# How many training columns there are of each dtype.
d_train.dtypes.value_counts()

In [None]:
# Number of distinct non-null values in each object-dtype column.
d_train.select_dtypes(include=[object]).nunique()

In [None]:
# Lift the row cap so the whole correlation listing below is printed.
pd.set_option('display.max_rows', None)

# Correlation of every numeric column with TARGET, sorted ascending.
# The frame still contains object columns at this point, and since pandas 2.0
# DataFrame.corr() raises on them instead of silently skipping them, so the
# numeric (and boolean) columns are selected explicitly first.
numeric_cols = d_train.select_dtypes(include=['number', 'bool'])
correlation = numeric_cols.corr()['TARGET'].sort_values()
print(correlation)

In [None]:
# Predictor columns kept for modelling.
# NOTE(review): presumably chosen from the TARGET correlation listing printed
# earlier in the notebook; confirm the selection criterion.
features = [
    'EXT_SOURCE_3',
    'EXT_SOURCE_2',
    'EXT_SOURCE_1',
    'DAYS_EMPLOYED',
    'FLOORSMAX_AVG',
    'FLOORSMAX_MEDI',
    'FLOORSMAX_MODE',
    'AMT_GOODS_PRICE',
    'REGION_POPULATION_RELATIVE',
    'DAYS_LAST_PHONE_CHANGE',
    'REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY',
    'DAYS_BIRTH',
    'DAYS_ID_PUBLISH',
    'REG_CITY_NOT_WORK_CITY',
]

# The training frame keeps the label as its first column; the test frame gets
# the predictors only. `.copy()` makes both results independent frames, so the
# column assignments in later cells do not operate on a selection of the raw
# data (which triggers SettingWithCopyWarning).
d_train = d_train[['TARGET'] + features].copy()
d_test = d_test[features].copy()

In [None]:

# Cap displayed rows at 10 from here on.
pd.set_option('display.max_rows', 10)
# Render the training frame as the cell output.
d_train

In [None]:
# Missing-value summary of the training frame as it stands at this point in the notebook.
missing_values_table(d_train)

In [None]:
# Columns whose missing entries are replaced by a column mean.
columns = [
    'EXT_SOURCE_3',
    'EXT_SOURCE_2',
    'EXT_SOURCE_1',
    'FLOORSMAX_AVG',
    'FLOORSMAX_MEDI',
    'FLOORSMAX_MODE',
    'AMT_GOODS_PRICE',
    'DAYS_LAST_PHONE_CHANGE',
]

# The fill values are computed on the training data only and reused for the
# test data. Imputing the test set with its own means would encode the same
# "missing" state with a different number than the one the model is trained
# on, and would make the preprocessing depend on the test set.
train_means = d_train[columns].mean()

# fillna with a Series fills each column named in its index and leaves every
# other column untouched; it returns new frames rather than assigning
# column-by-column into the existing ones.
d_train = d_train.fillna(train_means)
d_test = d_test.fillna(train_means)

In [None]:
# Re-check the training frame. For this first call only the summary printed by
# the function is visible: its returned table is not the cell's last
# expression, so it is not rendered.
missing_values_table(d_train)
# Per-column missing counts, rendered as the cell output.
d_train.isnull().sum()

In [None]:
# How many test columns there are of each dtype.
d_test.dtypes.value_counts()

In [None]:
# Separate the label column from the predictors.
train_labels = d_train['TARGET']

# Use the `columns=` keyword: passing the axis positionally (`drop('TARGET', 1)`)
# was deprecated in pandas 1.3 and removed in pandas 2.0.
d_train = d_train.drop(columns='TARGET')

# Rendered as the cell output.
train_labels


In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Names of the predictor columns, in the order the arrays below use them.
features = list(d_train.columns)

# Median imputation for any remaining gaps, then scaling to the [0, 1] range.
imputer = SimpleImputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

# Both transformers are fitted on the training rows only; the test rows are
# transformed with the statistics learned there. The results are NumPy arrays.
train = scaler.fit_transform(imputer.fit_transform(d_train))
test = scaler.transform(imputer.transform(d_test))

In [None]:
from lightgbm import LGBMClassifier

# Fit a LightGBM classifier with the library's default settings on all
# training rows.
clf = LGBMClassifier()
clf.fit(train, train_labels)

# Column 1 of predict_proba holds the probability of the second entry of
# clf.classes_ (the positive class, assuming TARGET is coded 0/1).
predictions = clf.predict_proba(test)[:, 1]

# Reload the raw test file to recover SK_ID_CURR, which is no longer part of
# the feature-only test frame.
# NOTE: this rebinds d_test to the raw frame, so the preprocessing cells above
# must be re-run from the top before they are executed again.
d_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

# .assign builds a new frame; writing a column into the `d_test[[...]]`
# selection directly triggers SettingWithCopyWarning.
submit = d_test[['SK_ID_CURR']].assign(TARGET=predictions)
submit.to_csv('lightgbm_baseline.csv', index = False)

In [None]:
# Read the image file and draw it on an explicit Axes.
# `plt` is matplotlib.pyplot, imported in the notebook's first cell.
img = plt.imread('../input/img123/image.jpg')
fig, ax = plt.subplots()
ax.imshow(img)
plt.show()

In [None]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Hold out 20% of the training rows for validation. The split is seeded so the
# reported numbers are reproducible, and stratified so both parts keep the
# same TARGET class ratio.
train_x, test_x, train_y, test_y = train_test_split(
    train, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

# `clf` was fitted on every row of `train`, which includes the rows that are
# now in test_x; scoring it on them would report training performance under a
# "test" label. An unfitted copy with the same hyper-parameters is therefore
# trained on the training part only. `clf` itself is left untouched.
eval_clf = clone(clf)
eval_clf.fit(train_x, train_y)

# Accuracy is computed from hard class predictions.
train_preds = eval_clf.predict(train_x)
print(f"Train Accuracy: {accuracy_score(train_y, train_preds)}")
test_preds = eval_clf.predict(test_x)
print(f"Test Accuracy: {accuracy_score(test_y, test_preds)}")

# ROC/AUC need scores rather than labels: column 1 of predict_proba.
test_preds = eval_clf.predict_proba(test_x)[:, 1]
train_preds = eval_clf.predict_proba(train_x)[:, 1]

# The `lr_` prefix is kept for compatibility; these are the held-out results.
lr_auc = roc_auc_score(test_y, test_preds)
tr_auc = roc_auc_score(train_y, train_preds)
print('train:ROC AUC=%.3f' % (tr_auc))
print('test:ROC AUC=%.3f' % (lr_auc))

tr_fpr, tr_tpr, _ = roc_curve(train_y, train_preds)
lr_fpr, lr_tpr, _ = roc_curve(test_y, test_preds)

plt.plot(lr_fpr, lr_tpr, marker='.', label='test')
plt.plot(tr_fpr, tr_tpr, marker='.', label='train')

plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()