In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold

In [None]:
# Raise pandas' display limits so wide/long frames are shown without truncation
# (up to 500 columns and 500 rows).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

In [None]:
# Load application_train.csv from the Kaggle input directory into `app_train`
# and preview the first rows.
app_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
app_train.head()

In [None]:
# Load application_test.csv from the Kaggle input directory into `app_test`
# and preview the first rows.
app_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')
app_test.head()

In [None]:
# (rows, columns) of the test table as loaded.
app_test.shape

In [None]:
# Load bureau.csv into `bureau` and preview the first rows. Later cells group
# it by SK_ID_CURR and count SK_ID_BUREAU values per applicant.
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau.head()

In [None]:
# One row per SK_ID_CURR with the number of non-null SK_ID_BUREAU entries that
# applicant has in bureau.csv, exposed as the column `previous_loan_counts`.
previous_loan_counts = (
    bureau
    .groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU']
    .count()
    .rename(columns={'SK_ID_BUREAU': 'previous_loan_counts'})
)
previous_loan_counts.head()

In [None]:
# Attach the per-applicant bureau loan count to the training table.
# - Any `previous_loan_counts` column left over from an earlier run of this
#   cell is dropped first, so re-running the cell does not create
#   `_x`/`_y` suffixed duplicates.
# - validate='many_to_one' makes pandas raise if the count table ever holds
#   more than one row per SK_ID_CURR (which would silently duplicate rows).
app_train = pd.merge(
    app_train.drop(columns='previous_loan_counts', errors='ignore'),
    previous_loan_counts,
    on='SK_ID_CURR',
    how='left',
    validate='many_to_one',
)

# The left join leaves NaN for applicants with no bureau rows; record those as
# zero loans. The result is assigned back explicitly: calling
# `fillna(..., inplace=True)` on the selected column is chained assignment and
# does not update the frame under pandas Copy-on-Write.
app_train['previous_loan_counts'] = app_train['previous_loan_counts'].fillna(0)
app_train.head()

In [None]:
# Attach the per-applicant bureau loan count to the test table.
# - Any `previous_loan_counts` column left over from an earlier run of this
#   cell is dropped first, so re-running the cell does not create
#   `_x`/`_y` suffixed duplicates.
# - validate='many_to_one' makes pandas raise if the count table ever holds
#   more than one row per SK_ID_CURR (which would silently duplicate rows).
app_test = pd.merge(
    app_test.drop(columns='previous_loan_counts', errors='ignore'),
    previous_loan_counts,
    on='SK_ID_CURR',
    how='left',
    validate='many_to_one',
)

# The left join leaves NaN for applicants with no bureau rows; record those as
# zero loans. The result is assigned back explicitly: calling
# `fillna(..., inplace=True)` on the selected column is chained assignment and
# does not update the frame under pandas Copy-on-Write.
app_test['previous_loan_counts'] = app_test['previous_loan_counts'].fillna(0)
app_test.head()

### trainとtestを結合

In [None]:
# Stack train on top of test so preprocessing is applied to both at once.
# Train rows come first; later cells rely on that order to split the frame
# back apart by position.
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 row index.
# Without it the two source indexes are kept and every label below len(app_test)
# appears twice, which makes any label-based access ambiguous.
all_data = pd.concat([app_train, app_test], sort=False, ignore_index=True)

In [None]:
# Preview the last rows of the combined frame (these come from app_test, which
# was concatenated second).
all_data.tail()

In [None]:
# Missing-value report: for every column that has at least one NaN, show the
# NaN count and the percentage of rows affected, most-missing first.
null_counts = all_data.isnull().sum()
all_data_numna = null_counts[null_counts != 0].sort_values(ascending=False)

total_rows = all_data.shape[0]
na_df = pd.DataFrame({
    "Number of na": all_data_numna,
    "Ratio of na": all_data_numna / total_rows * 100,
})
na_df

In [None]:
# Columns that must never be standardized: the row identifier (scaling it
# destroys the key) and the label.
non_feature_cols = {"SK_ID_CURR", "TARGET"}

# Numeric feature columns to be standardized later.
# select_dtypes(include="number") picks integer/float columns by dtype kind.
# The previous `dtype != "object"` test would also have accepted any
# non-object, non-numeric dtype (e.g. pandas' dedicated string dtype), which
# StandardScaler cannot handle.
num_cols = [
    col
    for col in all_data.select_dtypes(include="number").columns
    if col not in non_feature_cols
]

# Sanity check: the label must not be among the scaled columns (prints False).
print("TARGET" in num_cols)

In [None]:
# Per-column count of missing values in the combined frame (every column is
# listed, including those with zero missing).
all_data.isnull().sum()

### 数値データの標準化と欠損値補完

In [None]:
# Standardize every column listed in `num_cols` in place. The scaler is fitted
# on the combined train+test frame.
# NOTE(review): fitting on train+test lets test-set statistics influence the
# features the model is trained on — confirm this is intended.
scaler = StandardScaler()
numeric_block = all_data[num_cols]
all_data[num_cols] = scaler.fit_transform(numeric_block)

In [None]:
# Preview the last rows after the numeric columns have been standardized.
all_data.tail()

In [None]:
# Fill the remaining NaNs in numeric feature columns with 0. When a column has
# been standardized beforehand, 0 corresponds to that column's mean.
# TARGET is left untouched so its missing values are not overwritten.
# Columns are chosen with select_dtypes(include="number") rather than
# `dtype != "object"`, so non-object, non-numeric dtypes (e.g. pandas'
# dedicated string dtype) are never filled with a number by mistake.
numeric_feature_cols = [
    col
    for col in all_data.select_dtypes(include="number").columns
    if col != "TARGET"
]
all_data = all_data.fillna(dict.fromkeys(numeric_feature_cols, 0))

In [None]:
# One-hot encode the non-numeric columns of the combined frame; numeric columns
# pass through unchanged. Encoding train and test together keeps the set of
# indicator columns identical for both.
# NOTE(review): with the default dummy_na=False, a missing category becomes a
# row of all-zero indicators rather than its own column — confirm that is the
# intended handling of categorical NaNs.
all_data = pd.get_dummies(all_data)

In [None]:
# Preview the last rows after one-hot encoding.
all_data.tail()

In [None]:
# Same preview as the previous cell (repeated call; no state changes between
# the two).
all_data.tail()

In [None]:
# Number of training rows; the next cell uses this length to split `all_data`
# back into train and test by position.
len(app_train)

In [None]:
# Split the preprocessed frame back into train and test by position: train rows
# were concatenated first, so the first `n_train_rows` rows are train and the
# rest are test. The count is captured once, before `app_train` is rebound.
n_train_rows = len(app_train)
app_train = all_data.iloc[:n_train_rows]
app_test = all_data.iloc[n_train_rows:]

In [None]:
# Build the modelling inputs: the label is taken from TARGET, and both feature
# matrices exclude the identifier and the label.
id_and_label = ["SK_ID_CURR", "TARGET"]

train_y = app_train["TARGET"]
train_X = app_train.drop(columns=id_and_label)
test_X = app_test.drop(columns=id_and_label)

In [None]:
# (rows, columns) of the test feature matrix.
test_X.shape

In [None]:
# Per-fold validation AUCs, and the running average of test-set probabilities
# (each fold's model contributes 1/n_splits of the final prediction).
auc_scores = []
test_preds = np.zeros(test_X.shape[0])

# Run 5-fold stratified cross-validation with shuffled, seeded splits.
kf = StratifiedKFold(n_splits= 5, shuffle = True, random_state=15)
for tr_idx, va_idx in kf.split(train_X, train_y):
    tr_x, va_x = train_X.iloc[tr_idx], train_X.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # L1-penalized logistic regression. The liblinear solver shuffles the data
    # internally, so it is seeded as well; otherwise the scores and the
    # submission change from run to run even though the splits are fixed.
    lgrg = LogisticRegression(penalty='l1', solver='liblinear', max_iter=200, random_state=15)
    lgrg.fit(tr_x, tr_y)

    # Column 1 of predict_proba is the probability of the second class in
    # lgrg.classes_ (presumably TARGET == 1 — confirm the label encoding).
    va_pred = lgrg.predict_proba(va_x)[:, 1]
    auc_score = roc_auc_score(va_y, va_pred)
    auc_scores.append(auc_score)

    # Accumulate this fold's share of the averaged test prediction.
    test_preds += lgrg.predict_proba(test_X)[:, 1]/kf.n_splits
    print("per_epoch")

# Individual fold AUCs followed by their mean.
print(auc_scores, np.mean(auc_scores))

In [None]:
# Display the fold-averaged predicted probabilities for the test rows.
test_preds

In [None]:
# Write the submission file: start from the sample submission, overwrite its
# TARGET column with the averaged test predictions, and save without the index.
# NOTE(review): predictions are assigned by position, so this assumes the rows
# of sample_submission.csv are in the same order as application_test.csv —
# confirm.
sample_submission_path = '../input/home-credit-default-risk/sample_submission.csv'

sub = pd.read_csv(sample_submission_path, encoding='UTF-8')
sub["TARGET"] = test_preds
sub.to_csv('submission.csv', index=False)