In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training table, using SK_ID_CURR as the row index.
data = pd.read_csv(
    "../input/home-credit-default-risk/application_train.csv",
    index_col='SK_ID_CURR',
)
# Feature-only copy (TARGET column removed) used for the column analysis cells.
df = data.drop(columns="TARGET")
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate features from the TARGET label, then hold out 25% of the rows
# for validation with a fixed seed so the split is repeatable.
X = data.drop(columns='TARGET')
y = data['TARGET']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

In [None]:
# Check missing value ratios
# Raise the display limits so the full table below is printed without truncation.
# Full option names are used: abbreviated keys depend on pandas' pattern matching
# and can stop resolving if a similarly named option is added later.
pd.set_option('display.max_rows', 122)
pd.set_option('display.max_columns', 122)

# Percentage of missing values per feature column, largest first.
missing_ratio = (df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False)
missing_df = missing_ratio.to_frame(name="missing_ratio")
# Keep each column's dtype next to its ratio for easier inspection.
missing_df["dtype"] = [df[col].dtype for col in missing_df.index]
print(missing_df)

In [None]:
# Integer columns are bucketed by how many distinct values they hold;
# float and object columns are grouped purely by dtype.
discrete_cols = list(df.select_dtypes(include='int').columns)
distinct_counts = {name: df[name].nunique() for name in discrete_cols}

# Bring high cardinality integer columns (30 or more distinct values)
# into the numerical group alongside the float columns.
numerical_cols = list(df.select_dtypes(include='float').columns)
numerical_cols += [name for name in discrete_cols if distinct_counts[name] >= 30]
low_card_discrete_cols = [name for name in discrete_cols if distinct_counts[name] < 30]
categorical_cols = list(df.select_dtypes(include='object').columns)

In [None]:
def split_cols(df, cols, cardinality_threshold=30):
    """Partition ``cols`` of ``df`` into three column groups by dtype and cardinality.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to classify.
    cols : list
        Column labels of ``df`` to consider.
    cardinality_threshold : int, default 30
        Integer columns with at least this many distinct values are treated
        as numerical; those with fewer are treated as low-cardinality discrete.

    Returns
    -------
    tuple of (list, list, list)
        ``(numerical_cols, low_card_discrete_cols, categorical_cols)`` where
        numerical columns are the float columns followed by the
        high-cardinality integer columns, and categorical columns are the
        object-dtype columns.
    """
    subset = df[cols]
    discrete_cols = subset.select_dtypes(include='int').columns.tolist()

    # Count distinct values once per integer column; both groups below need it.
    cardinality = {col: df[col].nunique() for col in discrete_cols}

    # Bring high cardinality columns to the numerical pipeline
    numerical_cols = subset.select_dtypes(include='float').columns.tolist() \
        + [col for col in discrete_cols if cardinality[col] >= cardinality_threshold]
    low_card_discrete_cols = [col for col in discrete_cols
                              if cardinality[col] < cardinality_threshold]
    categorical_cols = subset.select_dtypes(include='object').columns.tolist()
    return numerical_cols, low_card_discrete_cols, categorical_cols

# missing_df is sorted by missing_ratio, so columns that share a ratio are adjacent.
# Walk the columns once, chaining each column onto the previous one's group when
# their ratios match; every column therefore lands in exactly one group.
col_groups = []
prev_ratio = None
for col, ratio in missing_df["missing_ratio"].items():
    same_as_previous = (
        prev_ratio is not None
        # Columns with (effectively) no missing values always get their own group.
        and ratio > 0.0001
        # Ratios within this tolerance are treated as identical.
        and abs(prev_ratio - ratio) <= 0.00001
    )
    if same_as_previous:
        col_groups[-1].append(col)
    else:
        col_groups.append([col])
    prev_ratio = ratio

# Only groups of exactly three columns are compressed; the first column of
# each such group is kept as its representative.
compress_able_cols = [group for group in col_groups if len(group) == 3]
compress_cols = [group[0] for group in compress_able_cols]

In [None]:
# Every column that belongs to a compressible group, in group order.
flatten_list = sum(compress_able_cols, [])

# Keep the untouched columns in their original order, then append one
# representative column per compressed group.
untouched_cols = [name for name in df.columns if name not in flatten_list]
selected_cols = untouched_cols + compress_cols

num_cols, low_cols, cat_cols = split_cols(df, selected_cols)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, plot_roc_curve

# Column preprocessing: each column group is imputed, optionally transformed,
# and then reduced to its 10 best-scoring features.
numerical_pipeline = Pipeline([('numerical_imputer', SimpleImputer(strategy="median")),
                               ('numerical_scaler', StandardScaler()),
                               ('numerical_selector', SelectKBest(k=10))])

low_card_discrete_pipeline = Pipeline([('low_card_imputer', SimpleImputer(strategy="most_frequent")),
                                       ('low_card_selector', SelectKBest(chi2, k=10))])

# handle_unknown='ignore': a category that is absent from the training split
# (e.g. a rare value that only occurs in validation or test rows) is encoded as
# all zeros instead of raising an error at transform time.
categorical_pipeline = Pipeline([('categorical_imputer', SimpleImputer(strategy="most_frequent")),
                                 ('categorical_encoder', OneHotEncoder(handle_unknown='ignore')),
                                 ('categorical_selector', SelectKBest(chi2, k=10))])

preprocessor = make_column_transformer((numerical_pipeline, num_cols),
                                       (low_card_discrete_pipeline, low_cols),
                                       (categorical_pipeline, cat_cols))

# Model pipeline
# NOTE(review): `loss='log'` and `plot_roc_curve` are deprecated/removed in newer
# scikit-learn releases (successors: `loss='log_loss'` and
# `RocCurveDisplay.from_estimator`) — confirm the installed version before upgrading.
model = Pipeline([('preprocessor', preprocessor),
                  ('classifier', SGDClassifier(loss='log', random_state=0))])

model.fit(X_train, y_train)

# Score the validation split on the predicted probability of the positive class.
y_score = model.predict_proba(X_val)
score = roc_auc_score(y_val, y_score[:,1])
print("ROC score:", score)

# ROC curve of the fitted model with the chance diagonal drawn for reference.
plot_roc_curve(model, X_val, y_val, drop_intermediate=False, name="Baseline")
plt.plot([0, 1], [0, 1], linestyle='--', color='r')
plt.show()

In [None]:
# Load the test applications with the same index column as the training data.
df_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv", index_col='SK_ID_CURR')

# Write the predicted probability of the positive class, consistent with the
# ROC AUC validation above; hard 0/1 labels from predict() would throw away the
# ranking information that ROC AUC measures.
preds_test = model.predict_proba(df_test)[:, 1]
output_df = pd.DataFrame({"SK_ID_CURR": df_test.index,
                          "TARGET": preds_test})
output_df.to_csv("thanh_dive_submission.csv", index=False)