## Import Packages

In [None]:
import os
import sys 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

In [None]:
# Directory that holds the competition CSV files.
path = '../input/home-credit-default-risk/'

# Build file paths with os.path.join rather than string concatenation:
# `path` already ends with a separator, so `path + "/file.csv"` produced a
# doubled slash in the resulting path.
train = pd.read_csv(os.path.join(path, "application_train.csv"))
test = pd.read_csv(os.path.join(path, "application_test.csv"))

In [None]:
# Preview the first rows, transposed so each column name becomes a row.
train.head().transpose()

In [None]:
# Report the (rows, columns) shape of each loaded table.
for label, frame in (('Training set shape:', train), ('Test set shape:    ', test)):
    print(label, frame.shape)

In [None]:
# Fraction of training rows carrying each TARGET value (class balance).
train['TARGET'].value_counts().div(len(train))

## Preprocessing

In [None]:
# Candidate predictors: every column after the first two
# (presumably the row ID and TARGET columns — confirm against the CSV header).
candidates = train.iloc[:, 2:]

# Bucket the candidate column names by their storage dtype.
int_features = candidates.select_dtypes('int64').columns.values
float_features = candidates.select_dtypes('float').columns.values
str_features = candidates.select_dtypes('O').columns.values

# One count per line: integer, float, then object (string) columns.
print(len(int_features), len(float_features), len(str_features), sep='\n')

In [None]:
# Integer and float columns are both treated as numeric features; the
# object (string) columns are the categorical ones.
num_features = np.concatenate((int_features, float_features))
cat_features = str_features

# Full feature list: numeric names first, categorical names after.
features = np.concatenate((num_features, cat_features))


# Numeric columns: fill missing values with the column mean, then rescale
# with MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
]
num_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal category 'missing', then
# one-hot encode. With handle_unknown='ignore', categories not seen during
# fit are encoded as all zeros at transform time instead of raising.
cat_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        # The `sparse` keyword was deprecated in scikit-learn 1.2 and removed
        # in 1.4 (renamed to `sparse_output`). Sparse output is the default
        # under either name, so omitting the keyword keeps the same behaviour
        # on both old and new releases.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Send each column group through its own pipeline and stack the outputs
# side by side.
column_routes = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]

# sparse_threshold=0.9: per the scikit-learn docs, the stacked result is kept
# sparse when its overall density is below this value.
preprocessor = ColumnTransformer(transformers=column_routes, sparse_threshold=0.9)

In [None]:
%%time
# fit_transform learns the imputation / scaling / encoding parameters from
# the training frame and returns the transformed matrix in the same call,
# so the training data is not pushed through a separate transform pass
# after fitting.
X_train = preprocessor.fit_transform(train[features])

# The test frame is only transformed, so it reuses the statistics and
# categories learned from the training frame.
X_test = preprocessor.transform(test[features])

# Labels as a plain NumPy array for the estimators below.
y_train = train.TARGET.values

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape: ', X_test.shape)

## Logistic Regression

In [None]:
%%time
# Fit a logistic regression (liblinear solver, C=10) on the preprocessed
# training matrix; fit() returns the estimator itself.
lr_mod = LogisticRegression(solver='liblinear', C=10).fit(X_train, y_train)

# Both metrics below are computed on the training data itself, so they
# describe fit quality rather than performance on unseen rows.
train_scores = lr_mod.predict_proba(X_train)[:, 1]
print('Training Acc:', lr_mod.score(X_train, y_train))
print('Training AUC:', roc_auc_score(y_train, train_scores))

## Create Submission

In [None]:
# Load the sample submission from the same data directory as the other CSVs,
# reusing `path` instead of repeating the hard-coded directory.
submission = pd.read_csv(os.path.join(path, 'sample_submission.csv'))
submission.head()

In [None]:
# predict_proba returns one column per class; column 1 is the probability of
# the second class in lr_mod.classes_ (presumably TARGET == 1 — confirm).
# NOTE(review): this assumes the sample submission rows are in the same order
# as the test rows — verify.
test_proba = lr_mod.predict_proba(X_test)
submission['TARGET'] = test_proba[:, 1]
submission.head()

In [None]:
# Write the submission to the working directory, leaving out the DataFrame
# index so it does not become an extra column in the file.
submission.to_csv(path_or_buf='my_submission.csv', index=False)