# Imports

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import os

import matplotlib.pyplot as plt
import seaborn as sns

# Loading Datasets

In [None]:
# Read the Kaggle application tables (train first, then test) from the local
# datasets folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/kaggle/application_train.csv',
        'datasets/kaggle/application_test.csv',
    )
)

# Preprocessing

In [None]:
# Label-encode every object-typed training column that has at most two distinct
# values. A single LabelEncoder is reused: it is re-fitted on the training
# column each time and then applied to the same column of both frames.
le = LabelEncoder()
le_count = 0  # number of columns that were label-encoded

for col in train_df:
    column = train_df[col]
    if column.dtype != 'object':
        continue
    # unique() counts a missing value as its own entry, so a two-category
    # column that also contains NaN is skipped here.
    if len(column.unique()) > 2:
        continue
    le.fit(column)
    train_df[col] = le.transform(column)
    test_df[col] = le.transform(test_df[col])
    le_count += 1

# One-hot encode the categorical columns that were not label-encoded above.
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)


In [None]:
# Set the target aside so it can be re-attached after alignment: an inner join
# on the column axis keeps only columns that exist in both frames.
labels = train_df['TARGET']

aligned_frames = train_df.align(test_df, axis = 1, join = 'inner')
train_df, test_df = aligned_frames[0], aligned_frames[1]

# Put the target back on the (now column-aligned) training frame.
train_df['TARGET'] = labels

In [None]:
# Flag rows whose DAYS_EMPLOYED equals 365243 (roughly 1000 years, so not a
# plausible employment length) before the value is blanked out.
train_df['DAYS_EMPLOYED_ANOM'] = train_df["DAYS_EMPLOYED"] == 365243

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form operates on an intermediate Series and is
# not guaranteed to update train_df (it does not under pandas Copy-on-Write).
train_df['DAYS_EMPLOYED'] = train_df['DAYS_EMPLOYED'].replace({365243: np.nan})

# Distribution of the remaining (non-anomalous) values.
train_df['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram')
plt.xlabel('Days Employment')

In [None]:
# Apply the same anomaly handling to the test frame: flag the 365243 value,
# then replace it with NaN.
test_df['DAYS_EMPLOYED_ANOM'] = test_df["DAYS_EMPLOYED"] == 365243

# Assign the result back rather than using replace(..., inplace=True) on the
# selected column, which acts on an intermediate Series and is not guaranteed
# to update test_df (it does not under pandas Copy-on-Write).
test_df['DAYS_EMPLOYED'] = test_df['DAYS_EMPLOYED'].replace({365243: np.nan})

In [None]:
# Correlation of every column with TARGET, ordered from lowest to highest.
correlations = train_df.corr().loc[:, 'TARGET'].sort_values(ascending = True)

In [None]:
# First 20 entries of the ascending-sorted series, i.e. the lowest correlations.
print(correlations.iloc[:20])

In [None]:
# Last 20 entries of the ascending-sorted series: the highest correlations
# (any NaN entries sort to the end and would show up here as well).
print(correlations.iloc[-20:])

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Training feature frame: everything except the target, when it is present.
if 'TARGET' in train_df:
    train = train_df.drop(columns = ['TARGET'])
else:
    train = train_df.copy()

features = list(train.columns)

# Select the test columns by name, in training order, so both matrices line up
# column-for-column. This replaces an unused full copy of test_df; a column
# missing from test_df now fails here with a KeyError.
test = test_df[features]

# Fill missing values with the per-column median, then rescale to [0, 1].
imputer = SimpleImputer(strategy = 'median')
scaler = MinMaxScaler(feature_range = (0, 1))

# Both transformers are fitted on the training data only and then applied
# unchanged to the test data.
train = imputer.fit_transform(train)
test = imputer.transform(test)

train = scaler.fit_transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier. C is the inverse of the regularisation strength, so a
# value this small regularises heavily.
log_reg = LogisticRegression(C = 0.0001)
log_reg.fit(X = train, y = labels)

In [None]:
# predict_proba returns one column per class; keep the column at index 1
# (the second entry of log_reg.classes_).
class_probabilities = log_reg.predict_proba(test)
log_reg_pred = class_probabilities[:, 1]

In [None]:
# Build the submission frame from the ID column. Take an explicit copy so that
# adding a column below writes to an independent frame instead of a selection
# of test_df (which triggers pandas' SettingWithCopyWarning).
submit = test_df[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_pred

submit.head()

In [None]:
# Write the submission to the working directory without the index column.
submit.to_csv(path_or_buf = 'log_reg_baseline.csv', index = False)