In [None]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from tqdm import tqdm
import gc; gc.enable()
# Single place that says where the competition CSV files live.
DATA_DIR = os.path.join('..', 'data')

# Read both splits through DATA_DIR so that relocating the data needs one edit.
train = pd.read_csv(os.path.join(DATA_DIR, "cs-training.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "cs-test.csv"))

# Stack the two splits so they can be cleaned together. Row labels are kept
# as read (not renumbered), so the same label can occur once in each split.
dataset = pd.concat([train, test])

# Only the combined frame is needed from here on; release the per-file frames.
del train, test
gc.collect()

In [1]:
# Only the import is kept in this cell: an interactive `LogisticRegression?`
# help lookup is IPython-only syntax and opens the pager on every Run All.
from sklearn.linear_model import LogisticRegression

# Transform dataset

In [None]:
def cleaned_dataset(dataset):
    """Clean the combined frame and return standardised train/test features.

    The input is left untouched (all work happens on a copy), so the function
    can be called repeatedly on the same frame.

    Steps, in order:
      1. Ages of 18 or below are replaced by the median age of the frame.
      2. ``MonthlyIncome`` is made non-negative; missing values are filled
         with the mean income of the row's age band (18-59, or 60 and over).
      3. ``NumberOfDependents`` is made non-negative, missing values become 0.
      4. ``CombinedDefaulted`` is the sum of the three past-due counters,
         capped at 1; ``CombinedCreditLoans`` is 1 when open credit lines plus
         real-estate loans exceed 5, else 0.
      5. The raw counters folded into those features (except the 30-59 day
         counter) and the ``"Unnamed: 0"`` column are dropped.
      6. Rows with a non-null ``SeriousDlqin2yrs`` form the train part, the
         rest the test part. Remaining gaps in int64/float64 columns are
         filled with the train means, and those columns are standardised
         with a scaler fitted on the train part only.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Combined frame holding the columns referenced above.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` has ``SeriousDlqin2yrs`` first, followed by one ``sc_<name>``
        column per numeric feature; ``test`` has only the ``sc_<name>``
        columns. Both keep the row labels of their source rows.

    Raises
    ------
    KeyError
        If one of the required columns is missing. ``"Unnamed: 0"`` is
        optional and simply skipped when absent.
    """
    # Private copy: the caller's frame must survive so the cell can be re-run.
    data = dataset.copy()

    # --- age -------------------------------------------------------------
    # Float first, so a fractional median never has to fit an integer column.
    data["age"] = data["age"].astype("float64")
    median_age = data["age"].median()
    data.loc[data["age"] <= 18, "age"] = median_age

    working_age = (data["age"] >= 18) & (data["age"] < 60)
    senior = data["age"] >= 60

    # --- monthly income --------------------------------------------------
    # Missing values are tracked with an explicit mask instead of a sentinel,
    # so a genuine income equal to any particular number is never overwritten.
    # Band means are taken from the sign-corrected values.
    income = data["MonthlyIncome"].abs().astype("float64")
    income_missing = income.isnull()
    data["MonthlyIncome"] = income
    data.loc[income_missing & working_age, "MonthlyIncome"] = income[working_age].mean()
    data.loc[income_missing & senior, "MonthlyIncome"] = income[senior].mean()

    # --- dependents ------------------------------------------------------
    data["NumberOfDependents"] = (
        data["NumberOfDependents"].abs().fillna(0).astype("int64")
    )

    # --- engineered flags ------------------------------------------------
    past_due_total = (
        data["NumberOfTimes90DaysLate"]
        + data["NumberOfTime60-89DaysPastDueNotWorse"]
        + data["NumberOfTime30-59DaysPastDueNotWorse"]
    )
    # Any value of 1 or more collapses to 1; smaller values are kept as they are.
    data["CombinedDefaulted"] = past_due_total.clip(upper=1)

    credit_total = (
        data["NumberOfOpenCreditLinesAndLoans"]
        + data["NumberRealEstateLoansOrLines"]
    )
    few_lines = credit_total <= 5
    many_lines = credit_total > 5
    data["CombinedCreditLoans"] = credit_total
    data.loc[few_lines, "CombinedCreditLoans"] = 0
    data.loc[many_lines, "CombinedCreditLoans"] = 1

    # The other listed columns were all read above, so errors="ignore" only
    # matters for "Unnamed: 0", which is absent when the CSV index was parsed.
    data = data.drop(
        ["Unnamed: 0", "NumberOfOpenCreditLinesAndLoans",
         "NumberOfTimes90DaysLate", "NumberRealEstateLoansOrLines",
         "NumberOfTime60-89DaysPastDueNotWorse"],
        axis=1, errors="ignore")

    # --- split on presence of the target ---------------------------------
    labelled = data["SeriousDlqin2yrs"].notnull()
    train = data[labelled]
    test = data[~labelled].drop("SeriousDlqin2yrs", axis=1)
    target = train["SeriousDlqin2yrs"]

    numeric = train.drop("SeriousDlqin2yrs", axis=1).select_dtypes(
        include=["int64", "float64"])
    # Train means fill the gaps of both parts.
    numeric_fill = numeric.mean()
    train_features = numeric.fillna(numeric_fill)
    test_features = test[numeric.columns].fillna(numeric_fill)

    # --- standardise (fit on train only) ---------------------------------
    scaler = StandardScaler()
    scaled_names = ["sc_{}".format(col) for col in numeric.columns]
    train_scaled = pd.DataFrame(
        scaler.fit_transform(train_features),
        columns=scaled_names,
        index=train.index,
    )
    test_scaled = pd.DataFrame(
        scaler.transform(test_features),
        columns=scaled_names,
        index=test.index,
    )

    # Positional insert (raw values) avoids any label alignment on the index.
    train_scaled.insert(0, "SeriousDlqin2yrs", target.values)
    return train_scaled, test_scaled

In [None]:
# Clean the combined frame; the result is the labelled (train) part and the
# unlabelled (test) part, each holding standardised `sc_*` feature columns.
train, test = cleaned_dataset(dataset)

In [None]:
# Write each transformed frame into DATA_DIR under its own file name.
for _filename, _frame in (('transformed-train.csv', train),
                          ('transformed-test.csv', test)):
    _frame.to_csv(os.path.join(DATA_DIR, _filename))