In [1]:
import pandas as pd
# Load the raw dataset; fields in the file are separated by semicolons.
df = pd.read_csv('dataset.csv', sep=';')
print(df.shape)

(99976, 43)


In [2]:
## Class balance of the target column (rows with a missing target are not counted)
df["default"].value_counts()

0.0    88688
1.0     1288
Name: default, dtype: int64

## Segregate train and test data

In [3]:
# Rows without a 'default' label form the prediction set; labelled rows form the training set.
is_unlabelled = df['default'].isna()
test_df = df.loc[is_unlabelled]
train_df = df.loc[~is_unlabelled]
print(test_df.shape)
print(train_df.shape)

(10000, 43)
(89976, 43)


In [4]:
## Remove the uuid column from the training frame
train_df = train_df.drop(columns='uuid')
train_df.shape

(89976, 42)

## Missing value distribution and imputation
### Todo:
#### 1. Use a strategy other than the mean for imputation
#### 2. Use automated, model-based imputation
#### 3. Explore model-specific imputation

In [5]:
## Per-column count of missing values, largest first, plus the share of rows it represents
missing_count = train_df.isnull().sum().sort_values(ascending=False)
missing_df = missing_count.to_frame(name='Frequency')
n_rows = train_df.shape[0]
missing_df['PrecentageMissing'] = missing_df['Frequency'].div(n_rows).mul(100)
missing_df.head()

Unnamed: 0,Frequency,PrecentageMissing
worst_status_active_inv,62540,69.507424
account_worst_status_12_24m,60055,66.745577
account_worst_status_6_12m,54313,60.363875
account_incoming_debt_vs_paid_0_24m,53357,59.301369
account_worst_status_3_6m,51938,57.724282


In [None]:
## Select the columns whose missing percentage is 40 or more (the threshold used by the
## filter below); the resulting Index is what the next cell drops from train_df.
drop_columns = missing_df.loc[missing_df['PrecentageMissing'] >= 40].index
drop_columns

In [None]:
# Remove the sparsely populated columns. The result is bound to the name instead of
# mutating the frame in place, so the object that missing_df was computed from is not
# silently changed underneath earlier cells.
train_df = train_df.drop(columns=drop_columns)

# Target vector and feature matrix for modelling.
y = train_df['default']
X = train_df.drop(columns='default')
train_df.shape

## Divide dataset into train and test 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the target is heavily imbalanced - consider whether stratify=y is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Prediction on True Test Data

In [None]:
## Do the same preprocessing that was done on train data
## 1. Missing value imputation - Done
## 2. categorical to numerical conversion - Done

## Oversampling using SMOTE

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [None]:
# Columns routed through the numeric branch (median imputation, then standard scaling).
numeric_features = [
    'account_amount_added_12_24m',
    'account_days_in_dc_12_24m',
    'account_days_in_rem_12_24m',
    'account_days_in_term_12_24m',
    'age',
    'avg_payment_span_0_12m',
    'max_paid_inv_0_12m',
    'max_paid_inv_0_24m',
    'num_active_div_by_paid_inv_0_12m',
    'num_active_inv',
    'num_arch_dc_0_12m',
    'num_arch_dc_12_24m',
    'num_arch_ok_0_12m',
    'num_arch_ok_12_24m',
    'num_arch_rem_0_12m',
    'num_arch_written_off_0_12m',
    'num_arch_written_off_12_24m',
    'num_unpaid_bills',
    'status_last_archived_0_24m',
    'status_2nd_last_archived_0_24m',
    'status_3rd_last_archived_0_24m',
    'status_max_archived_0_6_months',
    'status_max_archived_0_12_months',
    'status_max_archived_0_24_months',
    'recovery_debt',
    'sum_capital_paid_account_0_12m',
    'sum_capital_paid_account_12_24m',
    'sum_paid_inv_0_12m',
    'time_hours',
]
# Columns routed through the categorical branch (one-hot encoding).
categorical_features = ['name_in_email', 'merchant_group', 'merchant_category']

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categories that were not present at fit time are ignored rather than raising an error.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Learn the preprocessing from the training split only, then apply it to both splits.
# Note: this rebinds X_train / X_test to the transformed matrices, so the original
# DataFrames are no longer reachable under these names after the cell has run.
preprocessor.fit(X_train)
X_train, X_test = preprocessor.transform(X_train), preprocessor.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the (already transformed) training split with SMOTE; the evaluation split is
# left untouched. The seed makes the generated samples repeatable.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the fitted forest - and therefore the scores printed below and the
# model that is persisted later - reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# X_train / y_train are the SMOTE-resampled training data at this point, while X_test keeps
# the original class balance. The target is heavily imbalanced (88688 vs 1288 labelled rows),
# so plain accuracy should be read with care.
print("training accuracy: {:.2f}".format(clf.score(X_train, y_train) * 100))
print("Test accuracy    : {:.2f}".format(clf.score(X_test, y_test) * 100))

In [None]:
## Score the unlabelled rows and write one probability of default per uuid.
uuid = test_df['uuid']

# Build the feature frame as a new object instead of dropping in place: test_df is a
# selection taken from df, and mutating it would make this cell fail on a second run
# ('uuid' would already be gone) and would remove columns that later cells still read.
test_features = test_df.drop(columns=['uuid', 'default'] + drop_columns.tolist())

X_trans = preprocessor.transform(test_features)
p_default = clf.predict_proba(X_trans)

# Column 1 of predict_proba corresponds to the second entry of clf.classes_
# (the 1.0 label, given the two target values 0.0 and 1.0).
pred_df = pd.DataFrame({'uuid': uuid, 'pd': p_default[:, 1]})
pred_df.to_csv(r'output.csv', sep=';', index=False)

In [None]:
import joblib

# Persist the fitted classifier and the fitted preprocessor next to the notebook.
joblib.dump(value=clf, filename='model')
joblib.dump(value=preprocessor, filename='preprocessor')

## Gridsearch using cross validation
### Find the best parameters for random forest, logistic regression and support vector machine

In [None]:
## Rebuild the preprocessing and the SMOTE + logistic-regression pipeline for this section.
##
## Earlier cells rebind X_train / X_test to transformed matrices and oversample
## X_train / y_train with SMOTE. Those matrices no longer carry column names, so a
## ColumnTransformer that selects columns by name cannot be fitted on them, and the
## already-oversampled y_train must not be oversampled a second time. The raw split is
## therefore regenerated from X / y with the same parameters as the original split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Columns routed through the numeric branch (median imputation, then standard scaling).
numeric_features = [
    'account_amount_added_12_24m',
    'account_days_in_dc_12_24m',
    'account_days_in_rem_12_24m',
    'account_days_in_term_12_24m',
    'age',
    'avg_payment_span_0_12m',
    'max_paid_inv_0_12m',
    'max_paid_inv_0_24m',
    'num_active_div_by_paid_inv_0_12m',
    'num_active_inv',
    'num_arch_dc_0_12m',
    'num_arch_dc_12_24m',
    'num_arch_ok_0_12m',
    'num_arch_ok_12_24m',
    'num_arch_rem_0_12m',
    'num_arch_written_off_0_12m',
    'num_arch_written_off_12_24m',
    'num_unpaid_bills',
    'status_last_archived_0_24m',
    'status_2nd_last_archived_0_24m',
    'status_3rd_last_archived_0_24m',
    'status_max_archived_0_6_months',
    'status_max_archived_0_12_months',
    'status_max_archived_0_24_months',
    'recovery_debt',
    'sum_capital_paid_account_0_12m',
    'sum_capital_paid_account_12_24m',
    'sum_paid_inv_0_12m',
    'time_hours',
]
# Columns routed through the categorical branch (one-hot encoding).
categorical_features = ['name_in_email', 'merchant_group', 'merchant_category']

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categories that were not present at fit time are ignored rather than raising an error.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit on the raw training frame only, then transform both splits with the fitted preprocessor.
preprocessor.fit(X_train_raw)
X_train = preprocessor.transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Oversampling lives inside the pipeline, so it is applied when the pipeline is fitted.
clf = Pipeline(
    steps=[("overSampling", SMOTE(random_state=42)),
           ("classifier", LogisticRegression())]
)


In [None]:
# Fit the pipeline built in the previous cell (SMOTE oversampling, then LogisticRegression)
# on the training data.
clf.fit(X_train, y_train)

In [10]:
# Scratch: snapshot the first row of the full dataset as a plain dict (column name -> value).
yy = df.iloc[0].to_dict()

In [12]:
# Scratch: inspect the Python type of a single field taken from the row dict.
type(yy['account_worst_status_12_24m'])

numpy.float64

In [13]:
import numpy as np

In [14]:
# Scratch: display NumPy's NaN constant.
np.nan

nan

In [16]:
# Scratch: snapshot the first unlabelled row as a plain dict (column name -> value).
yy = test_df.iloc[0].to_dict()

In [17]:
# Scratch: display the row dict built in the previous cell.
yy

{'uuid': '6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7',
 'default': nan,
 'account_amount_added_12_24m': 0,
 'account_days_in_dc_12_24m': 0.0,
 'account_days_in_rem_12_24m': 0.0,
 'account_days_in_term_12_24m': 0.0,
 'account_incoming_debt_vs_paid_0_24m': 0.0091346153846153,
 'account_status': 1.0,
 'account_worst_status_0_3m': 1.0,
 'account_worst_status_12_24m': nan,
 'account_worst_status_3_6m': 1.0,
 'account_worst_status_6_12m': 1.0,
 'age': 20,
 'avg_payment_span_0_12m': 6.4,
 'avg_payment_span_0_3m': 5.25,
 'merchant_category': 'Youthful Shoes & Clothing',
 'merchant_group': 'Clothing & Shoes',
 'has_paid': True,
 'max_paid_inv_0_12m': 7225.0,
 'max_paid_inv_0_24m': 7225.0,
 'name_in_email': 'F',
 'num_active_div_by_paid_inv_0_12m': 0.0,
 'num_active_inv': 0,
 'num_arch_dc_0_12m': 0,
 'num_arch_dc_12_24m': 0,
 'num_arch_ok_0_12m': 5,
 'num_arch_ok_12_24m': 0,
 'num_arch_rem_0_12m': 0,
 'num_arch_written_off_0_12m': 0.0,
 'num_arch_written_off_12_24m': 0.0,
 'num_unpaid_bills': 1,
 'sta