In [2]:
import pandas as pd
df = pd.read_csv(r'dataset.csv',delimiter = ';')
print(df.shape)
print(df.dtypes.sort_values())

(99976, 43)
has_paid                                  bool
status_last_archived_0_24m               int64
num_arch_dc_12_24m                       int64
num_arch_ok_0_12m                        int64
num_arch_ok_12_24m                       int64
num_arch_rem_0_12m                       int64
num_unpaid_bills                         int64
age                                      int64
num_arch_dc_0_12m                        int64
status_2nd_last_archived_0_24m           int64
num_active_inv                           int64
status_max_archived_0_6_months           int64
status_max_archived_0_12_months          int64
status_max_archived_0_24_months          int64
recovery_debt                            int64
sum_capital_paid_account_0_12m           int64
sum_capital_paid_account_12_24m          int64
account_amount_added_12_24m              int64
sum_paid_inv_0_12m                       int64
status_3rd_last_archived_0_24m           int64
num_arch_written_off_12_24m            float64
n

In [6]:
## distribution of dependent variable
df['default'].value_counts()

0.0    88688
1.0     1288
Name: default, dtype: int64

In [15]:
from sklearn import preprocessing
columns = ['name_in_email', 'merchant_group', 'merchant_category']
for column in columns:
    le = preprocessing.LabelEncoder()
    le.fit(df[column])
    df.loc[:, column] = le.transform(df[column])

## Segrate train and test data

In [16]:
test_df = df.loc[df['default'].isnull()]
train_df = df.loc[~(df['default'].isnull())]
print(test_df.shape)
print(train_df.shape)

(10000, 43)
(89976, 43)


In [18]:
## drop uuid from train data
train_df = train_df.drop(columns = ['uuid'])
train_df.shape

(89976, 42)

## Missing value distribution and imputation
### Todo:
#### 1. Use different than mean for imputation
#### 2. Use automated imputation using modeling
#### 3. There is model specific imputation

In [19]:
## columns with total missing values
missing_count = train_df.isnull().sum().sort_values(ascending = False)
missing_df = pd.DataFrame(missing_count, columns = ['Frequency'])
missing_df['PrecentageMissing'] = (missing_df['Frequency'] / train_df.shape[0]) * 100
missing_df

Unnamed: 0,Frequency,PrecentageMissing
worst_status_active_inv,62540,69.507424
account_worst_status_12_24m,60055,66.745577
account_worst_status_6_12m,54313,60.363875
account_incoming_debt_vs_paid_0_24m,53357,59.301369
account_worst_status_3_6m,51938,57.724282
account_worst_status_0_3m,48934,54.385614
account_status,48934,54.385614
avg_payment_span_0_3m,44382,49.326487
avg_payment_span_0_12m,21468,23.859696
num_active_div_by_paid_inv_0_12m,20658,22.959456


In [20]:
## Drop columns with missing percentage more than 50
drop_columns = missing_df.loc[missing_df['PrecentageMissing'] >= 40].index
drop_columns

Index(['worst_status_active_inv', 'account_worst_status_12_24m',
       'account_worst_status_6_12m', 'account_incoming_debt_vs_paid_0_24m',
       'account_worst_status_3_6m', 'account_worst_status_0_3m',
       'account_status', 'avg_payment_span_0_3m'],
      dtype='object')

In [21]:
train_df.drop(columns=drop_columns, inplace = True)
train_df.shape

(89976, 34)

In [22]:
### Missing value imputation using mean
mean = train_df.mean()
train_df = train_df.fillna(mean)
test_df = test_df.fillna(mean)
y = train_df['default']
X = train_df.drop(columns = ['default'])

## Divide dataset into train and test 

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Baseline model 

In [24]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter = 500)
model.fit(X_train, y_train)
print("training accuracy: {}".format(model.score(X_train, y_train)))
print("Test accuracy: {}".format(model.score(X_test, y_test)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


training accuracy: 0.9852197136837915
Test accuracy: 0.9853164045397905


In [25]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print("training accuracy: {}".format(clf.score(X_train, y_train)))
print("Test accuracy: {}".format(clf.score(X_test, y_test)))

training accuracy: 1.0
Test accuracy: 0.9861246758495268


## Gridsearch using cross validation
### Find best parameter for random forest, logistic regression and support vector machine

In [None]:
# param_grid = [
#   {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
#   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
#  ]
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = [{'C': [1, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]

In [None]:
model = SVC(random_state=42)
clf = GridSearchCV(model, param_grid, cv = 3, verbose = 5)
clf.fit(X_train, y_train)

## Prediction on True Test Data

In [None]:
## Do the same preprocessing that was done on train data
## 1. Missing value imputation - Done
## 2. categorical to numerical conversion - Done

In [None]:
uuid = test_df['uuid']
test_df.drop(columns = ['uuid', 'default'] + drop_columns.tolist(), inplace = True)
p_default = model.predict(test_df)
pred_df = pd.DataFrame({'uuid': uuid, 'pd': p_default})
pred_df.to_csv(r'output.csv', sep = ';', index = False)

In [26]:
from imblearn.over_sampling import SMOTE

ImportError: cannot import name '_euclidean_distances' from 'sklearn.metrics.pairwise' (C:\Users\Admin\anaconda3\lib\site-packages\sklearn\metrics\pairwise.py)