## K-S tree logistic regression

In [None]:
# import packages

import pandas as pd
import numpy as np

# used in 1.3 and 4.1
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# used in 4.2
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# used in 5
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

### 1. data preprocessing

#### 1.1 data overview

Read training and testing data.

In [3]:
train = pd.read_csv('../../data/splited/train.csv')
test = pd.read_csv('../../data/splited/test.csv')
train_label = train.iloc[:, 1]
train_features = train.iloc[:, 2 :]
test_label = test.iloc[:, 1]
test_features = test.iloc[:, 2 :]

Features with `cat` as postfix in names are categorical features. Features with `bin` as postfix in names are binary features. Features without postfix in names are continuous features.

Features are divided into 4 groups: `ind`, `reg`, `car`, `calc`.

In [4]:
train_features.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='obj

#### 1.2 missing values

Values of `-1` mean missing values. Codes below deal with missing values.

In [5]:
# replace "-1" with "np.nan"
train_features = train_features.replace(-1, np.nan)
test_features = test_features.replace(-1, np.nan)

In [6]:
# ratio of missing values in train_features
train_features_missing_ratio = train_features.isnull().mean()
train_features_missing_ratio = train_features_missing_ratio[train_features_missing_ratio > 0] # only consider features with non-zero missing ratio
train_features_top_missing_ratio = train_features_missing_ratio.sort_values(ascending=False)
print("missing ratio in train_features:\n")
print(train_features_top_missing_ratio)

missing ratio in train_features:

ps_car_03_cat    0.690852
ps_car_05_cat    0.447769
ps_reg_03        0.181335
ps_car_14        0.071324
ps_car_07_cat    0.019066
ps_ind_05_cat    0.009773
ps_car_09_cat    0.000931
ps_ind_02_cat    0.000343
ps_car_01_cat    0.000178
ps_ind_04_cat    0.000137
ps_car_11        0.000007
ps_car_02_cat    0.000005
ps_car_12        0.000002
dtype: float64


In [7]:
# ratio of missing values in test_features
test_features_missing_ratio = test_features.isnull().mean()
test_features_missing_ratio = test_features_missing_ratio[test_features_missing_ratio > 0] # only consider features with non-zero missing ratio
test_features_top_missing_ratio = test_features_missing_ratio.sort_values(ascending=False)
print("missing ratio in train_features:\n")
print(test_features_top_missing_ratio)

missing ratio in train_features:

ps_car_03_cat    0.691007
ps_car_05_cat    0.447957
ps_reg_03        0.180434
ps_car_14        0.072260
ps_car_07_cat    0.019853
ps_ind_05_cat    0.009728
ps_car_09_cat    0.001014
ps_ind_02_cat    0.000409
ps_car_01_cat    0.000185
ps_ind_04_cat    0.000146
ps_car_02_cat    0.000017
ps_car_11        0.000011
dtype: float64


Features `ps_car_03_cat`, `ps_car_05_cat` and `ps_reg_03` have too high missing ratio in both training and testing set, directedly delete these two features.

In [8]:
columns_to_drop = ['ps_car_03_cat', 'ps_car_05_cat', 'ps_reg_03']
train_features = train_features.drop(columns=columns_to_drop, errors='ignore') # ignore error if columns to delete don't exist
test_features = test_features.drop(columns=columns_to_drop, errors='ignore')

Other features have missing ratio lower than 0.08. Fill continuous features with medieans of these features. Fill categorical and binary features with modes of these features. 

In [9]:
# names of features with low missing radio
columns_to_fill_with_mode = ['ps_car_07_cat', 'ps_ind_05_cat', 'ps_car_09_cat',
                             'ps_ind_02_cat', 'ps_car_01_cat', 'ps_ind_04_cat',
                             'ps_car_02_cat']
columns_to_fill_with_median = ['ps_car_14', 'ps_car_11', 'ps_car_12']

# training set
train_features[columns_to_fill_with_mode] = train_features[columns_to_fill_with_mode].fillna(
    train_features[columns_to_fill_with_mode].mode().iloc[0]
)
train_features[columns_to_fill_with_median] = train_features[columns_to_fill_with_median].fillna(
    train_features[columns_to_fill_with_median].median()
)


# testing set
test_features[columns_to_fill_with_mode] = test_features[columns_to_fill_with_mode].fillna(
    test_features[columns_to_fill_with_mode].mode().iloc[0]
)
test_features[columns_to_fill_with_median] = test_features[columns_to_fill_with_median].fillna(
    test_features[columns_to_fill_with_median].median()
)

#### 1.3 standardization and encoding

Standardize continuous features.

In [10]:
# names of continuous features
continuous_features = ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15',
                       'ps_reg_01', 'ps_reg_02', 'ps_car_11', 'ps_car_12',
                       'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
                       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05',
                       'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
                       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14']

In [11]:
scaler = StandardScaler()
train_continuous = scaler.fit_transform(train_features[continuous_features])
test_continuous = scaler.transform(test_features[continuous_features])

Encode categorical features using one-hot encoding.

In [12]:
# names of categorical features
categorical_features = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat','ps_car_01_cat',
                        'ps_car_02_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_07_cat',
                        'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']

In [13]:
encoder = OneHotEncoder(sparse=False)
train_categorical = encoder.fit_transform(train_features[categorical_features])
test_categorical = encoder.transform(test_features[categorical_features])

Binary features remain fixed.

In [14]:
# names of binary features
binary_features = ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
                   'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
                   'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin',
                   'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']

In [15]:
train_binary = train_features[binary_features].values
test_binary = test_features[binary_features].values

Integrate three kinds of features into final traning and testing set.

In [16]:
train_features_processed = np.hstack((train_continuous, train_categorical, train_binary))
test_features_processed = np.hstack((test_continuous, test_categorical, test_binary))

Variables `train_features_processed` and `test_features_processed` are numpy arrays.

In [17]:
type(train_features_processed), train_features_processed.shape

(numpy.ndarray, (416648, 213))

In [18]:
type(test_features_processed), test_features_processed.shape

(numpy.ndarray, (178564, 213))

### 2. EDA

Numpy arrays `train_features_processed` and `test_features_processed`, obtained in section 1.3 in this notebook, can be used here.

According setion 3 of the "property refinance prediction" paper, EDA should focus on the calculation of the complexity (overlapping area) of unsplitted data.

### 3. K-S static

#### 3.1 K-S static calculation

Calculating K-S static has no requirement of standardization and encoding. So dataframes `train_features`, obtained in section 1.2 in this notebook, can be used here.

Calculating K-S static requires label information. So dataframes `train_label`, obtained in section 1.1 in this notebook, can be used here.

In [31]:
ks_results = []

for feature_name in train_features.columns:
    # construct dataframe
    temp_data = {
        'feature': list(train_features[feature_name].values),
        'target': list(train_label.values)
    }
    temp_df = pd.DataFrame(temp_data)
    temp_df = temp_df.sort_values(by='feature')
    # calculate ks_statistic of the given feature
    if 'cat' not in str(feature_name) and 'bin' not in str(feature_name):
        temp_df['cum_positive'] = (temp_df['target'] == 1).cumsum() / (temp_df['target'] == 1).sum() # cdf of label 1
        temp_df['cum_negative'] = (temp_df['target'] == 0).cumsum() / (temp_df['target'] == 0).sum() # cdf of label 0
        temp_df['diff'] = np.abs(temp_df['cum_positive'] - temp_df['cum_negative'])
        ks_statistic = temp_df['diff'].max()
        max_diff_feature_value = temp_df.loc[temp_df['diff'].idxmax(), 'feature'] # value of the continuous feature at which maximum cdf difference is achieved
    else:
        category_binary_stats = temp_df.groupby('feature')['target'].value_counts(normalize=False).unstack(fill_value=0)
        label_counts = temp_df['target'].value_counts()
        category_binary_stats[0] = category_binary_stats[0] / label_counts[0]
        category_binary_stats[1] = category_binary_stats[1] / label_counts[1]
        category_binary_stats['abs_diff'] = (category_binary_stats[0] - category_binary_stats[1]).abs()
        ks_statistic = category_binary_stats['abs_diff'].max()
        max_diff_feature_value = category_binary_stats['abs_diff'].idxmax() # value of the categorical/binary feature at which maximum proportion difference is achieved

    ks_results.append({'feature_name': feature_name, 'ks_statistic': ks_statistic, 'max_diff_feature_value': max_diff_feature_value})

ks_df = pd.DataFrame(ks_results)
ks_df_sorted = ks_df.sort_values(by='ks_statistic', ascending=False)
print(ks_df_sorted)

      feature_name  ks_statistic  max_diff_feature_value
31       ps_car_13      0.114100                0.832079
5    ps_ind_06_bin      0.088906                0.000000
19       ps_reg_02      0.087595                0.400000
6    ps_ind_07_bin      0.079796                0.000000
30       ps_car_12      0.077120                0.374166
22   ps_car_04_cat      0.073953                0.000000
33       ps_car_15      0.071902                3.316625
15   ps_ind_16_bin      0.070316                0.000000
20   ps_car_01_cat      0.067117                7.000000
16   ps_ind_17_bin      0.065833                1.000000
21   ps_car_02_cat      0.063247                0.000000
18       ps_reg_01      0.060861                0.600000
14       ps_ind_15      0.058136                7.000000
0        ps_ind_01      0.052497                2.000000
4    ps_ind_05_cat      0.050838                0.000000
2        ps_ind_03      0.045410                5.000000
25   ps_car_08_cat      0.03948

#### 3.2 feature selection and data segementation via K-S statistc

Calculated K-S statistic is stored in a dataframe named `ks_df_sorted`. Select features with K-S statistic higher than 0.08.

In [32]:
selected_features = ks_df_sorted[ks_df_sorted['ks_statistic'] > 0.08]
selected_features

Unnamed: 0,feature_name,ks_statistic,max_diff_feature_value
31,ps_car_13,0.1141,0.832079
5,ps_ind_06_bin,0.088906,0.0
19,ps_reg_02,0.087595,0.4


Three features are selected: `ps_car_13`, `ps_ind_06_bin`, `ps_reg_02`. The whole training data is first segemented into `ps_car_13 <= 0.832079` and `ps_car_13 > 0.832079`, resulting in 2 segemented training data. The 2 segemented training data are then segemented into `ps_ind_06_bin == 1` and `ps_ind_06_bin == 0`, resulting in 4 segemented training data. The 4 segemented training data are again segemented into `ps_reg_02 <= 0.4` and `ps_reg_02 > 0.4`, resulting in 8 segemented training data.

In [61]:
train_features_1 = train_features[(train_features['ps_car_13'] <= 0.832079) & (train_features['ps_ind_06_bin'] == 1) & (train_features['ps_reg_02'] <= 0.4)]
train_label_1 = train_label[train_features_1.index]

train_features_2 = train_features[(train_features['ps_car_13'] <= 0.832079) & (train_features['ps_ind_06_bin'] == 1) & (train_features['ps_reg_02'] > 0.4)]
train_label_2 = train_label[train_features_2.index]

train_features_3 = train_features[(train_features['ps_car_13'] <= 0.832079) & (train_features['ps_ind_06_bin'] == 0) & (train_features['ps_reg_02'] <= 0.4)]
train_label_3 = train_label[train_features_3.index]

train_features_4 = train_features[(train_features['ps_car_13'] <= 0.832079) & (train_features['ps_ind_06_bin'] == 0) & (train_features['ps_reg_02'] > 0.4)]
train_label_4 = train_label[train_features_4.index]

train_features_5 = train_features[(train_features['ps_car_13'] > 0.832079) & (train_features['ps_ind_06_bin'] == 1) & (train_features['ps_reg_02'] <= 0.4)]
train_label_5 = train_label[train_features_5.index]

train_features_6 = train_features[(train_features['ps_car_13'] > 0.832079) & (train_features['ps_ind_06_bin'] == 1) & (train_features['ps_reg_02'] > 0.4)]
train_label_6 = train_label[train_features_6.index]

train_features_7 = train_features[(train_features['ps_car_13'] > 0.832079) & (train_features['ps_ind_06_bin'] == 0) & (train_features['ps_reg_02'] <= 0.4)]
train_label_7 = train_label[train_features_7.index]

train_features_8 = train_features[(train_features['ps_car_13'] > 0.832079) & (train_features['ps_ind_06_bin'] == 0) & (train_features['ps_reg_02'] > 0.4)]
train_label_8 = train_label[train_features_8.index]

### 4. data segements processing, resampling and training

#### 4.1 data segments processing

This is similar to section 1.3 in this notebook. In 8 data segements `train_features_1` ~ `train_features_8`, continuous features shoud be standardized and categorial features should be one-hot encoded.

In [68]:
train_features_segements = [train_features_1, train_features_2, train_features_3, train_features_4,
                            train_features_5, train_features_6, train_features_7, train_features_8]
train_features_segements_processed = []

# names of continuous features
continuous_features = ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15',
                       'ps_reg_01', 'ps_reg_02', 'ps_car_11', 'ps_car_12',
                       'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
                       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05',
                       'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
                       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14']
# names of categorical features
categorical_features = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat','ps_car_01_cat',
                        'ps_car_02_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_07_cat',
                        'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']
encoder = OneHotEncoder(sparse=False)
encoder.fit(train_features[categorical_features]) # fit one-hot encoder on the entire training set to avoid different number of features in data_segements_processed
# names of binary features
binary_features = ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
                   'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
                   'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin',
                   'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']

for train_features_seg in train_features_segements:
    # standardize continuous features
    scaler = StandardScaler()
    train_features_seg_continuous = scaler.fit_transform(train_features_seg[continuous_features])
    
    # encode categorical features
    train_features_seg_categorical = encoder.transform(train_features_seg[categorical_features])

    # binary features remain fixed
    train_features_seg_binary = train_features_seg[binary_features].values

    # integrate processed data segement into a numpy array
    train_features_seg_processed = np.hstack((train_features_seg_continuous, train_features_seg_categorical, train_features_seg_binary))
    train_features_segements_processed.append(train_features_seg_processed)

Now we have processed training features segements: `train_features_segements_processed[0]` ~ `train_features_segements_processed[7]`, which are all numpy arrays. We also have corresponding training lables segements: `train_label_1` ~ `train_label_8`, which are all dataframes.

#### 4.2 resampling and training

Resampling means over-sampling minority labels and down-sampling marjority labels.

The following codes use `SMOTE`, which is not mentioned in the paper. The goal to use it here is to get model performance on testing set first.

In [106]:
train_label_segements = [train_label_1, train_label_2, train_label_3, train_label_4,
                         train_label_5, train_label_6, train_label_7, train_label_8]

# train a logistic regression model on every data segement
pipelines = []
for i in range(len(train_label_segements)):
    pipeline = Pipeline([
        #('smote', SMOTE(random_state=42)),
        #('model', LogisticRegression(penalty='none', max_iter=50, class_weight='balanced'))
        ('model', RandomForestClassifier(max_depth=2, random_state=0))
    ])
    pipeline.fit(train_features_segements_processed[i], train_label_segements[i])
    pipelines.append(pipeline)

### 5. model performance on testing set

In section 1.3 in this notebook, the test features have been standardized and one-hot encoded.

In [107]:
type(test_features_processed), test_features_processed.shape

(numpy.ndarray, (178564, 213))

Indices of different segements can be found via the original `test_features` in section 1.2.

In [110]:
indices_1 = test_features[(test_features['ps_car_13'] <= 0.832079) & (test_features['ps_ind_06_bin'] == 1) & (test_features['ps_reg_02'] <= 0.4)].index
indices_2 = test_features[(test_features['ps_car_13'] <= 0.832079) & (test_features['ps_ind_06_bin'] == 1) & (test_features['ps_reg_02'] > 0.4)].index
indices_3 = test_features[(test_features['ps_car_13'] <= 0.832079) & (test_features['ps_ind_06_bin'] == 0) & (test_features['ps_reg_02'] <= 0.4)].index
indices_4 = test_features[(test_features['ps_car_13'] <= 0.832079) & (test_features['ps_ind_06_bin'] == 0) & (test_features['ps_reg_02'] > 0.4)].index
indices_5 = test_features[(test_features['ps_car_13'] > 0.832079) & (test_features['ps_ind_06_bin'] == 1) & (test_features['ps_reg_02'] <= 0.4)].index
indices_6 = test_features[(test_features['ps_car_13'] > 0.832079) & (test_features['ps_ind_06_bin'] == 1) & (test_features['ps_reg_02'] > 0.4)].index
indices_7 = test_features[(test_features['ps_car_13'] > 0.832079) & (test_features['ps_ind_06_bin'] == 0) & (test_features['ps_reg_02'] <= 0.4)].index
indices_8 = test_features[(test_features['ps_car_13'] > 0.832079) & (test_features['ps_ind_06_bin'] == 0) & (test_features['ps_reg_02'] > 0.4)].index
indices_set = [indices_1, indices_2, indices_3, indices_4,
               indices_5, indices_6, indices_7, indices_8]

test_label_prediction = []
test_label_reordered = []
for i in range(8):
    test_label_prediction.append(pipelines[i].predict_proba(test_features_processed[indices_set[i]])[:, 1]) # logistic regression & random forest
    test_label_reordered.append(test_label[indices_set[i]])
test_label_prediction = np.concatenate(test_label_prediction, axis=0)
test_label_reordered = np.concatenate(test_label_reordered, axis=0)

In [111]:
auc_score = roc_auc_score(test_label_reordered, test_label_prediction)
print(f"AUC: {auc_score}")

AUC: 0.6004632633856525
