In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.preprocessing import PowerTransformer, MultiLabelBinarizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_recall_curve, auc

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow import keras

# Folder holding the competition CSVs (and the external zipcode tables)
data_folder = '../input/DontGetKicked'
# Folder where the trained Keras models are written
model_folder = './model'

# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir()
os.makedirs(model_folder, exist_ok=True)

In [None]:
# Read the training auctions (indexed by RefId) and parse the purchase date
data = pd.read_csv(data_folder + '/training.csv', index_col='RefId')
data['PurchDate'] = pd.to_datetime(data['PurchDate'])

# Target vector
y_train = data['IsBadBuy'].values

# Accumulators: processed feature blocks and their column names
X_train = []
f_train = []

n_rows = len(y_train)
n_kicked = (y_train == 1).sum()
print(f'Number of rows: {n_rows}, kicked auction: {n_kicked}/{n_rows}')

## 0. External enrichment of zipcode level data

We add zipcode-level median household income and population density as external features that characterize each zipcode. This is useful when a zipcode in the test set never appears in the training data: these features let the model relate an unseen zipcode to similar zipcode areas it has seen.

In [None]:
# External zipcode tables: median household income and population / density
zip_income = pd.read_csv(data_folder + '/MedianZIP-2010.csv')
income_text = zip_income['MedianHouseholdIncome'].str.replace(',','')
zip_income['MedianHouseholdIncome'] = income_text.astype(float)
zip_pop = pd.read_csv(data_folder + '/pop_density_2010.csv')

# Left-join each table on the auction zipcode, then drop the duplicated key column
for zip_table in (zip_income, zip_pop):
    data = data.merge(zip_table, left_on='VNZIP1', right_on='Zip/ZCTA', how='left')
    data = data.drop(columns=['Zip/ZCTA'])

# 1. Columnwise Feature Engineering

## 1.1 Numerical Column

In [None]:
# Quantitative columns (vehicle, price and zipcode-level features)
quant_fs = [
    'VehicleAge', 'VehOdo',
    'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailCleanPrice', 'VehBCost', 'WarrantyCost',
    'MedianHouseholdIncome', 'Pop2010', 'Area', 'PopDensity',
]

# One histogram per column on a 4x4 grid, filled row by row
fig, ax = plt.subplots(4, 4, figsize=(16, 10))
for panel, col in zip(ax.flat, quant_fs):
    panel.hist(data[col], bins=20)
    panel.set_title(col)
fig.tight_layout()

### Yeo-Johnson scaler and feature correlation

In [None]:
# Yeo-Johnson power transform of the numerical columns.
# Missing values are intentionally left as NaN here; they are imputed later.
X = data[quant_fs]
yj_scaler = PowerTransformer()
X = yj_scaler.fit_transform(X)

# Zero-filled COPY used only for the correlation plot.
# Gotcha: the second positional parameter of np.nan_to_num is `copy`, so
# np.nan_to_num(X, 0) means copy=False and overwrites the NaNs of X in place,
# which would leave nothing for the imputer to impute. Pass the fill value
# by keyword instead.
X_corr = np.nan_to_num(X, nan=0.0)

fig, ax = plt.subplots(figsize = (8,8))
ax.imshow(np.corrcoef(X_corr.T))
ax.set_yticks(range(len(quant_fs)))
ax.set_yticklabels(quant_fs)
ax.set_xticks(range(len(quant_fs)))
ax.set_xticklabels(quant_fs,rotation = 90)
fig.tight_layout()

### Iterative imputation of missing values

In [None]:
# Fit the iterative imputer on the transformed numerical block and fill it
imp = IterativeImputer(random_state=0, max_iter=20)
X = imp.fit_transform(X)

n_numerical = X.shape[1]
print(f'Number of numerical features: {n_numerical}')

# Register the block and its column names
f_train.extend(quant_fs)
X_train.append(X)

## 1.2 Categorized Columns

### 1.2.1 Car Feature Extraction

In [None]:
# Accumulator for the binary-encoded blocks that are later stacked into X_sparse
X_cat = []

def extract_car_features(row, feature_list = None):
    """Turn the descriptive car columns of one row into a list of unique tokens.

    The 'Make', 'Trim', 'Model', 'SubModel' and 'Size' values are converted
    with str(), joined with single spaces, upper-cased, stripped of '&' and
    '/' characters and split on single spaces.

    Parameters
    ----------
    row : mapping
        Anything indexable by the five column names (e.g. a DataFrame row).
    feature_list : container of str, optional
        When given, only tokens contained in it are returned. When None,
        every non-empty token is returned.

    Returns
    -------
    list of str
        The distinct tokens, in no particular order.
    """
    columns = ('Make', 'Trim', 'Model', 'SubModel', 'Size')
    text = ' '.join(str(row[name]) for name in columns).upper()

    # drop the separator characters that would otherwise glue into tokens
    for unwanted in ('&', '/'):
        text = text.replace(unwanted, '')

    tokens = set(text.split(' '))
    if feature_list is not None:
        return [tok for tok in tokens if tok in feature_list]
    return [tok for tok in tokens if tok != '']
    

# Token list per row, built from the descriptive car columns
data['car_features'] = data.apply(extract_car_features, axis=1)

# Multi-hot encode the token lists; the fitted binarizer is reused on the test set
m_bin = MultiLabelBinarizer()
X_car = m_bin.fit_transform(data['car_features'])
car_features = m_bin.classes_

n_car_features = len(car_features)
print(f'Number of features assotiate to car: {n_car_features}')

# Queue the block for the autoencoder input
X_cat.append(X_car)

### 1.2.2 One-hot encode dense feature columns

In [None]:
# Low-cardinality categorical columns that are one-hot encoded directly
dense_cols = [
    'TopThreeAmericanName', 'Nationality', 'Auction', 'Transmission',
    'WheelType', 'PRIMEUNIT', 'AUCGUART', 'IsOnlineSale',
]

# One-hot encoding; the resulting column order is kept for the test set
df_dense = pd.get_dummies(data[dense_cols])
dense_cols_encoded = df_dense.columns

# Register the block and its column names
X_train.append(df_dense.values)
f_train.extend(list(dense_cols_encoded))

n_dense = len(dense_cols_encoded)
print(f'Number of Encoded Dense features: {n_dense}')

### 1.2.3 Embedding Sparse Column

In [None]:
# High-cardinality categorical columns, compressed later by the autoencoder
sparse_cols = ['Color','BYRNO','VNZIP1','VNST']

# Buyer number and zipcode are identifiers: cast to str so they are one-hot encoded
for id_col in ('BYRNO', 'VNZIP1'):
    data[id_col] = data[id_col].astype(str)

# One-hot encoding; the column list is kept to align the test set later
df_other = pd.get_dummies(data[sparse_cols])
sparse_cols_encoded = list(df_other.columns)
X_cat.append(df_other.values)

# Autoencoder input: car tokens + sparse one-hot columns side by side
X_sparse = np.hstack(X_cat)

n_sparse = X_sparse.shape[1]
print(f'Total Features for embedding: {n_sparse}')

#### Train encoder decoder

In [None]:
# encoder / decoder factory
def autoencoder(dims, n_feature):
    """Build a dense autoencoder together with the encoder sharing its layers.

    Parameters
    ----------
    dims : sequence of int
        Widths of the encoder layers; the last entry is the width of the
        bottleneck layer named 'encoded'. The decoder mirrors the other
        widths in reverse order.
    n_feature : int
        Width of the input vector and of the reconstructed output.

    Returns
    -------
    tuple
        (autoencoder, encoder): two Keras models built on the same input
        layer, the first ending at 'decoded', the second at 'encoded'.
    """
    dense = keras.layers.Dense
    inputs = keras.layers.Input(shape=(n_feature,), name='Input')

    # encoder stack: one ReLU layer for every width except the last
    hidden = inputs
    for idx, width in enumerate(dims[:-1]):
        hidden = dense(width, activation='relu', name=f'encoder_{idx}')(hidden)

    # bottleneck: no activation argument, L1 penalty on its output
    bottleneck = dense(dims[-1],
                       activity_regularizer=keras.regularizers.l1(1e-5),
                       name='encoded')(hidden)

    # decoder stack: the encoder widths mirrored
    hidden = bottleneck
    for idx, width in enumerate(dims[::-1][1:]):
        hidden = dense(width, activation='relu', name=f'decoder_{idx}')(hidden)

    # reconstruction of the input
    reconstruction = dense(n_feature, name='decoded')(hidden)

    full_model = keras.models.Model(inputs=inputs, outputs=reconstruction, name='autoencoder')
    encoder_model = keras.models.Model(inputs=inputs, outputs=bottleneck, name='encoder')
    return (full_model, encoder_model)

# Encoder layer widths; the last one is the size of the embedding
dims = [256,64,16]
encoder_decoder, encoder = autoencoder(dims,X_sparse.shape[1])

# Training specification.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated in TF2 and rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=2e-4)
encoder_decoder.compile(optimizer= optimizer, loss='mse')

# Render the architecture to encoder_decoder.png (left-to-right layout)
keras.utils.plot_model(encoder_decoder, rankdir='LR',show_shapes =True,to_file='encoder_decoder.png')

In [None]:
# Fit the autoencoder to reconstruct its own input; 10% is held out for validation
history = encoder_decoder.fit(
    X_sparse, X_sparse,
    epochs=50,
    batch_size=128,
    validation_split=0.1,
    verbose=1,
)

# Persist both the full autoencoder and the encoder
for net, sub_path in ((encoder_decoder, '/autoencoder'), (encoder, '/encoder')):
    net.save(model_folder + sub_path)

In [None]:
# Reconstruction loss per epoch for the training and validation splits
fig, ax = plt.subplots(figsize=(6, 4))
for key, label in (('loss', 'Training'), ('val_loss', 'Validation')):
    ax.plot(history.history[key], label=label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Decoded MSE')
ax.legend()

#### Transform

In [None]:
# Project the sparse one-hot block through the trained encoder
X_embd = encoder.predict(X_sparse, verbose=0)

# Register the embedding block and one name per embedding dimension
n_embd = X_embd.shape[1]
f_train += [f'Embbeded {i}' for i in range(n_embd)]
X_train.append(X_embd)

## 1.3 Prepare for final features

In [None]:
# Stack every feature block side by side into the final design matrix
X_train = np.hstack(X_train)

# Sanity check: one feature name per column
n_final = X_train.shape[1]
assert len(f_train) == n_final

# Cache the prepared matrix and target
np.savez(data_folder +'/train_data.npz', X_train=X_train, y_train=y_train)

print(f'Final Features: {n_final}')

# 2. Training

## 2.1 SMOTE + Random Forest + cross validation

In [None]:
# Candidate tree depths and one plot colour per depth
depths = list(range(3,16,3))
colors = ['tab:blue','tab:orange', 'tab:green','tab:red','tab:purple']

# Stratified k-fold validation
skf = StratifiedKFold(n_splits = 5)
fig,ax = plt.subplots(figsize = (6,6))

# Per-depth fold scores
f1_by_depth = {k: [] for k in depths}
auc_by_depth = {k: [] for k in depths}

for fold, (t_idx, v_idx) in enumerate(skf.split(X_train,y_train)):
    # Resample with SMOTE on the training part of the fold only.
    # The resampled data does not depend on max_depth, so it is computed
    # once per fold instead of once per (depth, fold) pair.
    sm = SMOTE(random_state=42,n_jobs = 8)
    X_res, y_res = sm.fit_resample(X_train[t_idx,:], y_train[t_idx])

    for i, k in enumerate(depths):
        # train with random forest; fixed seed so the comparison is reproducible
        clf = RandomForestClassifier(n_estimators = 300,max_depth=k,n_jobs = 8,
                                     random_state=42)
        clf.fit(X_res, y_res)

        # Evaluate by F1 and precision-recall curve AUC on the untouched fold
        y_pred = clf.predict_proba(X_train[v_idx,:])
        prc, rcl, _ = precision_recall_curve(y_train[v_idx],y_pred[:,1])

        # Only the first fold carries the legend entry for its depth
        label = f'Max Depth = {k}' if fold == 0 else '_nolegend_'
        ax.plot(rcl,prc,color = colors[i], alpha = 0.3,label = label)

        # best achievable F1 along the curve (epsilon guards against 0/0)
        f1_by_depth[k].append((2*prc*rcl/(prc + rcl+1e-6)).max())
        auc_by_depth[k].append(auc(rcl, prc))

# Fold averages, in the same order as `depths`
f1_cv = [np.mean(f1_by_depth[k]) for k in depths]
auc_cv = [np.mean(auc_by_depth[k]) for k in depths]

for k, mean_auc, mean_f1 in zip(depths, auc_cv, f1_cv):
    print(f'max_depth = {k}, CV PR-AUC: {mean_auc:.4f}, Max F1 score: {mean_f1:.4f}')

# The line labels are only visible once the legend is drawn
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()

## 2.2 Train with Optimal model

Choose the model with max_depth = 12 and pick the threshold with largest F1 score.

In [None]:
# Resample the full training set with SMOTE
sm = SMOTE(random_state=42,n_jobs = 8)
X_res, y_res = sm.fit_resample(X_train, y_train)

# train the final random forest (fixed seed for reproducibility)
clf = RandomForestClassifier(n_estimators =300, max_depth=12, n_jobs = 8,
                             random_state=42)
clf.fit(X_res, y_res)

# Precision-recall curve of the final model on the original (non-resampled) rows.
# precision/recall must come from the SAME call as the thresholds; reusing
# arrays left over from the cross-validation loop would index a different curve.
# NOTE(review): the threshold is tuned on rows the forest was fitted on, so it
# is optimistic - consider tuning it on out-of-fold predictions instead.
y_pred = clf.predict_proba(X_train)
prc, rcl, thres = precision_recall_curve(y_train,y_pred[:,1])

# precision/recall have one more entry than thresholds: the final point
# (precision=1, recall=0) has no threshold, so it is excluded from the search
f1_curve = 2*prc*rcl/(prc + rcl+1e-6)
opt_idx = f1_curve[:-1].argmax()
threshold = thres[opt_idx]

print(f'Optimal prediction threshold: {threshold:.4f}')

# 3. Test data Prediction

In [None]:
# Load the test auctions (RefId stays a regular column; it is needed for the submission)
data = pd.read_csv(data_folder +'/test.csv')
data['PurchDate'] = pd.to_datetime(data['PurchDate'])

# join external data of zipcode income and population
data = data.merge(zip_income, left_on = 'VNZIP1',right_on = 'Zip/ZCTA',how = 'left').drop(columns = ['Zip/ZCTA'])
data = data.merge(zip_pop, left_on = 'VNZIP1',right_on = 'Zip/ZCTA',how = 'left').drop(columns = ['Zip/ZCTA'])

X_test, X_cat = [],[]

# numerical data: reuse the transformer and imputer fitted on the training set
X = data[quant_fs]
X = yj_scaler.transform(X)
X = imp.transform(X)

X_test.append(X)

# car feature encoding: keep only tokens seen during training.
# A set gives constant-time membership tests instead of scanning the
# classes_ array once per token.
known_car_features = set(m_bin.classes_)
data['car_features'] = data.apply(lambda x: extract_car_features(x, feature_list = known_car_features), axis = 1)
X_cat.append(m_bin.transform(data['car_features']))

# One-hot encoding dense features, aligned to the training columns
# (categories unseen in test become all-zero columns, new ones are dropped)
df_dense = pd.get_dummies(data[dense_cols]).reindex(columns = dense_cols_encoded, fill_value = 0)

# append features
X_test.append(df_dense.values)

# other sparse features
sparse_cols = ['Color','BYRNO','VNZIP1','VNST']
data['BYRNO'] = data['BYRNO'].astype(str)
data['VNZIP1'] = data['VNZIP1'].astype(str)

# One-hot encoding for the embedding, aligned to the training columns
df_other = pd.get_dummies(data[sparse_cols]).reindex(columns = sparse_cols_encoded, fill_value =0)
X_cat.append(df_other.values)

X_sparse = np.hstack(X_cat)

# embedding layer
X_embd = encoder.predict(X_sparse, verbose=0)
X_test.append(X_embd)
# Prepare final features (same block order as the training matrix)
X_test = np.hstack(X_test)
# Prediction
y_proba = clf.predict_proba(X_test)
# Use the F1-optimal threshold tuned on the training data rather than a fixed 0.5
y_pred = y_proba[:,1] >= threshold
print(f'Number of positive prediction: {y_pred.sum()} ({100*y_pred.mean():.2f}%)')

In [None]:
# Submission file: one positive-class probability per RefId
y_pred = y_proba[:, 1]
result = pd.DataFrame({'RefId': data['RefId'], 'IsBadBuy': y_pred})
result.to_csv('prediction.csv', index=False)

## Generate Prediction Result

Kaggle score: 0.239, rank around 110