In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# SELECT FEATURES & DATA CLEANING

In [None]:
import pandas as pd

In [None]:
# Load runs.csv into a DataFrame and preview its first rows.
runs = pd.read_csv("../input/hkracing/runs.csv")
runs.head()

In [None]:
# Load races.csv into a DataFrame and preview its first rows.
races = pd.read_csv('../input/hkracing/races.csv')
races.head()

## Select features for modeling

In [None]:
# Columns kept from the runs table: 'won' is the target, 'race_id' is the key
# shared with the races table, and the rest are candidate features.
run_columns = [
    'race_id', 'won', 'horse_age', 'horse_country', 'horse_type',
    'horse_rating', 'horse_gear', 'declared_weight', 'actual_weight',
    'draw', 'win_odds', 'place_odds', 'horse_id',
]
runs_data = runs[run_columns]
runs_data.head()

In [None]:
# Columns kept from the races table; 'race_id' is the key shared with runs.
race_columns = [
    'race_id', 'venue', 'config', 'surface',
    'distance', 'going', 'race_class', 'date',
]
races_data = races[race_columns]
races_data.head()

In [None]:
# Merge the two datasets on race_id. The key is named explicitly so the join
# cannot silently change if the two selections ever share another column
# (pd.merge otherwise joins on every common column).
df = pd.merge(runs_data, races_data, on='race_id', how='inner')
df.head()

## Check missing values

In [None]:
# For each column, report whether it contains at least one missing value.
df.isnull().any()

In [None]:
# Count missing (True) vs. present (False) values in horse_country, smallest count first.
df.horse_country.isnull().value_counts(ascending=True)

In [None]:
# Count missing (True) vs. present (False) values in horse_type, smallest count first.
df.horse_type.isnull().value_counts(ascending=True)

In [None]:
# Count missing (True) vs. present (False) values in place_odds, smallest count first.
df.place_odds.isnull().value_counts(ascending=True)

The number of rows with missing values is relatively small, so we decided to drop these rows. 

In [None]:
# (rows, columns) before dropping rows with missing values.
df.shape

In [None]:
# Drop every row that has a missing value in any column, then re-check the shape.
df = df.dropna()
df.shape

## Basic information of the data

In [None]:
# Parse the date column into datetime values so min/max and strftime work below.
df['date'] = pd.to_datetime(df['date'])
df['date'].dtype

In [None]:
# Earliest and latest race dates in the cleaned data (8-year duration).
df['date'].min(), df['date'].max()

In [None]:
# Summarize the time span and the number of distinct horses and races.
date_fmt = '%d %B %Y'
start_time = df['date'].min().strftime(date_fmt)
end_time = df['date'].max().strftime(date_fmt)
no_of_horses = df['horse_id'].nunique()
no_of_races = df['race_id'].nunique()

print(f'The dataset was collected from {start_time} to {end_time}, which contains information about {no_of_horses} horses and {no_of_races} races. ')

In [None]:
# Drop horse_id and date: they were used for the dataset summary above and
# are not kept as model features.
df = df.drop(columns=['horse_id', 'date'])
df.head()

In [None]:
# Columns remaining after the drop.
df.columns

## Impute feature

In [None]:
# Frequency of each horse_gear code, most common first.
df.horse_gear.value_counts(ascending=False)

In [None]:
# Number of distinct horse_gear codes.
df.horse_gear.nunique()

For the horse_gear column, we decided to encode the data as 1 and 0 (with gear and no gear), rather than use one-hot encoding (which would lead to numerous features). 

In [None]:
def horse_gear_impute(cols):
    """Binarize a single horse_gear value.

    Returns 0 when the value is the '--' placeholder (no gear) and 1 for
    any other value.
    """
    return 0 if cols == '--' else 1

In [None]:
# Replace each gear code with its 0/1 flag (element-wise).
df['horse_gear'] = df['horse_gear'].map(horse_gear_impute)

In [None]:
# Check the resulting distribution of the 0/1 gear flag.
df.horse_gear.value_counts()

## One-hot encoding for categorical features

In [None]:
# One-hot encode the remaining non-numeric columns. drop_first=True omits the
# first level of each encoded column so the dummies are not redundant.
df = pd.get_dummies(df, drop_first=True)
df.head()

In [None]:
# Column names after one-hot encoding.
df.columns

## Features explanation:
won - whether horse won (1) or otherwise (0)<br/>
horse_age - current age of this horse at the time of the race<br/>
horse_rating - rating number assigned by HKJC to this horse at the time of the race<br/>
horse_gear - string representing the gear carried by the horse in the race (encoded above as 1 = any gear, 0 = no gear). An explanation of the codes used may be found on the HKJC website.<br/>
declared_weight - declared weight of the horse and jockey, in lbs<br/>
actual_weight - actual weight carried by the horse, in lbs<br/>
draw - post position number of the horse in this race<br/>
win_odds - win odds for this horse at start of race<br/>
place_odds - place (finishing in 1st, 2nd or 3rd position) odds for this horse at start of race<br/>
surface - a number representing the type of race track surface: 1 = dirt, 0 = turf<br/>
distance - distance of the race, in metres<br/>
race_class - a number representing the class of the race<br/>
horse_country - country of origin of this horse<br/>
horse_type - sex of the horse, e.g. 'Gelding', 'Mare', 'Horse', 'Rig', 'Colt', 'Filly'<br/>
venue - a 2-character string, representing which of the 2 race courses this race took place at: ST = Shatin, HV = Happy Valley<br/>
config - race track configuration, mostly related to the position of the inside rail. For more details, see the HKJC website.<br/>
going - track condition. For more details, see the HKJC website.<br/>

# MODELING

In [None]:
from time import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from sklearn.metrics import precision_score, classification_report, confusion_matrix

## Extract the last race data for model testing

In [None]:
# Highest race_id in the data, i.e. the race held out for the deployment test.
last_raceid = int(df['race_id'].max())
last_raceid

In [None]:
# Keep the rows of the last race aside for the deployment section; the models
# are trained and tested on the remaining rows only.
last_race = df[df.race_id == last_raceid]
last_race

In [None]:
# Exclude the held-out last race by its id rather than by a hard-coded row
# count, so the split stays correct if the cleaned data changes size or order.
new_data = df[df.race_id != last_raceid]
new_data = new_data.drop(columns='race_id')   # race_id is an identifier, not a feature
new_data.tail()

In [None]:
# (rows, columns) of the modeling data.
new_data.shape

## Distribution of labels

In [None]:
# Bar chart of how many rows fall in each class of the target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=new_data, x='won', ax=ax)
ax.set_title('Number of Labels by Class')

In [None]:
# Features: every column except the target.
X = new_data.drop(columns='won')
# Target: the 'won' column.
y = new_data['won']

In [None]:
# extremely skewed data: number of rows per class
y.value_counts()

In [None]:
# Hold out 20% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

## kNN Classifier (original data)

In [None]:
# Sweep k = 1..9 and record the precision of the positive class for each k.
k_range = range(1, 10)
scores = {}

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # precision ratio: tp / (tp + fp), aiming to minimize fp (predict: win, actual: lose)
    scores[k] = precision_score(y_test, y_pred)

# Same scores as a list ordered by k, for plotting.
scores_list = [scores[k] for k in k_range]

In [None]:
# find the (k, score) pair with the highest precision score of the positive class (1)
import operator
max(scores.items(), key=operator.itemgetter(1))

In [None]:
# Precision of the positive class as a function of k (model fitted on the original data).
plt.plot(k_range, scores_list)
plt.xlabel('Value of K for KNN')
plt.ylabel('Precision Score of the positive class (1)')
plt.title('Original Data')

In [None]:
# Time the fit + predict of the kNN model on the original (imbalanced) data.
start = time()

# NOTE(review): k=8 is hard-coded — presumably picked from the sweep above; confirm.
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

end = time()
running_time = end - start
print('time cost: %.5f sec' %running_time)

In [None]:
# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
labels = ['lose', 'win']
# Fix the class order so the matrix is always 2x2 (rows = true, columns = predicted).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)
fig, ax = plt.subplots()
cax = ax.matshow(cm)
ax.set_title('Confusion matrix')
fig.colorbar(cax)
# Pin one tick per class before labelling it. Labelling matshow's automatic
# ticks with a leading '' placeholder is fragile and warns on recent matplotlib.
ticks = list(range(len(labels)))
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

## kNN Classifier (under-sampling)

In [None]:
# Balance the training set by randomly discarding majority-class rows.
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
rus = RandomUnderSampler(random_state=0)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

# Sweep k = 1..9 and record the precision of the positive class for each k.
k_range = range(1, 10)
scores = {}

for k in k_range:
    # Loop-local name: do not rebind the global `knn` fitted on the original data.
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_rus, y_rus)
    y_pred = knn_k.predict(X_test)
    scores[k] = precision_score(y_test, y_pred)

# Same scores as a list ordered by k, for plotting.
scores_list = [scores[k] for k in k_range]

In [None]:
# (k, score) pair with the highest positive-class precision for the under-sampled sweep.
max(scores.items(), key=operator.itemgetter(1))

In [None]:
# Precision of the positive class as a function of k (model fitted on under-sampled data).
plt.plot(k_range, scores_list)
plt.xlabel('Value of K for KNN')
plt.ylabel('Precision Score of the positive class (1)')
plt.title('RUS Data')

In [None]:
# Time the fit + predict of the kNN model on the under-sampled training data.
start = time()

# NOTE(review): k=8 is hard-coded — presumably picked from the sweep above; confirm.
knn_rus = KNeighborsClassifier(n_neighbors=8)
knn_rus.fit(X_rus, y_rus)
y_pred = knn_rus.predict(X_test)

end = time()
running_time = end - start
print('time cost: %.5f sec' %running_time)

In [None]:
# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
labels = ['lose', 'win']
# Fix the class order so the matrix is always 2x2 (rows = true, columns = predicted).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)
fig, ax = plt.subplots()
cax = ax.matshow(cm)
ax.set_title('Confusion matrix')
fig.colorbar(cax)
# Pin one tick per class before labelling it. Labelling matshow's automatic
# ticks with a leading '' placeholder is fragile and warns on recent matplotlib.
ticks = list(range(len(labels)))
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

## kNN Classifier (over-sampling)

In [None]:
# Balance the training set by synthesizing minority-class rows with SMOTE.
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
sm = SMOTE(random_state=0)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

# Sweep k = 1..9 and record the precision of the positive class for each k.
k_range = range(1, 10)
scores = {}

for k in k_range:
    # Loop-local name: do not rebind the global `knn` fitted on the original data.
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_sm, y_sm)
    y_pred = knn_k.predict(X_test)
    scores[k] = precision_score(y_test, y_pred)

# Same scores as a list ordered by k, for plotting.
scores_list = [scores[k] for k in k_range]

In [None]:
# (k, score) pair with the highest positive-class precision for the SMOTE sweep.
max(scores.items(), key=operator.itemgetter(1))

In [None]:
# SMOTE data: precision of the positive class as a function of k
plt.plot(k_range, scores_list)
plt.xlabel('Value of K for KNN')
plt.ylabel('Precision Score of the positive class (1)')
plt.title('SMOTE Data')

In [None]:
# Time the fit + predict of the kNN model on the SMOTE over-sampled training data.
start = time()

# NOTE(review): k=2 is hard-coded — presumably picked from the sweep above; confirm.
knn_sm = KNeighborsClassifier(n_neighbors=2)
knn_sm.fit(X_sm, y_sm)
y_pred = knn_sm.predict(X_test)

end = time()
running_time = end - start
print('time cost: %.5f sec' %running_time)

In [None]:
# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
labels = ['lose', 'win']
# Fix the class order so the matrix is always 2x2 (rows = true, columns = predicted).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)
fig, ax = plt.subplots()
cax = ax.matshow(cm)
ax.set_title('Confusion matrix')
fig.colorbar(cax)
# Pin one tick per class before labelling it. Labelling matshow's automatic
# ticks with a leading '' placeholder is fragile and warns on recent matplotlib.
ticks = list(range(len(labels)))
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:
* Faster training speed and higher efficiency.
* Lower memory usage.
* Better accuracy.
* Support of parallel and GPU learning.
* Capable of handling large-scale data.

## LightGBM (original data)

In [None]:
# Time the training of a LightGBM binary classifier on the original data.
start = time()

d_train = lgb.Dataset(X_train, label=y_train)
# Hyper-parameters, declared in one place.
params = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 100,
    'min_data': 500,
    'max_depth': 100,
}
clf = lgb.train(params, d_train, num_boost_round=100)

end = time()
running_time = end - start
print('time cost: %.5f sec' %running_time)

In [None]:
# Prediction: the binary objective yields one score per test row.
y_prob = clf.predict(X_test)
# Convert into binary values in one vectorized step. This works for any
# test-set size, unlike a loop over a hard-coded row count.
y_pred = (y_prob >= 0.0995).astype(int)       # setting threshold

In [None]:
# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
labels = ['lose', 'win']
# Fix the class order so the matrix is always 2x2 (rows = true, columns = predicted).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)
fig, ax = plt.subplots()
cax = ax.matshow(cm)
ax.set_title('Confusion matrix')
fig.colorbar(cax)
# Pin one tick per class before labelling it. Labelling matshow's automatic
# ticks with a leading '' placeholder is fragile and warns on recent matplotlib.
ticks = list(range(len(labels)))
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

In [None]:
# plot the 10 most important features of the model trained on the original data
lgb.plot_importance(clf, max_num_features=10)

## LightGBM (under-sampling)

In [None]:
# Convert the resampled data into a DataFrame that carries X_train's column
# names before it is fed into the lgb model.
X_rus = pd.DataFrame(X_rus, columns=list(X_train))
X_rus.head()

In [None]:
# Time the training of a LightGBM binary classifier on the under-sampled data.
start = time()

d_train = lgb.Dataset(X_rus, label=y_rus)
# Hyper-parameters, declared in one place.
params = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 100,
    'min_data': 500,
    'max_depth': 100,
}
clf_rus = lgb.train(params, d_train, num_boost_round=100)

end = time()
running_time = end - start
print('time cost: %.5f sec' %running_time)

In [None]:
# Prediction: the binary objective yields one score per test row.
y_prob = clf_rus.predict(X_test)
# Convert into binary values in one vectorized step. This works for any
# test-set size, unlike a loop over a hard-coded row count.
y_pred = (y_prob >= 0.55).astype(int)       # setting threshold

In [None]:
# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
labels = ['lose', 'win']
# Fix the class order so the matrix is always 2x2 (rows = true, columns = predicted).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)
fig, ax = plt.subplots()
cax = ax.matshow(cm)
ax.set_title('Confusion matrix')
fig.colorbar(cax)
# Pin one tick per class before labelling it. Labelling matshow's automatic
# ticks with a leading '' placeholder is fragile and warns on recent matplotlib.
ticks = list(range(len(labels)))
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

In [None]:
# plot the 10 most important features of the model trained on under-sampled data
lgb.plot_importance(clf_rus, max_num_features=10)

## LightGBM (over-sampling)

In [None]:
# Convert the resampled data into a DataFrame that carries X_train's column
# names before it is fed into the lgb model.
X_sm = pd.DataFrame(X_sm, columns=list(X_train))
X_sm.head()

In [None]:
# Time the training of a LightGBM binary classifier on the SMOTE over-sampled data.
start = time()

d_train = lgb.Dataset(X_sm, label=y_sm)
# Hyper-parameters, declared in one place.
params = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 100,
    'min_data': 500,
    'max_depth': 100,
}
clf_sm = lgb.train(params, d_train, num_boost_round=100)

end = time()
running_time = end - start
print('time cost: %.5f sec' %running_time)

In [None]:
# Prediction: the binary objective yields one score per test row.
y_prob = clf_sm.predict(X_test)
# Convert into binary values in one vectorized step. This works for any
# test-set size, unlike a loop over a hard-coded row count.
y_pred = (y_prob >= 0.5).astype(int)       # setting threshold

In [None]:
# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
labels = ['lose', 'win']
# Fix the class order so the matrix is always 2x2 (rows = true, columns = predicted).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)
fig, ax = plt.subplots()
cax = ax.matshow(cm)
ax.set_title('Confusion matrix')
fig.colorbar(cax)
# Pin one tick per class before labelling it. Labelling matshow's automatic
# ticks with a leading '' placeholder is fragile and warns on recent matplotlib.
ticks = list(range(len(labels)))
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

In [None]:
# plot the 10 most important features of the model trained on SMOTE data
lgb.plot_importance(clf_sm, max_num_features=10)

* Because it processes the most data, the kNN model trained with over-sampled data took the longest time, while the LightGBM model trained with under-sampled data took the shortest time. 
* kNN models performed relatively worse with low precision score and f1-score of the positive class (1). 
* The models were trained with the aim of minimizing False Positives (predict: win, actual: lose), but it seems True Positives and False Positives are correlated. As with gambling and investment, you have the chance to win and the risk to lose at the same time.
* File sizes of LightGBM models are incredibly small and the time spent on training models is really quick.

LightGBM code reference from Medium [article](https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc) by Pushkar Mandot. Thank you for sharing your experience! =]

# DEPLOY MODELS

In [None]:
# data that has never been seen by the models (the held-out last race)
last_race

In [None]:
# drop the identifier and the target to get the deployment features,
# and keep the target separately as the deployment labels
X_deploy = last_race.drop(columns=['race_id', 'won'])
y_deploy = last_race.won

## Load kNN model trained with original data

In [None]:
# `knn` is re-bound inside the k-sweep loops of the under- and over-sampling
# sections, so by this point it no longer refers to the model fitted on the
# original training data. Refit that model (k=8, as in the original-data
# section) before scoring the held-out race.
knn_orig = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
predictions = knn_orig.predict(X_deploy)
print(classification_report(y_deploy, predictions))

Only class 0 (lose) can be predicted. 

## Load kNN model trained with under-sampled data

In [None]:
# Score the held-out race with the kNN model fitted on under-sampled data.
predictions = knn_rus.predict(X_deploy)
print(classification_report(y_deploy, predictions))

kNN model trained with under-sampled data can predict the winning horse. However, there is also one False Positive in the prediction. 

In [None]:
import numpy as np

# Fix the class order so the matrix is always 2x2 (rows = actual, columns = predicted).
data = confusion_matrix(y_deploy, predictions, labels=[0, 1])

fig, ax = plt.subplots()
cax = ax.matshow(data, cmap='RdBu')

# Write each cell's count on top of its square.
for (i, j), z in np.ndenumerate(data):
    ax.text(j, i, '{}'.format(z), ha='center', va='center',
            bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))

ax.set_title('Confusion matrix of kNN_rus', y=1.1)
fig.colorbar(cax)
labels = ['lose', 'win']
# Pin one tick per class before labelling it. Labelling matshow's automatic
# ticks with a leading '' placeholder is fragile and warns on recent matplotlib.
ticks = list(range(len(labels)))
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_xlabel('Prediction')
ax.set_ylabel('Actual')

## Load kNN model trained with over-sampled data

In [None]:
# Score the held-out race with the kNN model fitted on SMOTE over-sampled data.
predictions = knn_sm.predict(X_deploy)
print(classification_report(y_deploy, predictions))

Only class 0 (lose) can be predicted.

## Load LightGBM models & Set threshold values same as the training models

In [None]:
# Score the held-out race and convert into binary values in one vectorized
# step, so the cell does not depend on a hard-coded number of runners.
predictions = (clf.predict(X_deploy) >= 0.0995).astype(int)       # setting threshold

In [None]:
# Score the held-out race and convert into binary values in one vectorized
# step, so the cell does not depend on a hard-coded number of runners.
predictions_rus = (clf_rus.predict(X_deploy) >= 0.55).astype(int)       # setting threshold

In [None]:
# Score the held-out race and convert into binary values in one vectorized
# step, so the cell does not depend on a hard-coded number of runners.
predictions_sm = (clf_sm.predict(X_deploy) >= 0.5).astype(int)       # setting threshold

## Predictions of the LightGBM models

In [None]:
# Held-out race: LightGBM model trained on the original data.
print(classification_report(y_deploy, predictions))

In [None]:
# Held-out race: LightGBM model trained on under-sampled data.
print(classification_report(y_deploy, predictions_rus))

In [None]:
# Held-out race: LightGBM model trained on SMOTE over-sampled data.
print(classification_report(y_deploy, predictions_sm))

All LightGBM models achieve a 100% accuracy rate on this single held-out race; one race is too small a sample to conclude that the models generalize. 

In [None]:
# Fix the class order so the matrix is always 2x2 (rows = actual, columns = predicted).
data = confusion_matrix(y_deploy, predictions, labels=[0, 1])

fig, ax = plt.subplots()
cax = ax.matshow(data, cmap='RdBu')

# Write each cell's count on top of its square.
for (i, j), z in np.ndenumerate(data):
    ax.text(j, i, '{}'.format(z), ha='center', va='center',
            bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))

ax.set_title('Confusion matrix of LightGBM models', y=1.1)
fig.colorbar(cax)
labels = ['lose', 'win']
# Pin one tick per class before labelling it. Labelling matshow's automatic
# ticks with a leading '' placeholder is fragile and warns on recent matplotlib.
ticks = list(range(len(labels)))
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_xlabel('Prediction')
ax.set_ylabel('Actual')

## Conclusions:
For KNeighborsClassifier, only the model trained with under-sampled data can predict both class 0 and class 1 (with one False Positive error). The models trained on the original data and on the over-sampled data can only predict class 0.  <br/>
LightGBM models can predict all data correctly, even using the model trained with skewed dataset (by tuning the threshold value). 

Confusion matrix plot code reference from [Stack Overflow](https://stackoverflow.com/questions/20998083/show-the-values-in-the-grid-using-matplotlib) user Joe Kington. Thank you for sharing your experience! =]