In [1]:
%load_ext autoreload
%matplotlib widget
%autoreload 2
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sumo


# Ids passed to sumo.load_banzuke / sumo.load_history below: every integer
# from 200001 up to and including 201800.
# NOTE(review): presumably YYYYMM basho ids, and None presumably means
# "no restriction" -- confirm against the sumo module.
BANZUKE_RANGE = [*range(200001, 201801)]
# BANZUKE_RANGE = None
db = sumo.connect()

# Sumo Prediction Model

## Preprocessing

First, load the data from the database and preprocess it into a tidy pandas table.

In [2]:
banzukes = sumo.load_banzuke(db, BANZUKE_RANGE)

# Bout history with the banzuke columns attached, matched on (bid, rid).
history_df = (
    sumo.load_history(db, BANZUKE_RANGE)
        .join(banzukes, on=['bid', 'rid'])
)

rikishi = sumo.load_rikishi(db)
rikishi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hoshi,prizes,rank,score,weight,height
bid,rid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
198803,1,[],[],Mz,"[0, 0]",152.0,200
198805,1,"[shiro, yasumi, yasumi, shiro, yasumi, shiro, ...",[],Jk19e,"[6, 1]",152.0,200
198807,1,"[shiro, yasumi, yasumi, shiro, kuro, yasumi, y...",[],Jd97e,"[5, 2]",152.0,200
198809,1,"[yasumi, shiro, yasumi, shiro, yasumi, kuro, s...",[],Jd52e,"[5, 2]",154.5,202
198811,1,"[shiro, yasumi, shiro, yasumi, shiro, yasumi, ...",[],Jd15w,"[6, 1]",154.5,202


## Create Feature Matrix

Create a feature matrix with the following features for each prediction. All properties should be calculated excluding the "current" basho:
 - Total number of previous wins
 - Total number of previous losses
 - Average number of absent days
 - Average number of losses
 - Average number of wins
 - Sliding window average wins
 - Sliding window average losses
 - Sliding window average absent days
 - Current ranking

In [3]:
from sumo import elo

def create_feature_matrix(df):
    """Add history features that only look at rows earlier than the current one.

    Rows are ordered by (bid, day, rid). Every derived history column is
    shifted by one row inside its group before use, so the value stored on a
    row never includes that row's own ``result``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``bid``, ``day`` and ``rid`` (as columns or index levels)
        together with ``result`` and ``rank`` columns.

    Returns
    -------
    pandas.DataFrame
        The sorted rows indexed by (bid, day, rid) with four extra columns:

        * ``career_wins``        - running sum of earlier ``result`` values per ``rid``
        * ``current_basho_wins`` - the same running sum restricted to one (bid, rid)
        * ``last_result``        - previous ``result`` of the same ``rid`` (0 if none)
        * ``num_rank``           - position of ``rank`` in ``sumo.SUMO_RANKS``
          (-1 for a rank that is not in that list)
    """

    # ``transform`` (rather than ``apply``) is used for all three helpers: the
    # result always carries the frame's own row index, so ``assign`` can align
    # it. With ``apply`` newer pandas versions prepend the group keys to the
    # index and the assignment no longer lines up.

    def career_total(x, column):
        # Running total over all earlier rows of the same rikishi.
        return x.groupby(['rid'])[column].transform(lambda s: s.shift().fillna(0).cumsum())

    def cum_basho_total(x, column):
        # Running total over earlier rows of the same rikishi within one basho.
        return x.groupby(['bid', 'rid'])[column].transform(lambda s: s.shift().fillna(0).cumsum())

    def last_value(x, column):
        # Value from the rikishi's previous row; 0 when there is no previous row.
        return x.groupby('rid')[column].transform(lambda s: s.shift().fillna(0))

    return (df.sort_values(['bid', 'day', 'rid'])
              .reset_index()
              .assign(career_wins=lambda x: career_total(x, 'result'),
                      current_basho_wins=lambda x: cum_basho_total(x, 'result'),
                      last_result=lambda x: last_value(x, 'result'),
                      num_rank=lambda x: pd.Categorical(x['rank'].values, categories=sumo.SUMO_RANKS, ordered=True).codes,
                      )
              .set_index(['bid', 'day', 'rid'])
            )
# Per-bout history features, indexed by (bid, day, rid).
features = create_feature_matrix(history_df)

# Attach the rikishi table on (bid, rid); rikishi columns whose names clash
# with existing ones get the "_r" suffix. weight and height are cast to float
# and their missing values are replaced by the column mean over all joined rows.
features = (features.join(rikishi, on=['bid', 'rid'], rsuffix='_r')
             .assign(weight=lambda x: x.weight.astype(float).fillna(value=x.weight.astype(float).mean()),
                     height=lambda x: x.height.astype(float).fillna(value=x.height.astype(float).mean()))
            )

# Self-join: each row's (bid, day, opponent) key is looked up in the index of
# the same table, so every row also carries its opponent's columns with an
# "_opp" suffix.
# NOTE(review): how='right' presumably keeps one row per index entry of the
# right-hand copy -- confirm this is intended rather than the default 'left'.
features = features.join(features, on=['bid', 'day', 'opponent'], rsuffix='_opp', how='right')
# Re-label the index levels as (bid, day, rid) after the join.
features.index = features.index.set_names(['bid', 'day', 'rid'])

def get_mask(x):
    """Flag the first occurrence of every unordered (rid, opponent) pair.

    Side effect: ``x`` has its index reset *in place*, so after the call its
    former index levels are ordinary columns and its index is positional.
    The returned mask is indexed to match that new positional index.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with an ``opponent`` column and ``rid`` available as an index
        level (it becomes a column through the in-place reset).

    Returns
    -------
    pandas.Series of bool
        True for a row whose pair has not appeared earlier in ``x`` in either
        order, False for a later repeat such as (2, 1) after (1, 2).
    """
    x.reset_index(inplace=True)
    # Sorting each row makes (a, b) and (b, a) compare equal.
    unordered_pairs = np.sort(x[['rid', 'opponent']].values, axis=1)
    is_repeat = pd.DataFrame(unordered_pairs, index=x.index).duplicated()
    return ~is_repeat

# Remove duplicate pairs within each (bid, day) group, i.e. keep only the
# first of:
# > rikishi1 - rikishi2
# > rikishi2 - rikishi1
#
# get_mask resets each group's index in place, so inside the lambda bid, day
# and rid are ordinary columns. The steps after apply() drop those bid/day
# columns, bring the group keys back with reset_index(), rebuild the
# (bid, day, rid) index and discard the leftover 'level_2' column.
# NOTE(review): 'level_2' is presumably the unnamed positional level that
# groupby.apply adds -- confirm on the pandas version in use.
features = (features
                .groupby(['bid', 'day'])
                .apply(lambda x: x.loc[get_mask(x)])
                .drop(['day', 'bid'], axis=1)
                .reset_index()
                .set_index(['bid', 'day', 'rid'])
                .drop(['level_2'], axis=1)
             )
# Drop every row that still has a missing value in any column.
features = features.dropna()

# elo.rankings is a project helper; the expressions below read the ``elo`` and
# ``elo_opp`` columns from its result.
# Every "*_diff" column is "this rikishi minus opponent". Later keyword
# arguments of assign() may use columns created by earlier ones, which is how
# BMI_diff can read BMI and BMI_opp.
# NOTE(review): the BMI formula assumes weight in kg and height in cm -- confirm.
features = (
    elo.rankings(features, 'rid', 'opponent', 'result')
       .assign(
           elo_diff=lambda f: f['elo'] - f['elo_opp'],
           rank_diff=lambda f: f['num_rank'] - f['num_rank_opp'],
           weight_diff=lambda f: f['weight'] - f['weight_opp'],
           height_diff=lambda f: f['height'] - f['height_opp'],
           BMI=lambda f: f['weight'] / ((f['height'] / 100.) ** 2.),
           BMI_opp=lambda f: f['weight_opp'] / ((f['height_opp'] / 100.) ** 2.),
           BMI_diff=lambda f: f['BMI'] - f['BMI_opp'],
           wins_last_basho_diff=lambda f: f['wins_last_basho'] - f['wins_last_basho_opp'],
           absent_last_basho_diff=lambda f: f['absent_last_basho'] - f['absent_last_basho_opp'],
           result=lambda f: f['result'].astype(float),
       )
)
features

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,kimarite,opponent,result,rank,shikona,wins,loss,absent,wins_last_basho,absent_last_basho,...,elo_opp,elo_diff,rank_diff,weight_diff,height_diff,BMI,BMI_opp,BMI_diff,wins_last_basho_diff,absent_last_basho_diff
bid,day,rid,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
200001,1,1,40.0,29,1.0,Y,Akebono,11.0,4.0,0.0,0.0,0.0,...,1000.000000,0.000000,-4.0,81.500000,20.500000,55.659859,44.492221,11.167638,0.0,0.0
200001,1,2,74.0,38,1.0,Y,Takanohana,12.0,3.0,0.0,0.0,0.0,...,1000.000000,0.000000,-4.0,-21.000000,-3.000000,43.844202,48.314896,-4.470694,0.0,0.0
200001,1,4,74.0,7,1.0,Y,Musashimaru,2.0,2.0,11.0,0.0,0.0,...,1000.000000,0.000000,-2.0,55.500000,7.000000,62.527127,51.132213,11.394913,0.0,0.0
200001,1,5,40.0,12,1.0,S,Takanonami,10.0,5.0,0.0,0.0,0.0,...,1000.000000,0.000000,-4.0,41.500000,21.100000,43.853340,41.558104,2.295236,0.0,0.0
200001,1,6,74.0,34,1.0,S,Musoyama,13.0,2.0,0.0,0.0,0.0,...,1000.000000,0.000000,-3.0,29.700000,8.000000,50.803403,45.938791,4.864611,0.0,0.0
200001,1,8,33.0,14,1.0,O,Chiyotaikai,9.0,6.0,0.0,0.0,0.0,...,1000.000000,0.000000,-4.0,-15.000000,-9.000000,48.114503,47.808985,0.305519,0.0,0.0
200001,1,9,29.0,25,1.0,M4,Akinoshima,7.0,8.0,0.0,0.0,0.0,...,1000.000000,0.000000,-1.0,38.500000,-4.000000,51.102041,36.827814,14.274226,0.0,0.0
200001,1,10,9.0,26,1.0,M4,Tochinonada,6.0,9.0,0.0,0.0,0.0,...,1000.000000,0.000000,1.0,15.500000,4.500000,46.506667,44.193616,2.313051,0.0,0.0
200001,1,11,8.0,33,0.0,M6,Aogiyama,4.0,10.0,1.0,0.0,0.0,...,1000.000000,0.000000,-1.0,40.000000,-3.500000,45.837792,32.432432,13.405359,0.0,0.0
200001,1,13,74.0,31,1.0,K,Tosanoumi,8.0,7.0,0.0,0.0,0.0,...,1000.000000,0.000000,2.0,0.000000,5.400000,46.248121,49.055137,-2.807016,0.0,0.0


# Examining Features

In [4]:
# Distribution of Elo rating per rank.
plt.close("all")
fig, ax = plt.subplots(figsize=(15, 7))
features.boxplot(by='rank', column=['elo'], ax=ax)
# Label the value axis so the figure can be read on its own; the trailing
# semicolon keeps the return value out of the cell output.
ax.set_ylabel('elo');

FigureCanvasNbAgg()

<matplotlib.axes._subplots.AxesSubplot at 0x7f2436603e48>

In [22]:
# Histogram of the rank difference between the two rikishi of each bout.
fig, ax = plt.subplots(figsize=(15, 7))
# The former by='day' argument is dropped: the recorded output of this cell is
# a single Axes, i.e. no per-day grouping was drawn, and newer pandas versions
# interpret ``by`` for histograms as "one subplot per group", which cannot be
# drawn onto the single ``ax`` passed here.
features.plot(kind='hist', y='rank_diff', ax=ax, bins=30)
ax.set_xlabel('rank_diff')
ax.set_title('Distribution of rank_diff');

FigureCanvasNbAgg()

<matplotlib.axes._subplots.AxesSubplot at 0x7f2436bee9b0>

# Model Training & Prediction

In [167]:
from sklearn import preprocessing

# Columns removed before modelling, together with their "_opp" counterparts.
drop_columns = ['shikona', 'rank', 'wins', 'loss', 'absent', 'result', 'opponent', 'score', 'rank_r', 'hoshi', 'prizes']
drop_columns.extend([name + '_opp' for name in drop_columns])
mat = features.drop(columns=drop_columns)
# NOTE(review): kimarite / kimarite_opp are not in the drop list and so stay in
# the model matrix -- confirm they are known before a bout takes place.

# Rescale every remaining column with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# that happens inside baseline_model / nn_model.
min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(mat.values)

# Prediction target: the bout result as an integer array.
y = features['result'].values.astype(int)
mat.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,kimarite,wins_last_basho,absent_last_basho,career_wins,current_basho_wins,last_result,num_rank,weight,height,kimarite_opp,...,last_result_opp,num_rank_opp,weight_opp,height_opp,elo,elo_opp,elo_diff,rank_diff,wins_last_basho_diff,absent_last_basho_diff
bid,day,rid,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
192701,1,3652,103.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,103,...,0,9,0,0,1000.0,1000.0,0.0,-9.0,0.0,0.0
192701,1,3663,106.0,0.0,0.0,0.0,0.0,0.0,18.0,0,0,106,...,0,15,0,0,1000.0,1000.0,0.0,3.0,0.0,0.0
192701,1,3666,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,77,...,0,11,0,0,1000.0,1000.0,0.0,-11.0,0.0,0.0
192701,1,3668,93.0,0.0,0.0,0.0,0.0,0.0,6.0,0,0,93,...,0,12,0,0,1000.0,1000.0,0.0,-6.0,0.0,0.0
192701,1,3671,85.0,0.0,0.0,0.0,0.0,0.0,7.0,0,0,85,...,0,3,0,0,1000.0,1000.0,0.0,4.0,0.0,0.0


In [164]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

def baseline_model(X, y, random_state=None):
    """Fit a depth-4 random forest on a random split and predict the held-out part.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target labels aligned with ``X``.
    random_state : optional
        Forwarded to both ``train_test_split`` and ``RandomForestClassifier``.
        The default ``None`` keeps the previous unseeded behaviour; pass an
        int to make the split and the forest reproducible.

    Returns
    -------
    tuple
        ``(y_test, y_hat)``: the held-out labels and the predictions for them.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    cls = RandomForestClassifier(max_depth=4, random_state=random_state)
    cls.fit(X_train, y_train)
    # Only hard predictions are returned, so class probabilities are not computed.
    y_hat = cls.predict(X_test)
    return y_test, y_hat

def nn_model(X, y):
    """Train a small fully connected binary classifier and report validation metrics.

    The data is split with ``train_test_split``; the held-out part is passed to
    Keras as validation data, so per-epoch validation loss and accuracy are
    printed during ``fit``. The model summary is printed before training.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary target.

    Returns
    -------
    None
        The trained model is not returned; the function is used for its
        printed output only.
    """
    # Imported here so the rest of the notebook works without Keras installed.
    from keras.models import Sequential
    from keras.layers import Dense

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    n_features = X_train.shape[1]
    model = Sequential()
    # Input layer is as wide as the feature vector ...
    model.add(Dense(n_features, input_dim=n_features, activation='relu'))
    # ... followed by progressively narrower ReLU layers ...
    for width in (n_features, 20, 15, 10, 5):
        model.add(Dense(width, activation='relu'))
    # ... and a single sigmoid unit for the binary outcome.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

   
# Keep the baseline predictions: the evaluation cell further down calls
# model_summary(*results) and fails on a fresh kernel if this is not run.
results = baseline_model(X, y)

nn_model(X, y)




Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 20)                500       
_________________________________________________________________
dense_4 (Dense)              (None, 15)                315       
_________________________________________________________________
dense_5 (Dense)              (None, 10)                160       
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 55        
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 6         
Total para

In [163]:


def model_summary(y_test, y_hat):
    """Print accuracy and F1 score of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_hat : array-like
        Predicted labels, aligned with ``y_test``.
    """
    from sklearn.metrics import accuracy_score, f1_score

    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = accuracy_score(y_test, y_hat)
    f1 = f1_score(y_test, y_hat)

    print("Accuracy is: {}".format(accuracy))
    print("F1 score is: {}".format(f1))

model_summary(*results)

Accuracy is: 0.5873408239700375
F1 score is: 0.6112483240420576
