In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

import sumo

# Optional filter passed to the sumo.load_* helpers below. The commented line
# shows an example list of basho ids; None presumably means "load everything"
# -- TODO confirm against sumo.load_banzuke / sumo.load_history.
# BANZUKE_RANGE = [201701, 201703, 201705, 201707, 201709]
BANZUKE_RANGE = None
# Database handle shared by every loading cell in this notebook.
db = sumo.connect()

# Sumo Prediction Model

## Preprocessing

First, load the banzuke and bout-history data from the database and preprocess it into a single pandas DataFrame.

In [None]:
# Load the banzuke table and the bout history for the configured range.
banzukes = sumo.load_banzuke(db, BANZUKE_RANGE)
history_df = sumo.load_history(db, BANZUKE_RANGE)
# Attach the banzuke columns to every bout, matching on (bid, rid).
# Presumably `banzukes` is indexed by (bid, rid) -- TODO confirm in sumo.load_banzuke.
history_df = history_df.join(banzukes, on=['bid', 'rid'])
history_df.head()

In [None]:
def load_rikishi(db):
    """Load per-basho height/weight records for every rikishi.

    Reads all documents from ``db.rikishi``, expands each document's
    ``history`` entries into rows, and returns a DataFrame indexed by
    ``(bid, rid)`` with parsed ``height`` and ``weight`` columns.

    Parameters
    ----------
    db
        Database handle exposing ``db.rikishi.find`` (looks like a
        MongoDB-style collection API -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        One row per (basho, rikishi) history entry. ``height`` and ``weight``
        hold the split string tokens; no numeric cast is applied here.
    """

    def parse_weight(column):
        """Strip the 'cm'/'kg' units, split on single spaces, keep token 2.

        Returns a one-column DataFrame (double-bracket selection), not a Series.
        Presumably the raw value looks like '<height> cm <weight> kg', so
        token 2 is the weight -- TODO confirm against the stored data.
        """
        x = column.str.replace('cm', '').str.replace('kg', '').str.split(' ', expand=True)[[2]]
        return x
        
    def parse_height(column):
        """Strip the 'cm'/'kg' units, split on single spaces, keep token 0.

        Returns a one-column DataFrame (double-bracket selection), not a Series.
        """
        x = column.str.replace('cm', '').str.replace('kg', '').str.split(' ', expand=True)[[0]]
        return x
    
    # Fetch every rikishi document.
    collection = db.rikishi.find({})
    rikishi = (pd.DataFrame(list(collection))
                   # Rikishi without any history cannot contribute rows.
                   .dropna(subset=['history'])
                   .groupby('_id')
                   # Turn each document's history entries into a DataFrame.
                   .apply(lambda x: pd.concat([pd.DataFrame(y) for y in x['history']], axis=1))
                   .reset_index()
                   # Use the same key names as the history/banzuke tables.
                   .rename({'_id': 'rid', 'banzuke': 'bid'}, axis=1)
                   .assign(rid=lambda x: x.rid.astype(int), 
                           bid=lambda x: x.bid.astype(int))
                   .set_index(['bid', 'rid'])
                   .sort_values(['rid', 'bid'])
                   # 'level_1' is the inner index level exposed by reset_index()
                   # after the groupby-apply above.
                   .drop('level_1', axis=1)
                   # Fill gaps in the raw measurement per rikishi: next known
                   # value first (bfill), then last known value (ffill).
                   # NOTE(review): on pandas >= 2.0 a like-indexed groupby.apply
                   # may prepend an extra 'rid' index level -- verify.
                   .groupby('rid').apply(lambda x: x.assign(weight=lambda y: y.weight.bfill().ffill()))
                   # Order matters: height is parsed from the raw `weight`
                   # string, so it must be assigned before `weight` is replaced.
                   .assign(height=lambda x: parse_height(x.weight))
                   .assign(weight=lambda x: parse_weight(x.weight))
              )
    return rikishi

# Materialise the table once; later cells join it onto the feature matrix.
rikishi = load_rikishi(db)
rikishi.head()

## Create Feature Matrix

Create a feature matrix with the following features for each prediction. All properties should be calculated excluding the "current" basho:
 - Total number of previous wins
 - Total number of previous losses
 - Total number of previous absent days
 - Average number of losses
 - Average number of wins
 - Average number of absent days
 - Sliding window average wins
 - Sliding window average losses
 - Sliding window average absent days
 - Current ranking

In [None]:
from sumo import elo

def create_feature_matrix(df):
    """Derive per-bout features that only use information from earlier bouts.

    The frame is ordered by ``(bid, day, rid)`` and, for every row, the
    following columns are added:

    * ``career_wins`` -- running sum of ``result`` over the rikishi's earlier rows.
    * ``current_basho_wins`` -- the same running sum restricted to the current basho.
    * ``last_result`` -- ``result`` of the rikishi's previous row (0 if none).
    * ``num_rank`` -- ordinal code of ``rank`` within ``sumo.SUMO_RANKS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Bout history providing ``bid``, ``day``, ``rid``, ``result`` and ``rank``.

    Returns
    -------
    pandas.DataFrame
        The input rows plus the new columns, indexed by ``(bid, day, rid)``.
    """

    def prior_cumsum(frame, keys, column):
        # shift() drops the current row so the total only counts earlier bouts.
        # transform() always returns a result aligned with `frame`'s index;
        # groupby.apply() prepends the group keys on pandas >= 2.0, which
        # breaks the alignment that assign() needs.
        return (frame.groupby(keys)[column]
                     .transform(lambda s: s.shift().fillna(0).cumsum()))

    def career_total(x, column):
        return prior_cumsum(x, ['rid'], column)

    def cum_basho_total(x, column):
        return prior_cumsum(x, ['bid', 'rid'], column)

    def last_value(x, column):
        # Built-in group-wise shift; like-indexed on every pandas version.
        return x.groupby('rid')[column].shift().fillna(0)

    return (df.sort_values(['bid', 'day', 'rid'])
               .reset_index()
               .assign(career_wins=lambda x: career_total(x, 'result'),
                       current_basho_wins=lambda x: cum_basho_total(x, 'result'),
                       last_result=lambda x: last_value(x, 'result'),
                       num_rank=lambda x: pd.Categorical(x['rank'].values, categories=sumo.SUMO_RANKS, ordered=True).codes,
                      )
                .set_index(['bid', 'day', 'rid'])
           )
# Build the per-bout feature rows from the joined history table.
features = create_feature_matrix(history_df)

# Attach each rikishi's height/weight for that basho; clashing column names
# from `rikishi` get the '_r' suffix. Missing measurements become 0.
features = (features.join(rikishi, on=['bid', 'rid'], rsuffix='_r')
             .assign(weight=lambda x: x.weight.fillna(value=0),
                     height=lambda x: x.height.fillna(value=0))
            )

# Self-join: look up the opponent's feature row for the same (bid, day) and
# append it with the '_opp' suffix, so each row describes both wrestlers.
features = features.join(features, on=['bid', 'day', 'opponent'], rsuffix='_opp', how='right')
# Re-apply the level names on the index after the join.
features.index = features.index.set_names(['bid', 'day', 'rid'])

def get_mask(x):
    """Return a boolean Series that keeps the first row of each bout pairing.

    A bout appears twice in the data (A vs B and B vs A). Sorting each
    (rid, opponent) pair makes both orderings identical, so the second
    occurrence can be detected as a duplicate.

    Note: `x` is modified in place -- its index is reset so that `rid` is
    available as a column and so that the returned mask lines up with the
    new positional index. Callers index `x` with the mask afterwards.
    """
    x.reset_index(inplace=True)
    # Order-independent key for every bout: (smaller id, larger id).
    pair_keys = np.sort(x[['rid', 'opponent']].values, axis=1)
    is_repeat = pd.DataFrame(pair_keys, index=x.index).duplicated()
    return ~is_repeat

# remove duplicate pairs like:
# > rikishi1 - rikishi2
# > rikishi2 - rikishi1
# get_mask() resets each group's index in place, so after apply() the frame
# carries 'bid'/'day' both as group keys and as columns, plus an unnamed inner
# level ('level_2'); the drops below remove those leftovers before the
# (bid, day, rid) index is rebuilt.
features = (features
                .groupby(['bid', 'day'])
                .apply(lambda x: x.loc[get_mask(x)])
                .drop(['day', 'bid'], axis=1)
                .reset_index()
                .set_index(['bid', 'day', 'rid'])
                .drop(['level_2'], axis=1)
             )
# Discard every bout that still has a missing value in any column.
features = features.dropna()

# Presumably adds the `elo` / `elo_opp` rating columns read just below
# -- TODO confirm in sumo.elo.rankings.
features = elo.rankings(features, 'rid', 'opponent', 'result')
# Head-to-head differences (this rikishi minus the opponent).
# `wins_last_basho` / `absent_last_basho` are not created in this notebook;
# presumably they come from the banzuke join -- verify.
features = features.assign(elo_diff=lambda x: x.elo - x.elo_opp,
                           rank_diff=lambda x: x.num_rank - x.num_rank_opp,
                           wins_last_basho_diff=lambda x: x.wins_last_basho - x.wins_last_basho_opp,
                           absent_last_basho_diff=lambda x: x.absent_last_basho - x.absent_last_basho_opp,
                          )
features

# Examining Features

In [None]:
# Bar chart of the `wins` column against `rank`, one bar per row of `features`.
# NOTE(review): the saved output below is a Series of axes indexed by rank,
# which looks like it came from an earlier grouped version of this call -- re-run to refresh.
features.plot(kind='bar', x='rank', y='wins')

rank
Y      AxesSubplot(0.125,0.125;0.775x0.755)
O      AxesSubplot(0.125,0.125;0.775x0.755)
S      AxesSubplot(0.125,0.125;0.775x0.755)
K      AxesSubplot(0.125,0.125;0.775x0.755)
M1     AxesSubplot(0.125,0.125;0.775x0.755)
M2     AxesSubplot(0.125,0.125;0.775x0.755)
M3     AxesSubplot(0.125,0.125;0.775x0.755)
M4     AxesSubplot(0.125,0.125;0.775x0.755)
M5     AxesSubplot(0.125,0.125;0.775x0.755)
M6     AxesSubplot(0.125,0.125;0.775x0.755)
M7     AxesSubplot(0.125,0.125;0.775x0.755)
M8     AxesSubplot(0.125,0.125;0.775x0.755)
M9     AxesSubplot(0.125,0.125;0.775x0.755)
M10    AxesSubplot(0.125,0.125;0.775x0.755)
M11    AxesSubplot(0.125,0.125;0.775x0.755)
M12    AxesSubplot(0.125,0.125;0.775x0.755)
M13    AxesSubplot(0.125,0.125;0.775x0.755)
M14    AxesSubplot(0.125,0.125;0.775x0.755)
M15    AxesSubplot(0.125,0.125;0.775x0.755)
M16    AxesSubplot(0.125,0.125;0.775x0.755)
dtype: object

# Model Training & Prediction

In [167]:
# Columns excluded from the model input; the same names with the '_opp'
# suffix (the opponent's copy from the self-join) are excluded as well.
drop_columns = ['shikona', 'rank', 'wins', 'loss', 'absent', 'result', 'opponent', 'score', 'rank_r', 'hoshi', 'prizes']
drop_columns += [name + '_opp' for name in drop_columns]
# mat = mat.dropna()
# NOTE(review): `kimarite` / `kimarite_opp` remain in `mat` (see the output
# below); if they describe the bout being predicted they would leak
# post-bout information -- verify.
mat = features.drop(drop_columns, axis=1)

from sklearn import preprocessing

# Scale every feature to [0, 1].
# NOTE(review): the scaler is fitted on all rows, before any train/test split.
X = mat.values
min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)

# Target: the bout result cast to int.
y = features.result.values.astype(int)
mat.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,kimarite,wins_last_basho,absent_last_basho,career_wins,current_basho_wins,last_result,num_rank,weight,height,kimarite_opp,...,last_result_opp,num_rank_opp,weight_opp,height_opp,elo,elo_opp,elo_diff,rank_diff,wins_last_basho_diff,absent_last_basho_diff
bid,day,rid,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
192701,1,3652,103.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,103,...,0,9,0,0,1000.0,1000.0,0.0,-9.0,0.0,0.0
192701,1,3663,106.0,0.0,0.0,0.0,0.0,0.0,18.0,0,0,106,...,0,15,0,0,1000.0,1000.0,0.0,3.0,0.0,0.0
192701,1,3666,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,77,...,0,11,0,0,1000.0,1000.0,0.0,-11.0,0.0,0.0
192701,1,3668,93.0,0.0,0.0,0.0,0.0,0.0,6.0,0,0,93,...,0,12,0,0,1000.0,1000.0,0.0,-6.0,0.0,0.0
192701,1,3671,85.0,0.0,0.0,0.0,0.0,0.0,7.0,0,0,85,...,0,3,0,0,1000.0,1000.0,0.0,4.0,0.0,0.0


In [164]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

def baseline_model(X, y, random_state=None):
    """Fit a shallow random forest and return its held-out predictions.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    random_state : int or None, optional
        Seed forwarded to both the train/test split and the classifier.
        The default (None) keeps the previous unseeded behaviour; pass an
        integer to make the run reproducible.

    Returns
    -------
    tuple
        ``(y_test, y_hat)`` -- the held-out labels and the predicted labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    cls = RandomForestClassifier(max_depth=4, random_state=random_state)
    cls.fit(X_train, y_train)
    y_hat = cls.predict(X_test)
    return y_test, y_hat

def nn_model(X, y):
    """Train a small fully connected binary classifier and print its progress.

    The data is split with ``train_test_split``; the held-out part is passed
    to Keras as ``validation_data``, so per-epoch validation metrics appear in
    the training log. The model summary is printed before training.

    Parameters
    ----------
    X : array-like
        Feature matrix; its column count sets the width of the first two layers.
    y : array-like
        Binary target labels.

    Returns
    -------
    None
    """
    # Imported lazily so the rest of the notebook does not require Keras.
    from keras.models import Sequential
    from keras.layers import Dense

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    n_features = X_train.shape[1]
    model = Sequential()
    model.add(Dense(n_features, input_dim=n_features, activation='relu'))
    # Hidden stack: one more full-width layer, then narrowing down to 5 units.
    for units in (n_features, 20, 15, 10, 5):
        model.add(Dense(units, activation='relu'))
    # Single sigmoid unit for the binary outcome.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

   
# Keep the baseline's (y_test, y_hat) pair: the evaluation cell calls
# model_summary(*results) and fails with NameError on a fresh kernel if this
# assignment is skipped.
results = baseline_model(X, y)

nn_model(X, y)




Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 20)                500       
_________________________________________________________________
dense_4 (Dense)              (None, 15)                315       
_________________________________________________________________
dense_5 (Dense)              (None, 10)                160       
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 55        
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 6         
Total para

In [163]:
import matplotlib.pyplot as plt
%matplotlib widget

def model_summary(y_test, y_hat):
    """Print the accuracy and F1 score of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_hat : array-like
        Predicted labels.
    """
    from sklearn.metrics import accuracy_score, f1_score
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = accuracy_score(y_test, y_hat)
    f1 = f1_score(y_test, y_hat)

    print("Accuracy is: {}".format(accuracy))
    print("F1 score is: {}".format(f1))
    
# Expects `results` to be the (y_test, y_hat) tuple returned by baseline_model
# in the training cell.
model_summary(*results)

Accuracy is: 0.5873408239700375
F1 score is: 0.6112483240420576
