# Elo Rating History

In [31]:
%matplotlib ipympl
import pandas as pd
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')
    
plt.figure(figsize=(10, 5))
ax = plt.subplot()

def plot_elo(bout_df, name, ax, **kwargs):
    rikishi = bout_df.loc[bout_df.name == name]
    rikishi = rikishi.loc[rikishi.date >= pd.datetime(2010, 1, 1)]
    rikishi = rikishi.sort_values(axis=0, by='date')
    for i, g in rikishi.groupby('tournament'):
        ax.plot(g.date, g.elo.values,  marker='', label=g.name.unique()[0], **kwargs)

    years = mdates.YearLocator()   # every year
    months = mdates.MonthLocator()  # every month
    yearsFmt = mdates.DateFormatter('%Y')
    
    # format the ticks
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(yearsFmt)
    ax.xaxis.set_minor_locator(months)

    ax.set_ylabel("Elo Rating")
    ax.set_xlabel("Basho")


patches = []
ax.set_title('Elo History for Tochinoshin')
for name, c in [('Tochinoshin', 'purple')]:
    plot_elo(bout_df, name, ax, color=c)
    patches.append(mpatches.Patch(color=c, label=name))
ax.legend(handles=patches)



FigureCanvasNbAgg()

<matplotlib.legend.Legend at 0x7f79399052b0>

In [348]:
plt.savefig("img/tochinoshin.png")

In [294]:
number_of_bouts = bout_df[bout_df.date > pd.datetime(1990, 1, 1)].shape[0]
number_of_rikishi = bout_df[bout_df.date > pd.datetime(1990, 1, 1)].name.unique().shape[0]

print(number_of_bouts, number_of_rikishi)

97632 262


In [317]:
import random
aki_basho = ['Kakuryu', 'Hakuho', 'Kisenosato', 'Goeido', 'Takayasu', 'Tochinoshin', 'Mitakeumi', 
              'Ichinojo', 'Tamawashi', 'Takakeisho', 'Ikioi', 'Kaisei', 'Yutakayama', 'Chiyotairyu', 
              'Shodai', 'Endo', 'Chiyonokuni', 'Abi', 'Myogiryu', 'Asanoyama']

aki_basho += ['Kagayaki', 'Onosho', 'Shohozan', 'Tochiozan', 'Takarafuji', 'Kotoshogiku', 'Hokutofuji', 'Daishomaru', 
             'Aoiyama', 'Daiesho', 'Sadanoumi', 'Kyokutaisei', 'Okinoumi', 'Nishikigi', 'Ryuden', 'Takanoiwa', 'Takanosho', 
             'Chiyomaru', 'Chiyoshoma', 'Yoshikaze', 'Kotoyuki', 'Ishiura']


aki_rikishi_bout_history = bout_df.loc[bout_df.name.isin(aki_basho)]
aki_rikishi_bout_history.sort_values('date', inplace=True)

last_rating = pd.concat([aki_rikishi_bout_history[aki_rikishi_bout_history.name == name].tail(1).loc[:, ['name', 'elo']]
               for name in aki_basho])
last_rating = last_rating.set_index('name')
last_rating = last_rating.sort_values('elo', ascending=False)

ratings = defaultdict(lambda: 1000)
for row in last_rating.iterrows():
    ratings[row[0]] = row[1].elo
    
# def generate_opponents(names):
#     bouts = {}
#     NUM_DAYS = 15
#     for name in names:
#         opponents = list(filter(lambda n: n != name, names))
#         bouts[name] = np.random.choice(opponents, replace=False, size=NUM_DAYS)
#     return bouts

def pop_random(lst):
    idx = random.randrange(0, len(lst))
    return lst.pop(idx)

def generate_random_bouts(all_names):
    NUM_DAYS = 15
    bouts = []

    history = defaultdict(list)
    
    for day in range(NUM_DAYS):
        day_names = all_names.copy()
        while len(day_names) > 0:
            name = day_names.pop()
            
            opponent = np.random.choice(list(filter(lambda x: x not in history[x] and x != name, day_names)), size=1)[0]
            day_names.remove(opponent)
            
            bout = name, opponent
            bouts.append(bout)
            
            history[name].append(opponent)
            history[opponent].append(name)
      
    return bouts

def simulate_bouts(bouts, ratings):
    score_dist = defaultdict(int)
    for name, opponent in bouts:
        current_rank = ratings[name]
        opponent_rank = ratings[opponent]
        p = elo.expected_outcome(current_rank, opponent_rank)
        outcome = 1 if np.random.random() < p else 0
        if outcome == 1:
            score_dist[name] += 1
            score_dist[opponent] += 0
        else:
            score_dist[opponent] += 1
            score_dist[name] += 0

    return score_dist
    

def simulate_tournament(names, ratings, samples=100000):
    score_dists = []
    for i in range(samples):
#         bouts = generate_opponents(names)
#         score_dist = simulate_single_tournament(bouts, ratings)
        bouts = generate_random_bouts(names)
        score_dist = simulate_bouts(bouts, ratings)
        score_dists.append(pd.Series(score_dist))
        if i % 1000 == 0:
            print("Simulating tournament: {}".format(i))
    return sum(score_dists) / samples

outcome = simulate_tournament(aki_basho, ratings, samples=1000)
outcome

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


NameError: name 'defaultdict' is not defined

In [363]:
import re
rank_regex = re.compile(r'([A-Z])([0-9]*)')

aki_basho = ['Kakuryu', 'Hakuho', 'Kisenosato', 'Goeido', 'Takayasu', 'Tochinoshin', 'Mitakeumi', 
              'Ichinojo', 'Tamawashi', 'Takakeisho', 'Ikioi', 'Kaisei', 'Yutakayama', 'Chiyotairyu', 
              'Shodai', 'Endo', 'Chiyonokuni', 'Abi', 'Myogiryu', 'Asanoyama']

aki_basho += ['Kagayaki', 'Onosho', 'Shohozan', 'Tochiozan', 'Takarafuji', 'Kotoshogiku', 'Hokutofuji', 'Daishomaru', 
             'Aoiyama', 'Daiesho', 'Sadanoumi', 'Kyokutaisei', 'Okinoumi', 'Nishikigi', 'Ryuden', 'Takanoiwa', 'Takanosho', 
             'Chiyomaru', 'Chiyoshoma', 'Yoshikaze', 'Kotoyuki', 'Ishiura']

aki_ranks = ['Y', 'Y', 'Y', 'O', 'O', 'O', 'S', 'S', 'K', 'K', 'M1', 'M1', 'M2', 'M2', 'M3', 'M3', 'M4', 'M4', 'M4', 'M5',
             'M5', 'M6', 'M6', 'M7', 'M7', 'M8', 'M8', 'M9', 'M9', 'M10', 'M10', 'M11', 'M11', 'M12', 'M13', 'M13', 'M14',
             'M14', 'M15', 'M15', 'M16', 'M16']

def parse_rank_coarse(rank):
    matches = re.match(rank_regex, rank)
    g = matches.groups()
    if g[0] in ['Y', 'O', 'K', 'S']:
        if g[0] == 'Y':
            return 1
        elif g[0] == 'O':
            return 1
        elif g[0] == 'S':
            return 2
        elif g[0] == 'K':
            return 2
    elif g[0] == 'M' and g[1] is not None:
        if 0 < int(g[1]) <= 5:
            return 3
        elif 5 < int(g[1]) <= 10:
            return 4
        elif 10 < int(g[1]) <= 18:
            return 5
    else:
        return 6
    
banzuke = pd.DataFrame(list(zip(aki_basho, aki_ranks)), columns=['name', 'rank'])
banzuke['rank_coarse'] = banzuke['rank'].apply(parse_rank_coarse)
banzuke = banzuke.set_index('name')
# banzuke['predicted_wins'] = outcome
# banzuke = banzuke.sort_values(['rank_coarse', 'predicted_wins'], ascending=[True, False])
# banzuke.predicted_wins = banzuke.predicted_wins.round(2)

# banzuke['elo'] = ratings.values()
# banzuke.elo = banzuke.elo.round(2)
banzuke.groupby('rank_coarse').head(1)
# banzuke
# print(banzuke[['rank', 'elo', 'predicted_wins']].to_csv())
# outcome.sort_values(ascending=False)

Unnamed: 0_level_0,rank,rank_coarse
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kakuryu,Y,1
Mitakeumi,S,2
Ikioi,M1,3
Onosho,M6,4
Kyokutaisei,M11,5


In [32]:
plt.figure(figsize=(10, 10))
ax = plt.subplot()
    
def plot_by_rank(df, name, ax):
    labels = ['Y', 'O', 'S', 'K', 'M1', 'M2', 'M3']
    gs = [g[name] for i, g in df.groupby('rank_ordered')]
    ax.hist(gs, density=True, label=labels)
    ax.legend()
    return ax
    
plot_by_rank(bout_df, 'elo', ax)
ax.set_title('Elo by Rank')

FigureCanvasNbAgg()

Text(0.5,1,'Elo by Rank')

In [60]:
plt.figure(figsize=(10, 10))
ax = plt.subplot()

def violinplot_by_rank(df, name, ax):
    data = np.array([g[name].values for i, g in df.groupby('wins')])

    ax.violinplot(
            data, showmeans=True, showmedians=True,
            showextrema=True, vert=False)
    labels = ['Y', 'O', 'S', 'K', 'M1', 'M2', 'M3']
    labels = np.arange(len(data))
    ax.set_yticks(np.arange(1, len(labels)+1))
    ax.set_yticklabels(labels)

violinplot_by_rank(banzuke_df, 'elo', ax)
ax.set_title("Wins by Rank")
ax.set_xlabel('elo')
ax.set_ylabel('wins')



FigureCanvasNbAgg()

Text(0,0.5,'wins')

In [288]:
df = banzuke_df.copy()
df = df.loc[df.name == 'Akebono']
plt.figure()
plt.scatter(df['elo'], df['wins'], c=df['rank_ordered'])

FigureCanvasNbAgg()

<matplotlib.collections.PathCollection at 0x7f79069114a8>

In [284]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
df = banzuke_df.copy()
df = df.loc[df.absent == 0]
df['elo'] = MinMaxScaler().fit_transform(np.atleast_2d(df['elo'].values).T)
df.rank_ordered = df.rank_ordered - 1
# df['wins'] = MinMaxScaler().f it_transform(np.atleast_2d(df['wins'].values).T)
ranks = df.rank_ordered.unique() 
num_ranks = len(ranks)


X_train, X_test, y_train, y_test = train_test_split(df.drop('wins', axis=1), df['wins'])
rank_index = X_train['rank_ordered']

df_test = X_test
df_test['wins'] = y_test

df_train = X_train
df_train['wins'] = y_train

In [285]:
import pymc3 as pm


with pm.Model() as mdl_ols:

    ## Define weakly informative Normal priors to give Ridge regression
    b0 = pm.Normal('a', mu=5, sd=1, shape=num_ranks)
    b1 = pm.Normal('b', mu=0, sd=1, shape=num_ranks)

    ## Define linear model
    yest = b0[rank_index] + b1[rank_index] * X_train['elo']

    ## Define Normal likelihood
    likelihood = pm.StudentT('likelihood', nu=5, mu=yest, sd=1, observed=y_train)
    
with mdl_ols:
    ## take samples
    traces_ols = pm.sample()
    
plt.figure()
_ = pm.traceplot(traces_ols, figsize=(12, len(traces_ols.varnames)*1.5),
                lines={k: v['mean'] for k, v in pm.summary(traces_ols[-1000:]).iterrows()})

INFO (theano.gof.compilelock): Refreshing lock /home/jovyan/.theano/compiledir_Linux-4.9-linuxkit-aufs-x86_64-with-debian-buster-sid-x86_64-3.6.3-64/lock_dir/lock
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [b, a]
Sampling 2 chains: 100%|██████████| 2000/2000 [01:16<00:00, 26.27draws/s]


FigureCanvasNbAgg()

In [286]:
from sklearn.metrics import mean_squared_error

plt.close('all')

def _model(x, c_index):
    return traces_ols['a'][samples_greater_than:, c_index].mean() + traces_ols['b'][samples_greater_than:, c_index].mean() * x
        
def mse_summary(df):
    total_mse = 0
    for i, g in df.groupby('rank_ordered'):
        rank_name = g['rank'].values[0]
        c_index = g.rank_ordered.values[0]
        y_hat = model(g.elo, c_index)
        y = g.wins

        mse = mean_squared_error(y, y_hat)
        total_mse += mse
        print("Mean squared error for rank {} ({}): {:.2f}".format(c_index, rank_name, mse))

    total_mse /= len(ranks)
    print("Average MSE: {:.2f}".format(total_mse))
    
def plot_model(df):
    fig, axis = plt.subplots(num_ranks, 1, figsize=(10, 20), sharey=True, sharex=True)
    axis = axis.ravel()

    samples_greater_than = 500

    for i, c in enumerate(ranks):
        c_data = df[df.rank_ordered == c]
        c_data = c_data.reset_index(drop = True)
        c_index = c

        xvals = np.linspace(0, 1)

        for a_val, b_val in zip(traces_ols['a'][samples_greater_than:, c_index], traces_ols['b'][samples_greater_than:, c_index]):
            axis[i].plot(xvals, a_val + b_val * xvals, 'b', alpha=.1)

        axis[i].scatter(c_data.elo, c_data.wins,
                        alpha=1, color='k', marker='.', s=80, label='original data')

        axis[i].plot(xvals, _model(xvals, c_index), 'b', alpha=1, lw=2., color='r', label='Mean')

        axis[i].set_xticks([0,1])

        axis[i].set_ylim(0, 17)
        axis[i].set_title(c)
        axis[i].set_xlabel('elo')

plot_model(df_train)
mse_summary(df_test)

FigureCanvasNbAgg()

Mean squared error for rank 0 (Y): 2.52
Mean squared error for rank 1 (O): 3.31
Mean squared error for rank 2 (S): 4.66
Mean squared error for rank 3 (K): 5.34
Mean squared error for rank 4 (M3): 5.33
Mean squared error for rank 5 (M6): 4.09
Mean squared error for rank 6 (M16): 4.58
Average MSE: 4.26


In [368]:
def _model(x, c_index):
    return traces_ols['a'][samples_greater_than:, c_index].mean() + traces_ols['b'][samples_greater_than:, c_index].mean() * x

last_rating = pd.concat([aki_rikishi_bout_history[aki_rikishi_bout_history.name == name].tail(1).loc[:, ['name', 'elo']]
               for name in aki_basho])
last_rating = last_rating.set_index('name')
last_rating = last_rating.sort_values('elo', ascending=False)

    
banzuke['last_rating'] = last_rating
banzuke.last_rating = banzuke.last_rating.fillna(1000)
banzuke['last_rating'] = MinMaxScaler().fit_transform(np.atleast_2d(banzuke['last_rating'].values).T)
banzuke['rank_ordered'] = banzuke['rank_coarse']

scores = pd.DataFrame(_model(banzuke.last_rating, banzuke.rank_ordered))

scores = scores.sort_values(['rank', 'last_rating'], ascending=[True, False])
scores.last_rating = scores.last_rating.round(2)
scores

# scores.groupby('rank_ordered').head(1)

KeyError: 'rank'

In [142]:
elo_dist = bout_df.groupby('rank_ordered').elo.describe()
elo_dist.index = labels
elo_dist

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Y,4682.0,1117.137216,35.852449,989.762415,1098.344392,1114.618128,1141.906696,1212.418483
O,8852.0,1075.574999,34.806938,959.728177,1052.024849,1076.809657,1102.404578,1195.255526
S,5365.0,1057.801734,33.56775,944.703022,1035.450537,1057.593626,1078.698004,1158.87003
K,5012.0,1032.489069,27.420801,934.253641,1014.649371,1033.443128,1049.875085,1134.84235
M1,23620.0,1005.826466,28.271118,892.284864,987.040662,1005.24023,1024.112259,1121.397508
M2,23683.0,987.956176,25.661609,884.569185,970.683161,987.172193,1004.249061,1105.319571
M3,24148.0,977.547762,26.93716,863.928551,960.429834,977.992731,995.165397,1085.404247


In [35]:
banzuke_df = pd.read_pickle("banzuke_data.pkl")
banzuke_df.head()

Unnamed: 0,rank,rank_debut,score,name,rikishi_id,tournament,height,weight,kanto,shukun,...,career_losses,career_wins,last_absent,last_losses,last_wins,total_exp,win_per_bout,day,elo,elo_likelihood
0,M14,1.0,9-6,Akebono,1,199009,204.0,185.0,False,False,...,0,0,0,0,0,0,0.0,1,1000.0,0.581934
1,M7,0.0,9-6,Akebono,1,199011,204.0,185.0,True,False,...,6,9,0,6,9,15,0.6,1,1013.908066,0.579074
2,M1,0.0,8-7,Akebono,1,199101,204.0,185.0,False,True,...,12,18,0,6,9,30,0.6,1,1029.353289,0.396911
3,K,1.0,8-7,Akebono,1,199103,204.0,185.0,False,True,...,19,26,0,7,8,45,0.577778,1,1036.30487,0.264644
4,S,1.0,7-8,Akebono,1,199105,204.0,194.0,False,False,...,26,34,0,7,8,60,0.566667,1,1056.943117,0.682833


In [34]:
plt.figure(figsize=(8, 5))
ax = plt.subplot()

rikishi = bout_df.loc[bout_df['name'] == 'Kisenosato']

def make_record_table(rikishi):
    rikishi = rikishi.drop_duplicates(['tournament', 'day'])
    record = rikishi.pivot(index='tournament', columns='day', values='result')
    record.fillna(-1, inplace=True)
    return record

def plot_rikishi_record(rikishi, ax, index=-1):
    record = make_record_table(rikishi)
    record = record.tail(index)

    ax.set_xticklabels(record.columns)
    ax.set_yticklabels(record.index)

    ax.set_xticks(np.arange(len(record.columns)))
    ax.set_yticks(np.arange(len(record.index)))

    ax.set_xticks(np.arange(-.5,len(record.columns)), minor=True)
    ax.set_yticks(np.arange(-.5, len(record.index)), minor=True)

    ax.set_xlabel('Day')
    ax.set_ylabel('Basho')

    ax.grid(False)
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
    im = ax.imshow(record.values, cmap='Pastel1_r')
    
    import matplotlib.patches as mpatches

    values = np.unique(record.values.ravel())
    labels = values.copy().astype(str)
    labels[labels == '0.0'] = 'Loss'
    labels[labels == '-1.0'] = 'Absent'
    labels[labels == '1.0'] = 'Win'
    colors = [ im.cmap(im.norm(value)) for value in values]

    patches = [ mpatches.Patch(color=colors[i], label="{}".format(labels[i]) , edgecolor='b' ) for i in range(len(values)) ]

    ax.legend(handles=patches, bbox_to_anchor=(1.05, 1),loc=2, borderaxespad=0.5, frameon=True)

plot_rikishi_record(rikishi, ax, index=10)

FigureCanvasNbAgg()



In [282]:
yokozuna_names = bout_df.loc[bout_df['rank'] == 'Y'].name.unique()
yokozuna_names = filter(lambda n: n not in ['Hakuho', 'Kisenosato', 'Kakuryu'], yokozuna_names)
yokozuna_bouts = [bout_df.loc[bout_df['name'] == name] for name in yokozuna_names]
data = map(lambda x: make_record_table(x).tail(5).replace(-1, 0).sum(axis=1).reset_index(drop=True), yokozuna_bouts)
data = pd.concat(data, axis=1).T

yokozuna_names = bout_df.loc[bout_df['rank'] == 'Y'].name.unique()
yokozuna_names = list(filter(lambda n: n not in ['Hakuho', 'Kisenosato', 'Kakuryu'], yokozuna_names))
data.index = yokozuna_names


from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

kmeans = KMeans(n_clusters=3)
kmeans.fit(data.values)


plt.figure()
ax = plt.subplot()


for i, row in enumerate(data.values):
    cd = {0: 'red', 1: 'blue', 2: 'green'}
    c = cd[kmeans.labels_[i]]
    ax.plot(np.arange(5)+1, row, c=c)    

data['group'] = kmeans.labels_
print (data)
    
kinesato_bouts = [bout_df.loc[bout_df['name'] == "Kisenosato"]]
kinesato_data = map(lambda x: make_record_table(x).tail(5).replace(-1, 0).sum(axis=1).reset_index(drop=True), kinesato_bouts)
kinesato_data = np.array(list(kinesato_data))

print(kmeans.predict(kinesato_data))

ax.scatter(np.arange(5)+1, kinesato_data, c='lightblue')
ax.set_xticks(np.arange(5)+1)



FigureCanvasNbAgg()

                0     1     2     3     4  group
Chiyonofuji  13.0  12.0  13.0   2.0   1.0      1
Hokutoumi    12.0  13.0   8.0   2.0   0.0      1
Onokuni       8.0  10.0  10.0  12.0   2.0      0
Asahifuji    10.0  13.0   7.0   2.0   0.0      1
Akebono      12.0  13.0  13.0  13.0  14.0      2
Takanohana   15.0  12.0  13.0  12.0   4.0      0
Musashimaru  10.0  13.0   4.0   2.0   3.0      1
Wakanohana   13.0   5.0   3.0   7.0   2.0      1
Asashoryu    12.0  10.0  15.0  11.0  13.0      2
Harumafuji   10.0  10.0  11.0  12.0   0.0      0
[1]


[<matplotlib.axis.XTick at 0x17c1a4be0>,
 <matplotlib.axis.XTick at 0x17c1a4550>,
 <matplotlib.axis.XTick at 0x17c1a4438>,
 <matplotlib.axis.XTick at 0x17c276e10>,
 <matplotlib.axis.XTick at 0x17c2762e8>]

In [50]:
print(banzuke_df.columns)
subset = banzuke_df[['elo', 'wins', 'rank_ordered']]

import seaborn as sns
sns.pairplot(subset, diag_kind='kde', hue='wins')

Index(['rank', 'rank_debut', 'score', 'name', 'rikishi_id', 'tournament',
       'height', 'weight', 'kanto', 'shukun', 'jun_yusho', 'kinboshi',
       'rank_ordered', 'wins', 'loss', 'absent', 'BMI', 'career_absent',
       'career_losses', 'career_wins', 'last_absent', 'last_losses',
       'last_wins', 'total_exp', 'win_per_bout', 'day', 'elo',
       'elo_likelihood'],
      dtype='object')


FigureCanvasNbAgg()

<seaborn.axisgrid.PairGrid at 0x7f79334b6080>