In [1]:
%matplotlib ipympl
from glob import glob
import pickle
import pandas as pd
import numpy as np


bout_file_names = sorted(glob("./data/bouts_*.pkl"))
bout_data = {}
for name in bout_file_names:
    date = name.split('_')[1][:-4]
    with open(name, 'rb') as f:
        bout_data[date] = pickle.load(f)
        
raw_bouts = list(filter(lambda item: len(item[1].keys()) > 0, bout_data.items()))

banzuke_file_names = sorted(glob("./data/banzuke_*.pkl"))
banzuke_data = {}
for name in banzuke_file_names:
    date = name.split('_')[1][:-4]
    with open(name, 'rb') as f:
        banzuke_data[date] = pickle.load(f)
        
raw_banzuke = list(filter(lambda item: len(item[1]) > 0, banzuke_data.items()))

In [2]:
rikishi_file_names = sorted(glob("./data/rikishi_*.pkl"))
rikishi_data = []
for name in rikishi_file_names:
    with open(name, 'rb') as f:
        rikishi_data.append(pickle.load(f))
raw_rikishi = list(filter(lambda item: len(item) > 0, rikishi_data))

In [3]:
def parse_prize(prizes, prize_name):
    for prize in prizes:
        if prize_name in prize:
            return True
    return False

def parse_kinboshi(prizes):
    kinboshi_count = 0
    for prize in prizes:
        if 'Kinboshi' in prize:
            parts = prize.split(' ')
            if len(parts) > 1:
                kinboshi_count += int(parts[0])
            else:
                kinboshi_count += 1
    return kinboshi_count

def parse_yusho(prizes):
    for prize in prizes:
        if prize_name in prize and '-' not in prize:
            return True
    return False

import re
rank_regex = re.compile(r'(Mz|Bg|Kg|([A-Za-z]+)([0-9]+)[A-Za-z]+)')

def parse_rank(rank):
    matches = re.match(rank_regex, rank)
    if matches is None:
        print(rank)
    g = matches.groups()
    if g[1] is not None and g[1] in ['Y', 'O', 'K', 'S']:
        return g[1]
    elif g[1] is not None and g[2] is not None:
        return g[1] + g[2]
    else:
        return g[0]
    
def parse_rank_ordered(rank):
    matches = re.match(rank_regex, rank)
    if matches is None:
        print(rank)
    g = matches.groups()
    if g[1] is not None and g[1] in ['Y', 'O', 'K', 'S']:
        if g[1] == 'Y':
            return 1
        elif g[1] == 'O':
            return 2
        elif g[1] == 'S':
            return 3
        elif g[1] == 'K':
            return 4
    elif g[1] == 'M' and g[2] is not None:
        if 0 < int(g[2]) <= 5:
            return 5
        elif 5 < int(g[2]) <= 10:
            return 6
        elif 10 < int(g[2]) <= 18:
            return 7
    else:
        return 8
    
def parse_rikishi_data(rikishi):
    name = rikishi['name']
    rikishi_id = rikishi['id']
    birth_date = rikishi['Birth Date']
    birth_date = pd.to_datetime(" ". join(birth_date.split(' ')[:3]))
    
    shusshin = rikishi['Shusshin']
#     print(rikishi)
#     intai = rikishi['Intai'].replace('.', '')
    heya = rikishi['Heya']
    hatsu_dohyo = rikishi['Hatsu Dohyo'].replace(".", '')
    hatsu_dohyo = hatsu_dohyo.split(" ")[0]

    rikishi_info = [name, rikishi_id, birth_date, shusshin, heya, hatsu_dohyo]
    
    rikishi_history = pd.DataFrame(rikishi['history'])
    rikishi_history['name'] = rikishi['name']
    rikishi_history['rikishi_id'] = rikishi['id']
    rikishi_history['tournament'] = rikishi_history['date'].str.replace('.', '')

    rikishi_history['size'] = rikishi_history['size'].replace('', np.nan)
    
    if not rikishi_history['size'].isnull().all():
        rikishi_history['size'] = rikishi_history['size'].bfill()
        size_info = rikishi_history['size'].str.split(' ', expand=True)

        rikishi_history['height'] = size_info[0].astype(float)
        rikishi_history['weight'] = size_info[2].astype(float)
    else:
        rikishi_history['height'] = rikishi_history['size'].fillna(0)
        rikishi_history['weight'] = rikishi_history['size'].fillna(0)
    
    rikishi_history['kanto'] = rikishi_history['prizes'].apply(lambda x: parse_prize(x, 'Kanto-sho'))
    rikishi_history['shukun'] = rikishi_history['prizes'].apply(lambda x: parse_prize(x, 'Shukun-sho'))
    rikishi_history['jun_yusho'] = rikishi_history['prizes'].apply(lambda x: parse_prize(x, 'Jun-Yusho'))
    rikishi_history['kinboshi'] = rikishi_history['prizes'].apply(parse_kinboshi)
    rikishi_history['rank_ordered'] = rikishi_history['rank'].apply(parse_rank_ordered)
    rikishi_history['rank'] = rikishi_history['rank'].apply(parse_rank)
    
    rikishi_history.drop(['size', 'prizes', 'date'], axis=1, inplace=True)
    
    return rikishi_info, rikishi_history
    
        
    
all_infos = []
all_history = []

for rikishi in rikishi_data:
    info, history = parse_rikishi_data(rikishi)
    all_infos.append(info)
    all_history.append(history)
    

all_infos = pd.DataFrame(all_infos, columns=['name', 'rikishi_id', 'birth_date', 'shusshin', 'heya', 'hatsu_dohyo'])
all_history = pd.concat(all_history)
all_infos

Unnamed: 0,name,rikishi_id,birth_date,shusshin,heya,hatsu_dohyo
0,Akebono,1,1969-05-08,"U.S.A., Hawaii, Oahu",Azumazeki,198803
1,Tochinonada,10,1974-02-26,"Ishikawa-ken, Nanao-shi",Kasugano,199601
2,Asasekiryu,1009,1981-08-07,"Mongolia, Ulan-Bator",Wakamatsu - Takasago,200001
3,Hakuba,1011,1983-05-05,"Mongolia, Ulan-Bator",Tatsutagawa - Michinoku,200001
4,Shimotori,1018,1978-03-18,"Niigata-ken, Arai-shi",Tokitsukaze,200005
5,Ryuo,1031,1983-03-11,"Mongolia, Ulan-Bator",Miyagino,200003
6,Hochiyama,1032,1982-01-18,"Aomori-ken, Hirosaki-shi",Nakadachi - Sakaigawa,200003
7,Buyuzan,105,1974-07-29,"Aichi-ken, Toyohashi-shi",Musashigawa,199703
8,Kasuganishiki,107,1975-08-22,"Chiba-ken, Isumi-gun, Misaki-machi - Chiba-ken...",Kasugano,199103
9,Iwakiyama,1087,1976-03-02,"Aomori-ken, Nakatsugaru-gun, Iwaki-cho - Aomor...",Nakadachi - Sakaigawa,200007


In [4]:
def result_to_numeric(result):
     return 1 if result  == 'shiro' else 0

In [5]:
from collections import defaultdict

bout_df = []
ratings = defaultdict(lambda: 1000)

for date, tournament in raw_bouts:
    
    for name, bouts in tournament.items():
        for bout in bouts:
            
            row = {
                'name': name, 
                'tournament': date,
                'opponent': bout['opponent'],
                'day': bout['day'],
                'result': result_to_numeric(bout['result'])
            }
            bout_df.append(row)
            
bout_df = pd.DataFrame(bout_df)
print(bout_df.shape)
bout_df.head(5)

(105088, 5)


Unnamed: 0,day,name,opponent,result,tournament
0,1,Tochinishiki,Wakanoumi,1,196001
1,2,Tochinishiki,Wakamaeda,1,196001
2,3,Tochinishiki,Iwakaze,1,196001
3,4,Tochinishiki,Aonosato,1,196001
4,5,Tochinishiki,Kitanonada,1,196001


In [6]:
from collections import defaultdict
career_wins = defaultdict(int)
career_losses = defaultdict(int)
career_absent = defaultdict(int)

last_wins = defaultdict(int)
last_losses = defaultdict(int)
last_absent = defaultdict(int)

banzuke_features_by_years = []

for date, banzuke in raw_banzuke:
    for rikishi_data in banzuke:
        name = rikishi_data['name']
        
        row = {
            'name': name,
            'tournament': date,
            'career_wins': career_wins[name],
            'career_losses': career_losses[name],
            'career_absent': career_absent[name],
            'last_wins': last_wins[name],
            'last_losses': last_losses[name],
            'last_absent': last_absent[name]
        }

        last_wins[name] = rikishi_data['wins']
        last_losses[name] = rikishi_data['losses']
        last_absent[name] = rikishi_data['absent']
        
        career_wins[name] += rikishi_data['wins']
        career_losses[name] += rikishi_data['losses']
        career_absent[name] += rikishi_data['absent']
        
        banzuke_features_by_years.append(row)
        
banzuke_features_by_years = pd.DataFrame(banzuke_features_by_years)
banzuke_features_by_years.tail(5)

Unnamed: 0,career_absent,career_losses,career_wins,last_absent,last_losses,last_wins,name,tournament
7408,0,124,101,0,10,5,Chiyomaru,201809
7409,0,97,83,0,11,4,Chiyoshoma,201809
7410,9,560,526,0,13,2,Yoshikaze,201809
7411,24,188,163,1,13,1,Kotoyuki,201809
7412,0,80,70,0,8,7,Ishiura,201809


In [13]:
from collections import defaultdict
import elo
import importlib
importlib.reload(elo)

def result_to_numeric(result):
     return 1 if result  == 'shiro' else 0

def mean_regression(ratings, weight=.8):
    for name, r in ratings.items():
        ratings[name] = elo.mean_regression(r, c=weight)
    return ratings

elos = []
ratings = defaultdict(lambda: 1000)
for keys, tournament in bout_df.groupby('tournament'):
    ratings = mean_regression(ratings, weight=.9)

    for key, day in tournament.groupby('day'):
        new_ratings = ratings.copy()
        
        for index, bout in day.iterrows():
            current_rating = ratings[bout['name']]
            opponent_rating = ratings[bout['opponent']]
            
            likelihood = elo.expected_outcome(current_rating, opponent_rating)
            elo_row = {
                'name': bout['name'],
                'tournament': bout['tournament'],
                'day': bout['day'],
                'elo': current_rating,
                'elo_likelihood': likelihood,
            }
                

            result = bout['result']
            new_ratings[bout['name']] += elo.update_rating_sigmoid(current_rating, opponent_rating, result, k=20.)
            
            elos.append(elo_row)
            
        ratings = new_ratings

elos = pd.DataFrame(elos)

In [14]:

best_win_streak = defaultdict(int)

streak_data = []
for keys, tournament in bout_df.groupby(['tournament']):
    tournament_win_streak = defaultdict(int)
    tournament_wins = defaultdict(int)
    tournament_losses = defaultdict(int)
    previous_result = defaultdict(lambda: -1)
    
    for key, bouts in tournament.groupby(['day']):
        for key, bout in bouts.iterrows():
            name = bout['name']
            result = bout['result']
            
            row = {
                'name': name,
                'tournament': bout['tournament'],
                'day': bout['day'],
                'best_win_streak': best_win_streak[name],
                'tournament_win_streak': tournament_win_streak[name],
                'current_tournament_wins': tournament_wins[name],
                'previous_result': previous_result[name],
                'on_7_margin': int(tournament_wins[name] == 7),
                '7_and_7': int(tournament_wins[name] == 7 and tournament_losses[name] == 7),
            }

            streak_data.append(row)

            if result > 0:
                tournament_win_streak[name] += 1
                tournament_wins[name] += 1
            else:
                tournament_win_streak[name] = 0
                tournament_losses[name] += 1
            
            previous_result[name] = result

            best_win_streak[name] = max(tournament_win_streak[name], best_win_streak[name])

streak_data = pd.DataFrame(streak_data)
streak_data

Unnamed: 0,7_and_7,best_win_streak,current_tournament_wins,day,name,on_7_margin,previous_result,tournament,tournament_win_streak
0,0,0,0,1,Tochinishiki,0,-1,196001,0
1,0,0,0,1,Wakanohana,0,-1,196001,0
2,0,0,0,1,Asashio,0,-1,196001,0
3,0,0,0,1,Wakahaguro,0,-1,196001,0
4,0,0,0,1,Kotogahama,0,-1,196001,0
5,0,0,0,1,Annenyama,0,-1,196001,0
6,0,0,0,1,Kitabayama,0,-1,196001,0
7,0,0,0,1,Dewanishiki,0,-1,196001,0
8,0,0,0,1,Kashiwado,0,-1,196001,0
9,0,0,0,1,Tamanoumi,0,-1,196001,0


In [15]:
bout_data = bout_df.merge(banzuke_features_by_years, on=['name', 'tournament'])
bout_data = bout_data.merge(elos, on=['name', 'tournament', 'day'])
bout_data = bout_data.merge(streak_data, on=['name', 'tournament', 'day'])

bout_data['year'] = bout_data['tournament'].str[:4]
bout_data['month'] = bout_data['tournament'].str[4:]
bout_data['date'] = bout_data['year'] + '-' + bout_data['month'] + '-' + bout_data['day'].astype(str)
bout_data['date'] = pd.to_datetime(bout_data['date'])


opponent_data = ['name', 'tournament', 'day', 'career_wins', 'career_losses', 'career_absent', 
                 'last_wins', 'last_losses', 'last_absent', 'elo', 'elo_likelihood', 'best_win_streak', 
                 'tournament_win_streak', 'current_tournament_wins', 'previous_result',
                 'on_7_margin', '7_and_7']
bout_data = bout_data.merge(bout_data[opponent_data], 
                            left_on=['opponent', 'tournament', 'day'], right_on=['name', 'tournament', 'day'],
                            suffixes=('', '_opponent'))

# bout_features.merge(bout_data[['name', 'career_wins', 'career_losses', 'elo']], left_on ='opponent', right_on='name', suffixes=('', '_opponent'))
# bout_data = bout_data.merge(all_infos[['name', 'rikishi_id', 'shusshin']], on=['name'])

bout_data = bout_data.merge(all_history[['rikishi_id', 'tournament', 'name', 'rank', 'rank_ordered', 'height', 'weight']],
                            left_on=['name', 'tournament'], right_on=['name', 'tournament'])

bout_data = bout_data.merge(all_history[['rikishi_id', 'tournament', 'name', 'rank', 'rank_ordered', 'height', 'weight']],
                            left_on=['opponent', 'tournament'], right_on=['name', 'tournament'],
                           suffixes=('', '_opponent'))

bout_data = bout_data.merge(all_infos[['rikishi_id', 'shusshin', 'birth_date']], 
                           left_on=['rikishi_id'], right_on=['rikishi_id'])
bout_data = bout_data.merge(all_infos[['rikishi_id', 'shusshin', 'birth_date']], 
                           left_on=['rikishi_id_opponent'], right_on=['rikishi_id'],
                           suffixes=('', '_opponent'))

bout_data['elo_diff'] = bout_data['elo'] - bout_data['elo_opponent']
bout_data['weight_diff'] = bout_data['weight'] - bout_data['weight_opponent']
bout_data['height_diff'] = bout_data['height'] - bout_data['height_opponent']

bout_data['BMI'] = bout_data['weight'] / np.square(bout_data['height'] / 100.)
bout_data.loc[bout_data.height == 0, 'BMI'] = 0
bout_data['BMI_opponent'] = bout_data['weight_opponent'] / np.square(bout_data['height_opponent'] / 100.)
bout_data.loc[bout_data.height_opponent == 0, 'BMI_opponent'] = 0
bout_data['BMI_diff'] = bout_data['BMI'] - bout_data['BMI_opponent']

bout_data['rank_ordered_diff'] = bout_data['rank_ordered'] - bout_data['rank_ordered_opponent']


bout_data['total_exp'] = bout_data['career_wins'] + bout_data['career_losses']
bout_data['total_exp_opponent'] = bout_data['career_wins_opponent'] + bout_data['career_losses_opponent']
bout_data['total_exp_diff'] = bout_data['total_exp'] - bout_data['total_exp_opponent']

bout_data['win_per_bout'] = bout_data['career_wins'] / bout_data['total_exp']
bout_data.loc[bout_data.total_exp == 0, 'win_per_bout'] = 0
bout_data['win_per_bout_opponent'] = bout_data['career_wins_opponent'] / bout_data['total_exp_opponent']
bout_data.loc[bout_data.total_exp_opponent == 0, 'win_per_bout_opponent'] = 0

bout_data['age_diff'] = bout_data['birth_date'] - bout_data['birth_date_opponent']
bout_data['age_diff'] = bout_data['age_diff'].dt.days
bout_data['age_diff'] = bout_data['age_diff'].astype(int)

bout_data.drop(['name_opponent', 'birth_date', 'birth_date_opponent'], axis=1, inplace=True)
bout_data.tail(5)

Unnamed: 0,day,name,opponent,result,tournament,career_absent,career_losses,career_wins,last_absent,last_losses,...,BMI,BMI_opponent,BMI_diff,rank_ordered_diff,total_exp,total_exp_opponent,total_exp_diff,win_per_bout,win_per_bout_opponent,age_diff
97663,8,Daieisho,Amakaze,1,201609,0,41,34,0,10,...,42.351987,59.221408,-16.869422,0,75,0,75,0.453333,0.0,857
97664,4,Kagayaki,Amakaze,1,201609,0,19,11,0,8,...,38.739295,59.221408,-20.482113,0,30,0,30,0.366667,0.0,1060
97665,5,Daishomaru,Amakaze,0,201609,0,21,24,0,8,...,49.959184,59.221408,-9.262225,-1,45,0,45,0.533333,0.0,3
97666,9,Nishikigi,Amakaze,1,201609,0,14,16,0,6,...,43.918596,59.221408,-15.302812,-1,30,0,30,0.533333,0.0,-316
97667,1,Chiyoshoma,Amakaze,1,201609,0,0,0,0,0,...,35.295172,59.221408,-23.926237,0,0,0,0,0.0,0.0,13


In [16]:
bout_data.to_pickle("bout_data.pkl")

In [23]:
total_num_wins = bout_df.loc[bout_df['result'] == 1].shape[0]
total_num_loss = bout_df.loc[bout_df['result'] == 0].shape[0]

print("Total number of wins:   {}".format(total_num_wins))
print("Total number of losses: {}".format(total_num_loss))
print("Win/Loss class ratio:   {}".format(total_num_wins/total_num_loss))

Total number of wins:   49400
Total number of losses: 50214
Win/Loss class ratio:   0.9837893814474051


In [29]:
from sklearn.preprocessing import LabelEncoder

sample = banzuke_history.copy()
sample.head()

rikishi = np.unique(sample.name)
name_encoder = LabelEncoder()
name_encoder.fit(rikishi)
idx = name_encoder.transform(sample.name)
print(idx)


num_rikishi = len(rikishi)

results = sample.wins
sample.head()

[  0   0   0 ... 213 182 182]


Unnamed: 0,rank,rank_debut,score,name,rikishi_id,tournament,height,weight,kanto,shukun,...,career_losses,career_wins,last_absent,last_losses,last_wins,total_exp,win_per_bout,day,elo,elo_likelihood
0,M14,1.0,9-6,Akebono,1,199009,204.0,185.0,False,False,...,0,0,0,0,0,0,0.0,1,1000.0,0.567526
1,M7,0.0,9-6,Akebono,1,199011,204.0,185.0,True,False,...,6,9,0,6,9,15,0.6,1,1013.420523,0.56303
2,M1,0.0,8-7,Akebono,1,199101,204.0,185.0,False,True,...,12,18,0,6,9,30,0.6,1,1026.693461,0.422776
3,K,1.0,8-7,Akebono,1,199103,204.0,185.0,False,True,...,19,26,0,7,8,45,0.577778,1,1029.059075,0.32048
4,S,1.0,7-8,Akebono,1,199105,204.0,194.0,False,False,...,26,34,0,7,8,60,0.566667,1,1044.267547,0.645489


## Banzuke History for Regression Analysis

In [17]:
banzuke_history = all_history.copy()

# Correct score into proper features
scores = banzuke_history.score.str.split(' ').apply(lambda x: x[0])
scores[scores.str.count('-') == 1] = scores[scores.str.count('-') == 1].apply(lambda x: x + '-0')
scores = scores.str.split('-', expand=True)
banzuke_history['wins'] = scores[0].astype(int)
banzuke_history['loss'] = scores[1].astype(int)
banzuke_history['absent'] = scores[2].str.replace('d', '').astype(int)

banzuke_history['BMI'] = banzuke_history['weight'] / np.square(banzuke_history['height'] / 100.)
banzuke_history.loc[banzuke_history.height == 0, 'BMI'] = 0

banzuke_history.rank_debut[banzuke_history.rank_debut == True] = 1
banzuke_history.rank_debut[banzuke_history.rank_debut == False] = 0

banzuke_history = banzuke_history.merge(banzuke_features_by_years, on=['name', 'tournament'])

banzuke_history['total_exp'] = banzuke_history['career_wins'] + banzuke_history['career_losses']

banzuke_history['win_per_bout'] = banzuke_history['career_wins'].astype(float) / banzuke_history['total_exp'].astype(float)
banzuke_history.loc[banzuke_history.total_exp == 0, 'win_per_bout'] = 0

banzuke_history = banzuke_history.merge(elos.loc[elos.day == 1], on=['name', 'tournament'])

banzuke_history.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,rank,rank_debut,score,name,rikishi_id,tournament,height,weight,kanto,shukun,...,career_losses,career_wins,last_absent,last_losses,last_wins,total_exp,win_per_bout,day,elo,elo_likelihood
0,M14,1.0,9-6,Akebono,1,199009,204.0,185.0,False,False,...,0,0,0,0,0,0,0.0,1,1000.0,0.581934
1,M7,0.0,9-6,Akebono,1,199011,204.0,185.0,True,False,...,6,9,0,6,9,15,0.6,1,1013.908066,0.579074
2,M1,0.0,8-7,Akebono,1,199101,204.0,185.0,False,True,...,12,18,0,6,9,30,0.6,1,1029.353289,0.396911
3,K,1.0,8-7,Akebono,1,199103,204.0,185.0,False,True,...,19,26,0,7,8,45,0.577778,1,1036.30487,0.264644
4,S,1.0,7-8,Akebono,1,199105,204.0,194.0,False,False,...,26,34,0,7,8,60,0.566667,1,1056.943117,0.682833
5,M1,0.0,8-7,Akebono,1,199107,204.0,194.0,False,False,...,34,41,0,8,7,75,0.546667,1,1039.272298,0.342259
6,K,0.0,7-8,Akebono,1,199109,204.0,194.0,False,False,...,41,49,0,7,8,90,0.544444,1,1056.073721,0.575293
7,M1,0.0,8-7,Akebono,1,199111,204.0,194.0,False,False,...,49,56,0,8,7,105,0.533333,1,1043.456172,0.495591
8,K,0.0,13-2 J,Akebono,1,199201,204.0,194.0,True,True,...,56,64,0,7,8,120,0.533333,1,1052.405413,0.502486
9,S,0.0,8-7,Akebono,1,199203,204.0,194.0,False,False,...,58,77,0,2,13,135,0.57037,1,1119.673017,0.707713


In [18]:
banzuke_history.to_pickle("banzuke_data.pkl")

In [138]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, Normalizer, StandardScaler, Binarizer

X_all = banzuke_history[banzuke_history.absent == 0]
X_all = X_all.drop(['wins', 'loss', 'absent', 'tournament', 'name', 'rikishi_id', 'score', 
                              'kanto', 'shukun', 'jun_yusho', 'kinboshi', 'elo_likelihood'], axis=1)

# name_encoder = LabelEncoder()
# all_names = pd.concat((X_all['name'], X_all['opponent']))
# name_encoder.fit(all_names)
# X_all['name'] = name_encoder.transform(X_all['name'])
# X_all['opponent'] = name_encoder.transform(X_all['opponent'])

rank_encoder = LabelEncoder()
rank_encoder.fit(X_all['rank'])
X_all['rank'] = rank_encoder.transform(X_all['rank'])

# shusshin_encoder = LabelEncoder()
# all_shusshin = pd.concat((X_all['shusshin'], X_all['shusshin_opponent']))
# shusshin_encoder.fit(all_shusshin)
# X_all['shusshin'] = shusshin_encoder.transform(X_all['shusshin'])
# X_all['shusshin_opponent'] = shusshin_encoder.transform(X_all['shusshin_opponent'])

# normalizer = StandardScaler()
# X_all = pd.DataFrame(normalizer.fit_transform(X_all), columns=X_all.columns)

y_all = banzuke_history.loc[banzuke_history.absent==0]['wins']

print(X_all.shape)
X_all.head()

(6226, 16)


Unnamed: 0,rank,rank_debut,height,weight,rank_ordered,BMI,career_absent,career_losses,career_wins,last_absent,last_losses,last_wins,total_exp,win_per_bout,day,elo
0,6,1.0,204.0,185.0,7,44.454056,0,0,0,0,0,0,0,0.0,1,1000.0
1,15,0.0,204.0,185.0,6,44.454056,0,6,9,0,6,9,15,0.6,1,1013.420523
2,1,0.0,204.0,185.0,5,44.454056,0,12,18,0,6,9,30,0.6,1,1026.693461
3,0,1.0,204.0,185.0,4,44.454056,0,19,26,0,7,8,45,0.577778,1,1029.059075
4,19,1.0,204.0,194.0,3,46.616686,0,26,34,0,7,8,60,0.566667,1,1044.267547


In [362]:
hakuho = banzuke_history[banzuke_history.name=='Hakuho']
hakuho = hakuho[['elo', 'win_per_bout', 'rank_ordered', 'wins', 'height', 'weight']]
# normalizer = StandardScaler()
# hakuho = pd.DataFrame(normalizer.fit_transform(hakuho), columns=hakuho.columns)
hakuho.sample(5)

Unnamed: 0,elo,win_per_bout,rank_ordered,wins,height,weight
458,1162.965995,0.807771,1,12,191.5,152.2
462,1170.959419,0.823315,1,15,192.0,153.8
430,1084.978516,0.72,3,8,191.5,141.0
454,1147.014307,0.787879,1,14,191.5,152.2
484,1139.019756,0.847554,1,13,193.0,150.7
