# Предсказание бойца - победителя UFC
## Подготовка и исследование данных
### Межгалактический хакатон 2022
Команда "Meldonium" <br/>
Состав: Вячеслав Барков, Павел Мамаев, Сергей Глуховский, Алексей Недоливко, Андрей Рем, Иван Ершов


### Настроим окружение

In [24]:
import pandas as pd
import numpy as np
import json
from scipy import stats
from datetime import datetime
from typing import Dict, List
import ast
import pickle
import os

ROOT_FOLDER = '.'
DATA_FOLDER = os.path.join(ROOT_FOLDER,'data/')

### Вспомогательные функции

In [25]:
def parse_odds(row: pd.Series) -> pd.Series:
    """
    Parse 'avgOdds' column.
    :param row: Row of the events dataframe.
    :return: pd.Series with odds for the 1st and the 2nd fighters.
    """
    avg_odds = row["avgOdds"]
    if avg_odds == "[]" or avg_odds == np.nan:
        return pd.Series([np.nan] * 2)
    avg_odds = ast.literal_eval(avg_odds)
    if avg_odds[0]["fighterId"] == row["fighterId_1"]:
        return pd.Series([f.get("value", np.nan) for f in avg_odds])
    else:
        return pd.Series([f.get("value", np.nan) for f in reversed(avg_odds)])

In [26]:
fighter_stats_keys = [ "hitsTotal", "hitsSuccessful", "takedownTotal",
                        "takedownSuccessful", "submissionAttempts",
                        "takeovers", "accentedHitsTotal",
                        "accentedHitsSuccessful", "knockdowns",
                        "protectionPassage", "hitsHeadTotal",
                        "hitsHeadSuccessful", "hitsBodyTotal",
                        "hitsBodySuccessful", "hitsLegsTotal",
                        "hitsLegsSuccessful",
                        "accentedHitsPositionDistanceTotal",
                        "accentedHitsPositionDistanceSuccessful",
                        "accentedHitsPositionClinchTotal",
                        "accentedHitsPositionClinchSuccessful",
                        "accentedHitsPositionParterTotal",
                        "accentedHitsPositionParterSuccessful"]

In [27]:
def get_fighter_stats_cols(fighter_stats_keys: List[str] = fighter_stats_keys) -> List[str]:
    """
    Get list of fight stats column names for each fighter.
    :return: List of column names with 'f1_' prefix
    for the first fighter and 'f2_' prefix for the second.
    """
    fighter_attack_stats_cols = []

    for i in range(1, 3):
        for k in fighter_stats_keys:
            fighter_attack_stats_cols.append(f"f{i}_{k}")

    fighter_def_stats_cols = []
    for i in range(1, 3):
        for k in fighter_stats_keys:
            fighter_def_stats_cols.append(f"f{i}_def_{k}")

    return fighter_attack_stats_cols, fighter_def_stats_cols

In [28]:
def sum_round_stats(stats: List[Dict[str, int]],
                    fighter_stats_keys: List[str] = fighter_stats_keys) -> List[int]:
    """
    Sum stats for a fighter for all rounds of a fight.
    :param stats: List with stats from object of 'fighters' column.
    :return: Stats for all rounds for a fighter as a list.
    """
    if len(stats) == 0:
        return [np.nan for _ in range(len(fighter_stats_keys))]
    res = {k: 0 for k in fighter_stats_keys}
    for i in stats:
        for k in res:
            res[k] = i.get(k, 0)
    return list(res.values())

In [29]:
def parse_fight_data_attack(row: pd.Series,
                            fighter_stats_keys: List[str] = fighter_stats_keys) -> pd.Series:
    """
    Parse 'fighters' column.
    :param row: Row of the events dataframe.
    :return: pd.Series with stats for both fighters.
    """
    fighters = row["fighters"]
    if fighters == "[]" or fighters == np.nan:
        return pd.Series([np.nan for _ in range(len(fighter_stats_keys))])
    cols = []
    fighters = ast.literal_eval(fighters)
    if fighters[0]["fighterId"] == row["fighterId_2"]:
        fighters = reversed(fighters)
    for f in fighters:
        cols.extend(sum_round_stats(f["roundStats"]))
    return pd.Series(cols)

In [30]:
def parse_fight_data_defence(row: pd.Series,
                             fighter_stats_keys: List[str] = fighter_stats_keys) -> pd.Series:
    """
    Parse 'fighters' column.
    :param row: Row of the events dataframe.
    :return: pd.Series with stats for both fighters.
    """
    fighters = row["fighters"]
    if fighters == "[]" or fighters == np.nan:
        return pd.Series([np.nan for _ in range(len(fighter_stats_keys))])
    cols = []
    fighters = ast.literal_eval(fighters)
    if fighters[0]["fighterId"] == row["fighterId_1"]:
        fighters = reversed(fighters)
    for f in fighters:
        cols.extend(sum_round_stats(f["roundStats"]))
    return pd.Series(cols)

### Данные о бойцах

In [31]:
fighters_df = pd.read_csv(os.path.join(DATA_FOLDER, '0.fighters_raw.csv'), index_col='id')

Так как мы хотим получить модель которую можно использовать на реальных данных, необходимо исключить признаки которые содержат статистики, агрегированные на момент сбора данных чтобы исключить утечку данных. Это такие признаки, как avgFightTime, draws, knockdownsPerFight, looses, methods.*, rank и прочие.\
Также исключим признаки, которые не содержат данных, например disciplines, и признаки которые содержат так мало данных, что заполнять их пропуски бессмысленно, например признак legSwing содержится лишь у 20% бойцов.


In [32]:
fighters_df = fighters_df.loc[:, ('name', 'dateOfBirth', 'country',
                                  'city', 'timezone', 'height', 'armSpan',
                                  'weight',  'weightCategory.id')]

Рассмотрим признак country - для бойцов из США признак может содержать полное название страны "United States", сокращенное "USA", название штата напр. "Iowa" или сокращенное название штата, напр. "IL".\
Также есть странны со старыми названиями, например "Armenian SSR", "Soviet Union".\
Есть опечатки в данных, например "Floirda"\
Исправим эти данные

In [33]:
countries_usa = {np.nan, 'United States', 'IL', 'Los Angeles', 'Califorina',
                 'Saint Louis', 'Floirda', 'Alaska', 'Alabama', 'Arkansas',
                 'American Samoa', 'Arizona', 'California', 'Colorado',
                 'Connecticut', 'District ', 'of Columbia', 'Delaware',
                 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Iowa', 'Idaho',
                 'Illinois', 'Indiana', 'Kansas', 'Kentucky', 'Louisiana',
                 'Massachusetts', 'Maryland', 'Maine', 'Michigan', 'Minnesota',
                 'Missouri', 'Mississippi', 'Montana', 'North Carolina',
                 'North Dakota', 'Nebraska', 'New Hampshire', 'New Jersey',
                 'New Mexico', 'Nevada', 'New York', 'Ohio', 'Oklahoma',
                 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
                 'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
                 'Utah', 'Virginia', 'Virgin Islands', 'Vermont', 'Washington',
                 'Wisconsin', 'West Virginia', 'Wyoming'}
countries_russia = {'Soviet Union', 'Rostov-On-Don'}
countries_congo = {'Democratic Republic of Congo', 'Republic of Congo', 'Democratic Republic Of The Congo'}
countries_uk = {'England', 'Scotland', 'Wales', 'Northern Ireland'}

countries_dict = {country: 'USA' for country in countries_usa}
countries_dict = {**countries_dict, **{country: 'Russia' for country in countries_russia}}
countries_dict = {**countries_dict, **{country: 'Congo' for country in countries_congo}}
countries_dict = {**countries_dict, **{country: 'United Kingdom' for country in countries_uk}}
countries_dict['Espirito Santo Brazil'] = 'Brazil'
countries_dict['Taiwain'] = 'Taiwan'
countries_dict['Bosnia'] = 'Bosnia Herzegovina'
countries_dict['Armenian SSR'] = 'Armenia'
countries_dict['Ukrainian SSR'] = 'Ukraine'
countries_dict['México'] = 'Mexico'
countries_dict['Holland'] = 'Netherlands'

In [34]:
fighters_df['country'] = fighters_df['country'].replace(countries_dict)

Рассмотрим рост и размах рук. Везде где пропущен рост пропущен и размах рук, заполнять рост размахом рук не имеет смысла. Заполним рост средним, т.к. там меньше пропусков и удалим размах рук, т.к. признаки коллинеарны, но сначала избавимся от выбросов посчитав Z-score для каждого значения

In [35]:
zscore_height = stats.zscore(fighters_df['height'], nan_policy='omit')
zscore_height = np.abs(zscore_height)
fighters_df[zscore_height > 3]

Unnamed: 0_level_0,name,dateOfBirth,country,city,timezone,height,armSpan,weight,weightCategory.id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
536,Stefan Struve,1988-02-18,Netherlands,Beverwijk,Europe/Amsterdam,210.82,213.36,120.2,9
2163,Hong Man Choi,1980-10-30,South Korea,Jeju,Asia/Seoul,218.44,,149.69,14
2553,Nathan Jones,1970-08-21,United Kingdom,,,210.82,,156.49,14
2955,Cory Peterson,1970-01-01,USA,,,210.82,,181.44,14
3141,Paulo Cesar Silva,1963-07-21,USA,,,226.06,,174.63,14
3414,Joe Solecki,,USA,,,445.26,458.47,,14


Видим 2 выброса - заменим эти данные на настоящий рост бойцов

In [36]:
fighters_df.loc[3141, 'height'] = 218.0
fighters_df.loc[3414, 'height'] = 175.26

Теперь можно заполнить пропуски

In [37]:
fighters_df = fighters_df.drop('armSpan', axis=1)

mean_height = fighters_df['height'].mean()
fighters_df['height'] = fighters_df['height'].fillna(mean_height)

Теперь рассмотри дату рождения. Преобразуем ее в datetime и заполним пропуски медианой года рождения

In [38]:
fighters_df['dateOfBirth'] = pd.to_datetime(fighters_df['dateOfBirth'])

In [39]:
median_dob = fighters_df['dateOfBirth'].dt.year.median()
median_dob = int(median_dob)
median_dob = datetime(year=1982, month=1, day=1)

fighters_df['dateOfBirth'] = fighters_df['dateOfBirth'].fillna(median_dob)

Удалим признак city, т.к. он заполнен только у 37% строк

In [40]:
fighters_df = fighters_df.drop('city', axis=1)

Заполним пропуски признака timezone временной зоной страны

In [41]:
timezones = {'USA': 'America/New_York',
'United Kingdom': 'Europe/London',
'Congo': 'Africa/Brazzaville',
'Bosnia Herzegovina': 'Europe/Paris',
'Netherlands': 'Europe/Amsterdam',
'Brazil': 'America/Sao_Paulo',
'Armenia': 'Asia/Yerevan',
'Taiwan': 'Asia/Taipei',
'Russia': 'Europe/Moscow',
'Ukraine': 'Europe/Kiev',
'Western Samoa': 'Asia/Riyadh'}

In [42]:
mask = fighters_df['timezone'].isna()
fighters_df.loc[mask, 'timezone'] = fighters_df[mask]['country'].apply(lambda x:timezones[x])

Рассмотрим вес, сначала изучим выбросы

In [43]:
zscore_weight = stats.zscore(fighters_df['weight'], nan_policy='omit')
zscore_weight = np.abs(zscore_weight)
fighters_df[zscore_weight > 5]

Unnamed: 0_level_0,name,dateOfBirth,country,timezone,height,weight,weightCategory.id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1876,Thomas Ramirez,1970-01-01,USA,America/Puerto_Rico,185.42,185.97,14
1893,John Matua,1970-01-01,USA,America/New_York,187.96,181.44,14
1909,Emmanuel Yarborough,1960-09-05,USA,America/New_York,203.2,349.27,14
1925,Teila Tuli,1969-06-14,USA,America/New_York,182.88,195.04,14
2607,Mitsuharu Kitao,1970-01-01,USA,America/New_York,200.66,176.9,14
2707,Alexandru Lungu,1974-09-03,Romania,Europe/Bucharest,182.88,174.63,14
2736,Wagner da Conceicao Martins,1978-05-19,USA,America/New_York,200.66,176.9,14
2955,Cory Peterson,1970-01-01,USA,America/New_York,210.82,181.44,14
3141,Paulo Cesar Silva,1963-07-21,USA,America/New_York,218.0,174.63,14


Заменим вес выброса на реальный

In [44]:
fighters_df.loc[1909, 'weight'] = 270

Т.к. у нас везде есть признака весовой категории, заполним пропуски веса средним весом для каждой категории

In [45]:
avg_weight_in_category = fighters_df.groupby(by="weightCategory.id").mean()['weight'].to_dict()

mask = fighters_df['weight'].isna()
fighters_df.loc[mask, 'weight'] = fighters_df[mask]['weightCategory.id'].apply(lambda x:avg_weight_in_category[x])

Данные о бойцах готовы!

In [46]:
fighters_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3669 entries, 1 to 3677
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   name               3669 non-null   object        
 1   dateOfBirth        3669 non-null   datetime64[ns]
 2   country            3669 non-null   object        
 3   timezone           3669 non-null   object        
 4   height             3669 non-null   float64       
 5   weight             3669 non-null   float64       
 6   weightCategory.id  3669 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 389.3+ KB


### Данные о боях

In [47]:
events_df = pd.read_csv(os.path.join(DATA_FOLDER, '0.events_raw.csv'), index_col='id')
events_df['eventDate.date'] = pd.to_datetime(events_df['eventDate.date'])

Удаляем лишние признаки

In [48]:
events_df = events_df.drop(['Unnamed: 0', 'link', 'name', 'eventDate.timezone' ,'eventDate.timezone_type', 'weightCategory.name'], axis=1)

Создадим бинарный признак победителя

In [49]:
events_df['winner'] = (events_df['winnerId'] == events_df['fighterId_1'])

Убираем строки с незавершенными боями, боями где отсутствует winnerId, боями где winnerId не соответствует ни одному из участников

In [50]:
mask = events_df['completed'] == True
mask = mask & (~events_df['winnerId'].isna())
mask = mask & ((events_df['winnerId'] == events_df['fighterId_1']) \
               | (events_df['winnerId'] == events_df['fighterId_2']))

events_df = events_df[mask]
events_df = events_df.drop(['winnerId', 'completed'], axis=1)

Заполним пропуски в колонках city и country

In [51]:
mask = (~events_df['country'].isna()) & (events_df['city'].isna())
events_df[mask]

Unnamed: 0_level_0,avgOdds,city,country,duration,eventDate.date,fighterId_1,fighterId_2,fighters,rounds,timezone,weightCategory.id,winMethods,winner
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5566,[],,Brazil,325.0,1997-06-15,848,2240,"[{'fighterId': 848, 'fightStats': {'hitsTotal'...",1.0,America/Sao_Paulo,7,['KO'],True
6211,[],,Brazil,30.0,1997-06-15,848,3162,"[{'fighterId': 848, 'fightStats': {'hitsTotal'...",1.0,America/Sao_Paulo,7,['SUB'],True
5766,[],,Brazil,74.0,2005-12-03,1934,2436,"[{'fighterId': 1934, 'fightStats': {'hitsTotal...",2.0,America/Sao_Paulo,8,['KO'],True


Есть 3 записи в которых есть country но нету city, заполним их вручную

In [52]:
events_df.loc[mask, 'city'] = 'Sao Paulo'

Заполним остальные города модой, и страной которая соответствует этому городу

In [53]:
city = events_df['city'].mode()
country = 'USA'

mask = (events_df['country'].isna()) & (events_df['city'].isna())
events_df.loc[mask, ('country', 'city')] = np.array([country, city], dtype='object')

В некоторых записях содержатся выбросы которые не соответствуют городу, например "UFC 253". В этих же записях значение страны указано "---". Заполним эти выбросы

In [54]:
mask = events_df['city'].apply(lambda x:x.split(' ')[0]) == 'UFC'
events_df.loc[mask, ('country', 'city')] = np.array([country, city], dtype='object')

Заполним пропуски признаков duration, rounds средними значениями

In [55]:
duration = events_df['duration'].mean()
rounds = events_df['rounds'].mean()

mask = events_df['duration'].isna()
events_df.loc[mask, 'duration'] = duration

mask = events_df['rounds'].isna()
events_df.loc[mask, 'rounds'] = rounds

Заполним пропуски признака timezone

In [56]:
timezones = {'Las Vegas': 'America/Los_Angeles'}

mask = events_df['timezone'].isna()
events_df.loc[mask, 'timezone'] = events_df[mask]['city'].apply(lambda x:timezones[x])

Извлечем вложенные признаки из признака avgOdds

In [57]:
events_df[["f1_odds", "f2_odds"]] = events_df[["avgOdds", "fighterId_1",
                                                "fighterId_2"]]\
                                                .apply(lambda row: parse_odds(row), axis=1)

В старых записях отсутствуют коэффициенты букмекера, заполним их значением по умолчанию

In [58]:
events_df.loc[:, ('f1_odds', 'f2_odds')] = events_df[['f1_odds', 'f2_odds']].fillna(1)

Извлечем вложенные признаки из признака fighters

In [59]:
fighter_attack_stats_cols, fighter_def_stats_cols = get_fighter_stats_cols()

events_df[fighter_attack_stats_cols] = events_df[
        ["fighters", "fighterId_1", "fighterId_2"]
    ].apply(lambda row: parse_fight_data_attack(row), axis=1)

events_df[fighter_def_stats_cols] = events_df[
        ["fighters", "fighterId_1", "fighterId_2"]
    ].apply(lambda row: parse_fight_data_defence(row), axis=1)

In [60]:
events_df = events_df.drop(['avgOdds', 'fighters'], axis=1)

В небольшом количестве записей отсутствует полная статистика боев - исключим их из данных

In [61]:
mask = ~(events_df['f1_protectionPassage'].isna() | events_df['f1_def_hitsBodyTotal'].isna())
events_df = events_df[mask]

Все пропуски заполнены, можно создавать новые признаки

### Объединяем, создаем признаки

In [62]:
events_df = events_df.merge(fighters_df.add_prefix('f1_'), how='left',
                            left_on='fighterId_1', right_index=True)
events_df = events_df.merge(fighters_df.add_prefix('f2_'), how='left',
                            left_on='fighterId_2', right_index=True)

Создадим признаки возраста на момент боя

In [63]:
events_df['f1_age'] = ((events_df['eventDate.date'] \
  - events_df['f1_dateOfBirth']) \
 / np.timedelta64(1, 'Y')).astype(int)

events_df['f2_age'] = ((events_df['eventDate.date'] \
  - events_df['f2_dateOfBirth']) \
 / np.timedelta64(1, 'Y')).astype(int)

Создадим признаки isHomeTimezone, isHomeCountry

In [64]:
events_df['f1_isHomeTimezone'] = (events_df['f1_timezone'] == events_df['timezone']).apply(int)
events_df['f2_isHomeTimezone'] = (events_df['f2_timezone'] == events_df['timezone']).apply(int)

events_df['f1_isHomeCountry'] = (events_df['f1_country'] == events_df['country']).apply(int)
events_df['f2_isHomeCountry'] = (events_df['f2_country'] == events_df['country']).apply(int)

Считаем накопительную статистику

In [65]:
events_df['f1_winner'] = events_df['winner'].apply(int)
events_df['f2_winner'] = ~events_df['winner'].apply(int)

Используем one hot encoding для признаков метода победы

In [66]:
events_df = pd.get_dummies(events_df, columns = ['winMethods'], dtype=int)

events_df['f1_winMethods_DEC'] = np.where(events_df['f1_winner'], events_df['winMethods_[\'DEC\']'], 0)
events_df['f1_winMethods_DQ'] = np.where(events_df['f1_winner'], events_df['winMethods_[\'DQ\']'], 0)
events_df['f1_winMethods_KO'] = np.where(events_df['f1_winner'], events_df['winMethods_[\'KO\']'], 0)
events_df['f1_winMethods_SUB'] = np.where(events_df['f1_winner'], events_df['winMethods_[\'SUB\']'], 0)

events_df['f2_winMethods_DEC'] = np.where(events_df['f2_winner'], events_df['winMethods_[\'DEC\']'], 0)
events_df['f2_winMethods_DQ'] = np.where(events_df['f2_winner'], events_df['winMethods_[\'DQ\']'], 0)
events_df['f2_winMethods_KO'] = np.where(events_df['f2_winner'], events_df['winMethods_[\'KO\']'], 0)
events_df['f2_winMethods_SUB'] = np.where(events_df['f2_winner'], events_df['winMethods_[\'SUB\']'], 0)

events_df = events_df.drop(["winMethods_['DEC']", "winMethods_['DQ']",
                            "winMethods_['KO']", "winMethods_['SUB']",
                            'winMethods_[]'], axis=1)

win_methods = ['winMethods_DEC', 'winMethods_DQ', 'winMethods_KO', 'winMethods_SUB']

Сохраним названия колонок для аггрегации

In [67]:
agg_columns = fighter_stats_keys \
            + [f'def_{i}' for i in fighter_stats_keys] \
            + ['winner'] \
            + win_methods

f1_agg_columns = [f'f1_{i}' for i in agg_columns]
f2_agg_columns = [f'f2_{i}' for i in agg_columns]

Будем аггрегировать общее время проведенное в боях для каждого бойца

In [68]:
events_df['f1_duration'] = 0
events_df['f2_duration'] = 0

Соберем данные по каждому бойцу из таблицы с боями

In [69]:
fighter_to_agg_dict = {'fighterId_1': 'fighterId', 'fighterId_2': 'fighterId'}
fighter_to_agg_dict = {**fighter_to_agg_dict, \
                       **dict(zip(f1_agg_columns, agg_columns)),\
                       **dict(zip(f2_agg_columns, agg_columns))}

player_stats_agg_f1 = events_df.loc[:, ['eventDate.date', 'fighterId_1', 'duration'] + f1_agg_columns]
player_stats_agg_f1 = player_stats_agg_f1.rename(fighter_to_agg_dict, axis=1)

player_stats_agg_f2 = events_df.loc[:, ['eventDate.date', 'fighterId_2', 'duration'] + f2_agg_columns]
player_stats_agg_f2 = player_stats_agg_f2.rename(fighter_to_agg_dict, axis=1)

player_stats_agg = player_stats_agg_f1.append(player_stats_agg_f2, ignore_index=True)
player_stats_agg.sample(5)

Unnamed: 0,eventDate.date,fighterId,duration,hitsTotal,hitsSuccessful,takedownTotal,takedownSuccessful,submissionAttempts,takeovers,accentedHitsTotal,...,def_accentedHitsPositionDistanceSuccessful,def_accentedHitsPositionClinchTotal,def_accentedHitsPositionClinchSuccessful,def_accentedHitsPositionParterTotal,def_accentedHitsPositionParterSuccessful,winner,winMethods_DEC,winMethods_DQ,winMethods_KO,winMethods_SUB
9236,2011-01-29,3312,193.0,66.0,47.0,1.0,1.0,0.0,0.0,26.0,...,2.0,0.0,0.0,0.0,0.0,-1,0,0,1,0
11528,2016-07-24,876,300.0,30.0,8.0,0.0,0.0,0.0,0.0,30.0,...,36.0,3.0,1.0,0.0,0.0,-2,1,0,0,0
9549,2011-12-03,812,214.0,24.0,4.0,2.0,0.0,0.0,0.0,24.0,...,19.0,0.0,0.0,30.0,21.0,-2,0,0,1,0
11321,2016-02-06,721,137.0,3.0,1.0,1.0,0.0,0.0,0.0,3.0,...,1.0,0.0,0.0,26.0,13.0,-2,0,0,1,0
11179,2015-10-24,868,300.0,38.0,15.0,1.0,0.0,0.0,0.0,32.0,...,7.0,4.0,3.0,0.0,0.0,-2,1,0,0,0


In [70]:
player_stats_agg = player_stats_agg.sort_values('eventDate.date')

Аггрегируем данные

In [71]:
player_stats_agg = player_stats_agg.groupby(['fighterId', 'eventDate.date']).sum().groupby(level=0).cumsum()
player_stats_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,hitsTotal,hitsSuccessful,takedownTotal,takedownSuccessful,submissionAttempts,takeovers,accentedHitsTotal,accentedHitsSuccessful,knockdowns,...,def_accentedHitsPositionDistanceSuccessful,def_accentedHitsPositionClinchTotal,def_accentedHitsPositionClinchSuccessful,def_accentedHitsPositionParterTotal,def_accentedHitsPositionParterSuccessful,winner,winMethods_DEC,winMethods_DQ,winMethods_KO,winMethods_SUB
fighterId,eventDate.date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2019-10-19 04:00:00,300.0,51.0,35.0,0.0,0.0,0.0,0.0,51.0,35.0,0.0,...,13.0,0.0,0.0,0.0,0.0,1,0,0,0,0
1,2019-12-21 00:00:00,600.0,88.0,48.0,0.0,0.0,0.0,0.0,88.0,48.0,0.0,...,33.0,1.0,1.0,0.0,0.0,1,0,0,0,0
1,2020-06-27 00:00:00,761.0,110.0,63.0,0.0,0.0,0.0,0.0,110.0,63.0,0.0,...,39.0,1.0,1.0,0.0,0.0,2,0,0,1,0
1,2020-07-25 00:00:00,917.0,146.0,91.0,0.0,0.0,0.0,0.0,146.0,91.0,0.0,...,46.0,2.0,1.0,0.0,0.0,3,0,0,2,0
1,2020-11-07 00:00:00,1217.0,201.0,120.0,0.0,0.0,0.0,0.0,201.0,120.0,0.0,...,60.0,3.0,2.0,0.0,0.0,3,0,0,2,0


Т.к. нам нужна статистика, которая не включает текущий бой, сместим данные на единицу

In [72]:
player_stats_agg = player_stats_agg.groupby(level=0).shift().fillna(0)
player_stats_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,hitsTotal,hitsSuccessful,takedownTotal,takedownSuccessful,submissionAttempts,takeovers,accentedHitsTotal,accentedHitsSuccessful,knockdowns,...,def_accentedHitsPositionDistanceSuccessful,def_accentedHitsPositionClinchTotal,def_accentedHitsPositionClinchSuccessful,def_accentedHitsPositionParterTotal,def_accentedHitsPositionParterSuccessful,winner,winMethods_DEC,winMethods_DQ,winMethods_KO,winMethods_SUB
fighterId,eventDate.date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2019-10-19 04:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-12-21 00:00:00,300.0,51.0,35.0,0.0,0.0,0.0,0.0,51.0,35.0,0.0,...,13.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2020-06-27 00:00:00,600.0,88.0,48.0,0.0,0.0,0.0,0.0,88.0,48.0,0.0,...,33.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2020-07-25 00:00:00,761.0,110.0,63.0,0.0,0.0,0.0,0.0,110.0,63.0,0.0,...,39.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0
1,2020-11-07 00:00:00,917.0,146.0,91.0,0.0,0.0,0.0,0.0,146.0,91.0,0.0,...,46.0,2.0,1.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0


Создадим вспомогательную функцию чтобы заполнить данные в таблице о боях нашими аггрегированными статистиками

In [73]:
def fill_with_agg(row):
    key = (row['fighterId_1'], row['eventDate.date'])
    values = player_stats_agg.loc[key]
    values = values.add_prefix('f1_')
    row.loc[values.index] = values

    key = (row['fighterId_2'], row['eventDate.date'])
    values = player_stats_agg.loc[key]
    values = values.add_prefix('f2_')
    row.loc[values.index] = values

    return row

In [74]:
events_df = events_df.apply(fill_with_agg, axis=1)
events_df.tail(5)

Unnamed: 0_level_0,city,country,duration,eventDate.date,fighterId_1,fighterId_2,rounds,timezone,weightCategory.id,winner,...,f1_winMethods_DEC,f1_winMethods_DQ,f1_winMethods_KO,f1_winMethods_SUB,f2_winMethods_DEC,f2_winMethods_DQ,f2_winMethods_KO,f2_winMethods_SUB,f1_duration,f2_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
40497,Las Vegas,USA,300.0,2021-02-27,421,668,3.0,America/Los_Angeles,5,False,...,2.0,0.0,4.0,0.0,3.0,0.0,0.0,1.0,1236.0,1225.0
40498,Las Vegas,USA,158.0,2021-02-27,3504,3521,3.0,America/Los_Angeles,3,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40500,Las Vegas,USA,300.0,2021-02-27,1334,3463,3.0,America/Los_Angeles,8,True,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,526.0,298.0
40452,Las Vegas,USA,208.0,2021-03-06,246,2073,3.0,America/Los_Angeles,6,False,...,5.0,0.0,2.0,6.0,2.0,0.0,0.0,1.0,3475.0,707.0
40453,Las Vegas,USA,295.0,2021-03-06,628,687,1.0,America/Los_Angeles,2,False,...,2.0,0.0,1.0,0.0,5.0,0.0,0.0,0.0,776.0,1548.0


Создадим новый признак - число боев для каждого бойца

In [75]:
fights = player_stats_agg.groupby(level=0).count()
fights = fights.rename({'hitsTotal': 'n_fights'}, axis=1)['n_fights']
fights = pd.DataFrame(fights)

In [76]:
events_df = events_df.merge(fights.add_prefix('f1_'), how='left', left_on='fighterId_1', right_index=True)
events_df = events_df.merge(fights.add_prefix('f2_'), how='left', left_on='fighterId_2', right_index=True)

Создаем новые признаки

In [77]:
features = [('winMethods_KO', 'n_fights', 'win_ko_per_fight'),
            ('winMethods_SUB', 'n_fights', 'win_sub_per_fight'),
            ('winMethods_DEC', 'n_fights', 'win_dec_per_fight'),

            ('hitsSuccessful', 'hitsTotal', 'hits_acc'),
            ('accentedHitsSuccessful', 'accentedHitsTotal', 'accented_hits_acc'),
            ('takedownSuccessful', 'takedownTotal', 'takedown_acc'),
            ('hitsLegsSuccessful', 'hitsLegsTotal', 'hit_legs_acc'),
            ('hitsBodySuccessful', 'hitsBodyTotal', 'hit_body_acc'),
            ('hitsHeadSuccessful', 'hitsHeadTotal', 'hit_head_acc'),
            ('takeovers', 'n_fights', 'takeovers_per_fight'),

            ('hitsTotal', 'duration', 'hits_per_min'),
            ('accentedHitsTotal', 'duration', 'accented_hits_per_min'),
            ('submissionAttempts', 'duration', 'submission_attempts_per_min'),
            
            ('duration', 'n_fights', 'avg_fight_time')]

for feature in features:
    for pre in ['f1_', 'f2_']:
        events_df[f'{pre}{feature[2]}'] = events_df[f'{pre}{feature[0]}'] / events_df[f'{pre}{feature[1]}']
        events_df[f'{pre}{feature[2]}'] = events_df[f'{pre}{feature[2]}'].fillna(0)

Финальный список фич

In [78]:
features_final = ['eventDate.date', 'duration', 'rounds', 'winner', 'f1_odds',
                  'f2_odds', 'f1_height', 'f1_weight', 'f2_height', 'f2_weight',
                  'f1_age', 'f2_age', 'f1_isHomeTimezone', 'f2_isHomeTimezone',
                  'f1_isHomeCountry', 'f2_isHomeCountry', 'f1_n_fights',
                  'f2_n_fights', 'f1_win_ko_per_fight', 'f2_win_ko_per_fight',
                  'f1_win_sub_per_fight', 'f2_win_sub_per_fight',
                  'f1_win_dec_per_fight', 'f2_win_dec_per_fight',
                  'f1_hits_acc', 'f2_hits_acc', 'f1_accented_hits_acc',
                  'f2_accented_hits_acc', 'f1_takedown_acc', 'f2_takedown_acc',
                  'f1_hit_legs_acc', 'f2_hit_legs_acc', 'f1_hit_body_acc',
                  'f2_hit_body_acc', 'f1_hit_head_acc', 'f2_hit_head_acc',
                  'f1_takeovers_per_fight', 'f2_takeovers_per_fight',
                  'f1_hits_per_min', 'f2_hits_per_min', 'f1_accented_hits_per_min',
                  'f2_accented_hits_per_min', 'f1_submission_attempts_per_min',
                  'f2_submission_attempts_per_min', 'f1_avg_fight_time', 
                  'f2_avg_fight_time']

events_df = events_df.loc[:, features_final]

### Сохраним в формате pickle

In [80]:
path = os.path.join(DATA_FOLDER, 'events_df.bin')
with open(path, 'wb') as f:
    pickle.dump(events_df, f)