In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

pd.set_option('display.max_columns', 100)

from data_prep_functions import *

In [19]:
# Load the game-data CSV; fields in this file are separated by ";".
base = pd.read_csv(
    "./EPL_data_sofa_score/game_data_EPL_18_19.csv",
    sep=";",
)

In [20]:
# Parse the "data" column from "YYYY-MM-DD" strings into pandas datetimes.
base["data"] = pd.to_datetime(
    base["data"],
    format="%Y-%m-%d",
)

## Preparação das Variáveis

In [21]:
def str_percentage_to_float(x):
    """Convert a percentage string such as ``"45%"`` into a fraction (``0.45``).

    Every ``%`` character is removed from ``x``; what remains is parsed with
    ``float`` and divided by 100.
    """
    without_sign = x.replace("%", "")
    fraction = float(without_sign) / 100
    return fraction


def split_statistics(base, coluna, new_name, drop_old=True):
    """Expand a textual "count (pct%)" statistic into numeric columns, in place.

    For each side (``away`` first, then ``home``) the column
    ``coluna + "_" + side`` is read. Every cell must be a string made of two
    space-separated parts, e.g. ``"3/8 (38%)"`` or ``"374 (79%)"``:

    * If the first part contains ``/`` (decided from the first row of that
      side only), the number before the slash is stored in
      ``Completed_<new_name>_<side>`` and the number after it in
      ``Attempted_<new_name>_<side>``.
    * Otherwise the first part is stored as an integer in
      ``Total_<new_name>_<side>``.

    In both cases the second part, stripped of its parentheses, is converted
    with ``str_percentage_to_float`` and stored in
    ``<new_name>_accuracy_<side>``.

    Parameters
    ----------
    base : pandas.DataFrame
        Frame holding the source columns. It is modified in place.
    coluna : str
        Source statistic name without the ``_away`` / ``_home`` suffix.
    new_name : str
        Label used to build the names of the generated columns.
    drop_old : bool, default True
        When true, both source columns are dropped from ``base``.

    Returns
    -------
    None
        All results are written into ``base``.
    """
    # Away is processed before home so the generated columns keep the same
    # order as when the two sides were handled by separate copies of this code.
    for side in ("away", "home"):
        source_col = coluna + "_" + side
        suffix = new_name + "_" + side

        parts = [cell.split(" ") for cell in base[source_col]]
        counts = [p[0] for p in parts]

        # A "/" means the cell holds "completed/attempted": split it in two
        # columns. The `counts and` guard keeps an empty frame from raising.
        if counts and "/" in counts[0]:
            base["Attempted_" + suffix] = [int(c.split("/")[1]) for c in counts]
            base["Completed_" + suffix] = [int(c.split("/")[0]) for c in counts]
        else:
            # Only a total is present.
            base["Total_" + suffix] = counts
            base["Total_" + suffix] = base["Total_" + suffix].astype(int)

        accuracy_col = new_name + "_accuracy_" + side
        base[accuracy_col] = [p[1].replace("(", "").replace(")", "") for p in parts]
        base[accuracy_col] = base[accuracy_col].apply(str_percentage_to_float)

    if drop_old:
        base.drop([coluna + "_away", coluna + "_home"], axis=1, inplace=True)

In [22]:
# (source statistic, label for the generated columns), expanded in this order.
for stat_column, stat_label in [
    ("Accurate passes", "Passes"),
    ("Crosses", "Crosses"),
    ("Dribbles", "Dribbles"),
    ("Long balls", "Long_balls"),
    ("Tackles", "Tackles"),
]:
    split_statistics(base, stat_column, stat_label)

In [23]:
# Turn the possession percentage strings of both sides into fractions.
for possession_col in ("Ball possession_away", "Ball possession_home"):
    base[possession_col] = base[possession_col].apply(str_percentage_to_float)

In [24]:
# Replace every missing value in the frame with 0.
base = base.fillna(0)

## Cria Features de resposta

In [25]:
# 0/1 outcome flags derived from "result": 1 -> home win, -1 -> away win, 0 -> draw.
base["fl_home_win"] = (base["result"] == 1) * 1
base["fl_away_win"] = (base["result"] == -1) * 1
base["fl_draw"] = (base["result"] == 0) * 1

## Cria Features de dias

In [26]:
# For each match, call get_days_from_last_game with the match date and the
# team playing on that side; home is filled in before away.
for side in ("home", "away"):
    base["DAYS_FROM_LAST_GAME_" + side] = [
        get_days_from_last_game(base, game.data, getattr(game, "team_" + side))
        for game in base.itertuples()
    ]

## Features baseadas na pressão por minuto

In [27]:
def pos_neg_counts(a):
    """Return the lengths of consecutive positive and non-positive runs of ``a``.

    A value belongs to a "positive" run when it is ``> 0``; everything else
    (negative values and zeros) belongs to a "negative" run.

    Parameters
    ----------
    a : array-like
        One-dimensional sequence of numbers.

    Returns
    -------
    (pos_counts, neg_counts) : tuple of numpy.ndarray
        Run lengths, in order of appearance, of the positive runs and of the
        non-positive runs. Either array is empty when ``a`` contains no run
        of that kind.
    """
    a = np.asarray(a)
    if a.size == 0:
        return np.array([], dtype=int), np.array([], dtype=int)

    mask = a > 0
    # Positions where the sign class changes between neighbouring elements.
    idx = np.flatnonzero(mask[1:] != mask[:-1])
    # Run boundaries: start of the array, one past each change, end of the array.
    # This form also covers the case with no change at all (a single run).
    bounds = np.concatenate(([0], idx + 1, [a.size]))
    count = np.diff(bounds)

    # Runs alternate, so the class of the first element tells which half is
    # which. Use the mask (not `a[0] < 0`) so a leading zero is classified the
    # same way as it was when the runs were built.
    if mask[0]:
        return count[::2], count[1::2]  # pos, neg counts
    return count[1::2], count[::2]  # pos, neg counts

In [28]:
def cria_features_form_minute(base, i):
    """Build per-match features from the ``form_minute`` columns of row ``i``.

    Every column of ``base`` whose name contains ``"form_minute"`` is read for
    the row at position ``i``. Positive values are treated as home dominance
    and negative values (sign flipped) as away dominance.

    Parameters
    ----------
    base : pandas.DataFrame
        Frame containing the ``form_minute`` columns.
    i : int
        Row position (as used by ``iloc``) of the match.

    Returns
    -------
    dict
        For each side: ``minutes_dominant_*``, ``total_dominance_*``,
        ``avg_dominance_*``, ``max_dominance_*``, ``min_dominance_*``,
        ``std_dominance_*``, ``max_minutes_sequence_dominant_*`` and
        ``std_minutes_sequence_dominant_*``; plus ``minutes_draw``.
    """
    minute_cols = [name for name in base.columns if "form_minute" in name]
    row = base[minute_cols].iloc[i]

    features = {}

    # Dominance: home is the positive part, away the negative part made positive.
    dominance_by_side = (
        ("home", row[row > 0]),
        ("away", row[row < 0] * -1),
    )
    for side, dominance in dominance_by_side:
        features["minutes_dominant_" + side] = len(dominance)
        features["total_dominance_" + side] = dominance.sum()
        features["avg_dominance_" + side] = dominance.mean()
        features["max_dominance_" + side] = dominance.max()
        features["min_dominance_" + side] = dominance.min()
        features["std_dominance_" + side] = dominance.std()

    # Minutes sequence: lengths of the consecutive runs reported by pos_neg_counts.
    home_runs, away_runs = pos_neg_counts(np.array(row))
    for side, runs in (("home", home_runs), ("away", away_runs)):
        features["max_minutes_sequence_dominant_" + side] = max(runs)
        features["std_minutes_sequence_dominant_" + side] = np.std(runs)

    # Comparing teams: minutes in which the value is exactly zero.
    features["minutes_draw"] = len(row[row == 0])

    return features

In [29]:
# Build one feature dict per match, in row order.
form_minute_records = []
for i in tqdm_notebook(range(len(base))):
    form_minute_records.append(cria_features_form_minute(base, i))

# The records are positional, while pd.concat(axis=1) aligns on index labels.
# Reusing base's index keeps every feature row attached to its own match even
# if base does not carry a default 0..n-1 index.
temp = pd.DataFrame(form_minute_records, index=base.index)
base = pd.concat([base, temp], axis=1)

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))




## Estatísticas Médias Últimos 5 Jogos

In [30]:
# Column names tagged per side, selected by substring match.
home_columns = list(filter(lambda name: "_home" in name, base.columns))
away_columns = list(filter(lambda name: "_away" in name, base.columns))

In [31]:
# Reference date: the "data" value of the row at position 80.
data_ref = base["data"].iat[80]

In [32]:
# get_last_games is not defined in this notebook (presumably it comes from the
# data_prep_functions star import). Judging by the arguments, this selects the
# n=5 latest "LIV" matches relative to data_ref — TODO confirm against the helper.
last_games = get_last_games(base, data_ref, "LIV", n = 5, filter="all", verbose=False)

In [33]:
# Display the selected matches for visual inspection.
last_games

Unnamed: 0,game,Aerials won_away,Aerials won_home,Ball possession_away,Ball possession_home,Big chances missed_away,Big chances missed_home,Big chances_away,Big chances_home,Blocked shots_away,Blocked shots_home,Clearances_away,Clearances_home,Corner kicks_away,Corner kicks_home,Counter attacks_away,Counter attacks_home,Dispossessed_away,Dispossessed_home,Duels won_away,Duels won_home,Fouls_away,Fouls_home,Goalkeeper saves_away,Goalkeeper saves_home,Hit woodwork_away,Hit woodwork_home,Interceptions_away,Interceptions_home,Offsides_away,Offsides_home,Passes_away,Passes_home,Red cards_away,Red cards_home,Shots inside box_away,Shots inside box_home,Shots off target_away,Shots off target_home,Shots on target_away,Shots on target_home,Shots outside box_away,Shots outside box_home,Total shots_away,Total shots_home,Yellow cards_away,Yellow cards_home,away_score,data,form_minute_1,...,Total_Passes_away,Passes_accuracy_away,Total_Passes_home,Passes_accuracy_home,Attempted_Crosses_away,Completed_Crosses_away,Crosses_accuracy_away,Attempted_Crosses_home,Completed_Crosses_home,Crosses_accuracy_home,Attempted_Dribbles_away,Completed_Dribbles_away,Dribbles_accuracy_away,Attempted_Dribbles_home,Completed_Dribbles_home,Dribbles_accuracy_home,Attempted_Long_balls_away,Completed_Long_balls_away,Long_balls_accuracy_away,Attempted_Long_balls_home,Completed_Long_balls_home,Long_balls_accuracy_home,Attempted_Tackles_away,Completed_Tackles_away,Tackles_accuracy_away,Attempted_Tackles_home,Completed_Tackles_home,Tackles_accuracy_home,fl_home_win,fl_away_win,fl_draw,DAYS_FROM_LAST_GAME_home,DAYS_FROM_LAST_GAME_away,avg_dominance_away,avg_dominance_home,max_dominance_away,max_dominance_home,max_minutes_sequence_dominant_away,max_minutes_sequence_dominant_home,min_dominance_away,min_dominance_home,minutes_dominant_away,minutes_dominant_home,minutes_draw,std_dominance_away,std_dominance_home,std_minutes_sequence_dominant_away,std_minutes_sequence_dominant_home,total_dominance_away,total_domina
nce_home
30,LEI X LIV 2018-09-01,17,17,0.49,0.51,2.0,1.0,2.0,2.0,2,4,23,27,4,4,0.0,0.0,16,13,52,63,12,9,4.0,2.0,0.0,0.0,10,6,2.0,3.0,473,496,0.0,0.0,7,3,4,3,4,5,3,9,10,12,2.0,3.0,2,2018-09-01,6,...,374,0.79,401,0.81,8,3,0.38,17,4,0.24,15,8,0.53,17,12,0.71,58,26,0.45,59,29,0.49,18,13,0.72,23,16,0.7,0,1,0,7.0,7.0,31.219512,28.490196,100.0,87.0,12,10,2.0,1.0,41,51,0,24.87118,22.754668,3.54632,3.149152,1280.0,1453.0
40,TOT X LIV 2018-09-15,10,15,0.4,0.6,2.0,0.0,3.0,0.0,2,3,21,17,4,5,0.0,0.0,6,9,47,42,16,17,2.0,8.0,2.0,1.0,6,6,2.0,3.0,356,549,0.0,0.0,14,7,5,5,10,3,3,4,17,11,0.0,0.0,2,2018-09-15,-17,...,267,0.75,443,0.81,10,2,0.2,12,2,0.17,9,6,0.67,15,7,0.47,58,25,0.43,66,30,0.45,17,11,0.65,9,4,0.44,0,1,0,13.0,14.0,29.787234,22.6,100.0,112.0,12,8,1.0,2.0,47,45,0,26.048397,23.972901,3.199482,2.307692,1400.0,1017.0
55,LIV X SOU 2018-09-22,17,9,0.4,0.6,0.0,0.0,0.0,1.0,5,3,35,12,4,5,0.0,1.0,10,12,45,41,10,7,1.0,1.0,0.0,1.0,16,7,1.0,4.0,479,750,0.0,0.0,6,9,1,5,1,4,1,3,7,12,2.0,0.0,0,2018-09-22,10,...,388,0.81,648,0.86,14,3,0.21,13,3,0.23,5,1,0.2,17,8,0.47,73,33,0.45,44,25,0.57,21,11,0.52,14,10,0.71,1,0,0,7.0,5.0,15.057143,32.105263,73.0,137.0,10,21,1.0,2.0,35,57,0,15.876255,28.294803,2.729469,6.116371,527.0,1830.0
67,CHE X LIV 2018-09-29,13,11,0.53,0.47,3.0,4.0,3.0,4.0,1,2,16,23,4,4,0.0,0.0,9,11,42,37,9,7,3.0,3.0,0.0,0.0,12,12,2.0,2.0,616,558,0.0,0.0,7,9,6,4,6,4,6,1,13,10,2.0,0.0,1,2018-09-29,11,...,516,0.84,465,0.83,20,7,0.35,12,3,0.25,11,7,0.64,8,4,0.5,75,40,0.53,51,31,0.61,15,8,0.53,13,9,0.69,0,0,1,6.0,7.0,33.030769,30.230769,100.0,87.0,13,8,1.0,1.0,65,26,1,22.865756,23.787909,4.242641,2.034426,2147.0,786.0
79,LIV X MCI 2018-10-07,17,6,0.51,0.49,1.0,0.0,1.0,0.0,1,1,25,18,6,2,0.0,0.0,11,11,50,37,10,10,2.0,2.0,0.0,0.0,13,9,5.0,2.0,528,515,0.0,0.0,5,5,3,4,2,2,1,2,6,7,3.0,1.0,0,2018-10-07,-10,...,439,0.83,430,0.83,12,1,0.08,18,4,0.22,13,7,0.54,9,4,0.44,51,29,0.57,70,39,0.56,16,11,0.69,17,11,0.65,0,0,1,8.0,8.0,15.434783,17.173913,77.0,52.0,14,13,1.0,2.0,46,46,0,15.861417,10.677712,4.079216,3.929377,710.0,790.0


In [46]:
# get_avg_last_games is not defined in this notebook (presumably it comes from
# the data_prep_functions star import); its exact semantics are not visible here.
# NOTE(review): "Total_passes" in to_drop differs in case from the
# "Total_Passes_*" columns created by split_statistics — confirm how the helper
# matches the to_drop entries.
get_avg_last_games(last_games, "LIV", home_columns, away_columns,
                       n = 5, data_ref = data_ref, rivals = False, 
                       to_drop=["fl_win", "Total_passes", "result", "Accurate passes", "hora", "game"])

Unnamed: 0,Aerials won,Ball possession,Big chances missed,Big chances,Blocked shots,Clearances,Corner kicks,Counter attacks,Dispossessed,Duels won,Fouls,Goalkeeper saves,Hit woodwork,Interceptions,Offsides,Passes,Red cards,Shots inside box,Shots off target,Shots on target,Shots outside box,Total shots,Yellow cards,away_score,form_minute_1,form_minute_10,form_minute_11,form_minute_12,form_minute_13,form_minute_14,form_minute_15,form_minute_16,form_minute_17,form_minute_18,form_minute_19,form_minute_2,form_minute_20,form_minute_21,form_minute_22,form_minute_23,form_minute_24,form_minute_25,form_minute_26,form_minute_27,form_minute_28,form_minute_29,form_minute_3,form_minute_30,form_minute_31,form_minute_32,...,Dribbles_accuracy,Attempted_Long_balls,Completed_Long_balls,Long_balls_accuracy,Attempted_Tackles,Completed_Tackles,Tackles_accuracy,fl_win,fl_draw,DAYS_FROM_LAST_GAME,avg_dominance,max_dominance,max_minutes_sequence_dominant,min_dominance,minutes_dominant,minutes_draw,std_dominance,std_minutes_sequence_dominant,total_dominance,N_WINS_HOME,N_GAMES_HOME,WIN_HOME_PCT,N_WINS_AWAY,N_GAMES_AWAY,WIN_AWAY_PCT,N_WINS_TOTAL,WIN_PCT,TOTAL_DAYS_DIFF,DAYS_DIFF_LG_STD,DAYS_DIFF_LG_MEAN,N_GAMES_L2_days,N_GAMES_L4_days,N_GAMES_L6_days,N_GAMES_L8_days,N_GAMES_L10_days,N_GAMES_AWAY_L2_days,N_GAMES_AWAY_L4_days,N_GAMES_AWAY_L6_days,N_GAMES_AWAY_L8_days,N_GAMES_AWAY_L10_days,AVG_DAYS_FROM_LG,STD_DAYS_FROM_LG,MIN_DAYS_FROM_LG,total_minutes_dominant,total_dominance.1,avg_total_minutes_dominant,avg_total_dominance,max_dominance.1,min_dominance.1,avg_dominance.1
0,11.0,0.502,1.4,1.8,1.8,18.0,3.8,0.2,10.8,43.8,10.8,2.4,0.6,8.8,2.4,542.0,0.0,8.4,4.8,5.2,3.4,11.8,1.0,1.0,0.0,21.4,-9.4,-6.4,-11.4,-19.6,-20.8,-19.4,-6.2,-5.2,-4.2,1.8,-9.6,-30.6,-11.2,2.0,16.4,12.0,26.8,32.0,30.2,11.4,0.8,-1.0,8.2,15.2,...,0.55,61.0,31.0,0.508,16.2,10.6,0.652,0.6,0.4,8.6,28.663338,97.8,14.2,1.6,51.2,0.2,22.551569,4.206838,1489.4,1.0,2.0,0.5,2.0,3.0,0.666667,3.0,0.6,36.0,3.059412,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.6,2.727636,7.0,256.0,7447.0,51.2,1489.4,97.8,1.6,97.8


## Monta a base final

In [54]:
# gera_last_N_games is not defined in this notebook (presumably it comes from
# the data_prep_functions star import); its exact semantics are not visible here.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: unsupported operand type(s) for +: 'Timestamp' and 'float'",
# raised after printing "ARS X MCI 2018-08-12 5" — the call does not currently
# complete and needs investigation inside the helper.
base_final = gera_last_N_games(base, all_games = None, N = [5],
                              to_drop=['fl_home_win', 'game', 'hora', 'result', 'data'])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  resp["WIN_HOME_PCT"] = [(resp["N_WINS_HOME"][0] / resp["N_GAMES_HOME"][0])]
  resp["WIN_AWAY_PCT"] = [(resp["N_WINS_AWAY"][0] / resp["N_GAMES_AWAY"][0])]
  resp["WIN_PCT"] = [resp["N_WINS_TOTAL"][0]/(resp["N_GAMES_AWAY"][0] + resp["N_GAMES_HOME"][0])]
  resp["WIN_HOME_PCT"] = [(resp["N_WINS_HOME"][0] / resp["N_GAMES_HOME"][0])]
  resp["WIN_AWAY_PCT"] = [(resp["N_WINS_AWAY"][0] / resp["N_GAMES_AWAY"][0])]
  resp["WIN_PCT"] = [resp["N_WINS_TOTAL"][0]/(resp["N_GAMES_AWAY"][0] + resp["N_GAMES_HOME"][0])]
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


ARS X MCI 2018-08-12 5


TypeError: unsupported operand type(s) for +: 'Timestamp' and 'float'