In [1]:
import pandas as pd
import datetime as dt
import numpy as np
from utils import ranked_probability_loss

pd.options.display.max_rows = 200
pd.options.display.max_columns = 100
pd.options.mode.chained_assignment = None

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer
from sklearn.utils.validation import check_array


In [2]:
# Reading the data

bets = pd.read_csv("data/bets.zip")
booking = pd.read_csv("data/booking.zip")
goals = pd.read_csv("data/goals.zip")
matches = pd.read_csv("data/matches.zip")
stats = pd.read_csv("data/stats.zip")

In [3]:
# Converting epoch column to datetime
matches['timestamp'] = matches['epoch'].apply(lambda x: dt.datetime.fromtimestamp(x))
bets['timestamp'] = bets['odd_epoch'].apply(lambda x: dt.datetime.fromtimestamp(x))


In [4]:
def week_converter(timestamp):
  """
  year is 2019 for dates between 2019-07 and 2020-06, 
  22nd week just random splitter, 
  there might be better representation
  
  is_national is True for Friday, Saturday, Sunday, Monday 
  False otherwise
  """
  # year = (timestamp - dt.timedelta(1)).dt.strftime('%Y')
  s = '2016-03-14 17:24:55'  
  year, week, day = (timestamp - dt.timedelta(1)).isocalendar()
  season = year - 1 if week < 27 else year
  is_weekend = day >= 4
  date = timestamp - dt.timedelta(1)
  hour = timestamp.fromisoformat(s).hour
  return [date, season, year, week, is_weekend, hour]

In [5]:
matches[['date', 'season', 'year', 'week', 'is_weekend', "hour"]] = pd.DataFrame(matches.timestamp.apply(week_converter).values.tolist(), 
                                                       index=matches.index)

In [6]:
teams = matches.groupby(["match_awayteam_id", "match_awayteam_name"]).size().reset_index()

In [7]:
matches = matches.sort_values("date")

In [18]:
team_match = pd.DataFrame()
    
for index, row in matches.iterrows():
    away_team = row["match_awayteam_id"]
    home_team = row["match_hometeam_id"]
    
    dt = [[away_team,
          row["match_awayteam_name"],
          row["match_id"],
          "Away",
          row["season"],
          row["date"],
          row["hour"],
          row["match_live"],
          row["match_awayteam_score"],
          row["match_hometeam_score"],
         ]]
    
    df = pd.DataFrame(dt, columns = ['TeamId', 'TeamName', "MatchId", "HomeAway", "Season", "Date", "Hour", "Live", "Scored", "Conceded"]) 

    team_match = team_match.append(df)
    
    dt = [[home_team,
          row["match_hometeam_name"],
          row["match_id"],
          "Home",
          row["season"],
          row["date"],
          row["hour"],
          row["match_live"],
          row["match_hometeam_score"],
          row["match_awayteam_score"]
         ]]
    
    df = pd.DataFrame(dt, columns = ['TeamId', 'TeamName', "MatchId", "HomeAway", "Season", "Date", "Hour", "Live", "Scored", "Conceded"]) 

    team_match = team_match.append(df)

In [15]:
def point(row):
   if row['Scored'] > row["Conceded"]:
      return 3
   if row['Scored'] < row["Conceded"]:
      return 0
   if row['Scored'] == row["Conceded"]:
      return 1

   return np.nan

team_match['Point'] = team_match.apply (lambda row: point(row), axis=1)

In [26]:
team_match["SeasonOrder"] = team_match.groupby(["TeamId", "Season"])["Date"].rank("dense", ascending=True)
team_match["OverallOrder"] = team_match.groupby(["TeamId"])["Date"].rank("dense", ascending=True)

In [120]:
roll1 = lambda x: x.rolling(1).sum().shift()
roll5 = lambda x: x.rolling(5).sum().shift()
historic = lambda x: x.expanding().mean().shift()

In [125]:
team_match["Point1"] = team_match.groupby(["Season", "TeamId"]).Point.apply(roll1).reset_index(0,drop=True)
team_match["GoalScored1"] = team_match.groupby(["Season", "TeamId"]).Scored.apply(roll1).reset_index(0,drop=True)
team_match["GoalConceded1"] = team_match.groupby(["Season", "TeamId"]).Conceded.apply(roll1).reset_index(0,drop=True)

team_match["Point5"] = team_match.groupby(["Season", "TeamId"]).Point.apply(roll5).reset_index(0,drop=True)
team_match["GoalScored5"] = team_match.groupby(["Season", "TeamId"]).Scored.apply(roll5).reset_index(0,drop=True)
team_match["GoalConceded5"] = team_match.groupby(["Season", "TeamId"]).Conceded.apply(roll5).reset_index(0,drop=True)

team_match["Point1Pos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Point.apply(roll1).reset_index(0,drop=True)
team_match["GoalScored1Pos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Scored.apply(roll1).reset_index(0,drop=True)
team_match["GoalConceded1Pos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Conceded.apply(roll1).reset_index(0,drop=True)


In [94]:
team_match["PerformanceSeason"] = team_match.groupby(["Season", "TeamId"]).Point.apply(historic).reset_index(0,drop=True)
team_match["PerformanceHistoric"] = team_match.groupby(["TeamId"]).Point.apply(historic).reset_index(0,drop=True)


In [114]:
team_match["Won"] = [1 if x == 3 else 0 for x in team_match['Point']]
team_match["Draw"] = [1 if x == 3 else 0 for x in team_match['Point']]
team_match["Lost"] = [1 if x == 3 else 0 for x in team_match['Point']]

In [124]:
team_match["DrawRatio"] = team_match.groupby(["TeamId"]).Draw.apply(historic).reset_index(0,drop=True)
team_match["WinRatio"] = team_match.groupby(["TeamId"]).Won.apply(historic).reset_index(0,drop=True)
team_match["LostRatio"] = team_match.groupby(["TeamId"]).Lost.apply(historic).reset_index(0,drop=True)

team_match["DrawRatioSeason"] = team_match.groupby(["Season", "TeamId"]).Draw.apply(historic).reset_index(0,drop=True)
team_match["WinRatioSeason"] = team_match.groupby(["Season", "TeamId"]).Won.apply(historic).reset_index(0,drop=True)
team_match["LostRatioSeason"] = team_match.groupby(["Season", "TeamId"]).Lost.apply(historic).reset_index(0,drop=True)

team_match["DrawRatioPos"] = team_match.groupby(["TeamId", "HomeAway"]).Draw.apply(historic).reset_index(0,drop=True)
team_match["WinRatioPos"] = team_match.groupby(["TeamId", "HomeAway"]).Won.apply(historic).reset_index(0,drop=True)
team_match["LostRatioPos"] = team_match.groupby(["TeamId", "HomeAway"]).Lost.apply(historic).reset_index(0,drop=True)

team_match["DrawRatioSeasonPos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Draw.apply(historic).reset_index(0,drop=True)
team_match["WinRatioSeasonPos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Won.apply(historic).reset_index(0,drop=True)
team_match["LostRatioSeasonPos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Lost.apply(historic).reset_index(0,drop=True)


In [161]:
team_match = team_match.drop(["Draw", "Won", "Lost"], axis = 1)
team_match = team_match.drop(["index"], axis = 1)


In [162]:
team_match

Unnamed: 0,TeamId,TeamName,MatchId,HomeAway,Season,Date,Hour,Live,Scored,Conceded,Point,SeasonOrder,OverallOrder,Point5,GoalScored5,GoalConceded5,Point1,GoalScored1,GoalConceded1,PerformanceSeason,PerformanceHistoric,DrawRatio,WinRatio,LostRatio,DrawRatioSeason,WinRatioSeason,LostRatioSeason,DrawRatioPos,WinRatioPos,LostRatioPos,DrawRatioSeasonPos,WinRatioSeasonPos,LostRatioSeasonPos,Point1Pos,GoalScored1Pos,GoalConceded1Pos
0,7109,Valencia,41196,Away,2017,2017-09-15 14:00:00,17,0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.000000,1.000000,,,,,,,,,,,,,,,
1,7097,Levante,41196,Home,2017,2017-09-15 14:00:00,17,0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.000000,1.000000,,,,,,,,,,,,,,,
2,2614,Southampton,13331,Away,2017,2017-09-15 14:30:00,17,0,1.0,0.0,3.0,1.0,1.0,,,,,,,3.000000,3.000000,,,,,,,,,,,,,,,
3,2619,Crystal Palace,13331,Home,2017,2017-09-15 14:30:00,17,0,0.0,1.0,0.0,1.0,1.0,,,,,,,0.000000,0.000000,,,,,,,,,,,,,,,
4,3224,FC Augsburg,17683,Away,2017,2017-09-15 16:30:00,17,0,2.0,1.0,3.0,1.0,1.0,,,,,,,3.000000,3.000000,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11567,2633,Hull,242369,Away,2019,2019-12-12 21:45:00,17,0,,,,22.0,86.0,,,,,,,1.368421,1.180723,0.294118,0.294118,0.294118,0.333333,0.333333,0.333333,0.219512,0.219512,0.219512,0.300000,0.300000,0.300000,,,
11568,3034,Montpellier,242828,Away,2019,2019-12-12 21:45:00,17,0,,,,20.0,84.0,,,,,,,1.333333,1.426829,0.349398,0.349398,0.349398,0.315789,0.315789,0.315789,0.300000,0.300000,0.300000,0.000000,0.000000,0.000000,1.0,2.0,2.0
11569,2663,Charlton,242369,Home,2019,2019-12-12 21:45:00,17,0,,,,22.0,30.0,,,,,,,1.210526,1.296296,0.310345,0.310345,0.310345,0.285714,0.285714,0.285714,0.500000,0.500000,0.500000,0.363636,0.363636,0.363636,,,
11570,7103,Leganes,262058,Away,2019,2019-12-12 22:00:00,17,0,,,,17.0,87.0,,,,,,,0.400000,0.976471,0.244186,0.244186,0.244186,0.062500,0.062500,0.062500,0.100000,0.100000,0.100000,0.000000,0.000000,0.000000,0.0,0.0,1.0


In [198]:
cols = list(range(2,3)) + list(range(14, 36))

home = team_match[team_match["HomeAway"] == 'Home'].iloc[:, cols]
away = team_match[team_match["HomeAway"] == 'Away'].iloc[:, cols]

In [203]:
team_stats = home.merge(away, on='MatchId', how='inner', suffixes=('_Home', '_Away'))

Unnamed: 0,MatchId,GoalScored5_Home,GoalConceded5_Home,Point1_Home,GoalScored1_Home,GoalConceded1_Home,PerformanceSeason_Home,PerformanceHistoric_Home,DrawRatio_Home,WinRatio_Home,LostRatio_Home,DrawRatioSeason_Home,WinRatioSeason_Home,LostRatioSeason_Home,DrawRatioPos_Home,WinRatioPos_Home,LostRatioPos_Home,DrawRatioSeasonPos_Home,WinRatioSeasonPos_Home,LostRatioSeasonPos_Home,Point1Pos_Home,GoalScored1Pos_Home,GoalConceded1Pos_Home,GoalScored5_Away,GoalConceded5_Away,Point1_Away,GoalScored1_Away,GoalConceded1_Away,PerformanceSeason_Away,PerformanceHistoric_Away,DrawRatio_Away,WinRatio_Away,LostRatio_Away,DrawRatioSeason_Away,WinRatioSeason_Away,LostRatioSeason_Away,DrawRatioPos_Away,WinRatioPos_Away,LostRatioPos_Away,DrawRatioSeasonPos_Away,WinRatioSeasonPos_Away,LostRatioSeasonPos_Away,Point1Pos_Away,GoalScored1Pos_Away,GoalConceded1Pos_Away
0,41196,,,,,,1.000000,1.000000,,,,,,,,,,,,,,,,,,,,,1.000000,1.000000,,,,,,,,,,,,,,,
1,13331,,,,,,0.000000,0.000000,,,,,,,,,,,,,,,,,,,,,3.000000,3.000000,,,,,,,,,,,,,,,
2,17683,,,,,,0.000000,0.000000,,,,,,,,,,,,,,,,,,,,,3.000000,3.000000,,,,,,,,,,,,,,,
3,17684,,,,,,0.000000,0.000000,,,,,,,,,,,,,,,,,,,,,3.000000,3.000000,,,,,,,,,,,,,,,
4,17682,,,,,,3.000000,3.000000,,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5781,245297,,,,,,1.687500,1.260274,0.310811,0.310811,0.310811,0.411765,0.411765,0.411765,0.315789,0.315789,0.315789,0.444444,0.444444,0.444444,3.0,3.0,2.0,,,,,,1.625000,1.263889,0.369863,0.369863,0.369863,0.470588,0.470588,0.470588,0.342857,0.342857,0.342857,0.375000,0.375000,0.375000,,,
5782,243280,,,,,,1.615385,1.417910,0.352941,0.352941,0.352941,0.428571,0.428571,0.428571,0.393939,0.393939,0.393939,0.428571,0.428571,0.428571,1.0,1.0,1.0,,,,,,1.076923,0.970149,0.220588,0.220588,0.220588,0.214286,0.214286,0.214286,0.166667,0.166667,0.166667,0.142857,0.142857,0.142857,1.0,1.0,1.0
5783,242828,,,,,,1.529412,1.450000,0.407407,0.407407,0.407407,0.388889,0.388889,0.388889,0.538462,0.538462,0.538462,0.600000,0.600000,0.600000,,,,,,,,,1.333333,1.426829,0.349398,0.349398,0.349398,0.315789,0.315789,0.315789,0.300000,0.300000,0.300000,0.000000,0.000000,0.000000,1.0,2.0,2.0
5784,242369,,,,,,1.210526,1.296296,0.310345,0.310345,0.310345,0.285714,0.285714,0.285714,0.500000,0.500000,0.500000,0.363636,0.363636,0.363636,,,,,,,,,1.368421,1.180723,0.294118,0.294118,0.294118,0.333333,0.333333,0.333333,0.219512,0.219512,0.219512,0.300000,0.300000,0.300000,,,


Unnamed: 0,TeamId,TeamName,MatchId,HomeAway,Season,Date,Hour,Live,Scored,Conceded,Point,SeasonOrder,OverallOrder,Point5,GoalScored5,GoalConceded5,Point1,GoalScored1,GoalConceded1,PerformanceSeason,PerformanceHistoric,DrawRatio,WinRatio,LostRatio,DrawRatioSeason,WinRatioSeason,LostRatioSeason,DrawRatioPos,WinRatioPos,LostRatioPos,DrawRatioSeasonPos,WinRatioSeasonPos,LostRatioSeasonPos,Point1Pos,GoalScored1Pos,GoalConceded1Pos
3,2619,Crystal Palace,13331,Home,2017,2017-09-15 14:30:00,17,0,0.0,1.0,0.0,1.0,1.0,,,,,,,0.0,0.0,,,,,,,,,,,,,,,
29,2637,Cardiff,13450,Home,2017,2017-09-15 17:00:00,17,0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0,1.0,,,,,,,,,,,,,,,
31,2638,Millwall,13454,Home,2017,2017-09-15 17:00:00,17,0,1.0,0.0,3.0,1.0,1.0,,,,,,,3.0,3.0,,,,,,,,,,,,,,,
33,7100,Getafe,41195,Home,2017,2017-09-15 17:15:00,17,0,1.0,2.0,0.0,1.0,1.0,,,,,,,0.0,0.0,,,,,,,,,,,,,,,
35,3039,Monaco,16527,Home,2017,2017-09-15 18:00:00,17,0,3.0,0.0,3.0,1.0,1.0,,,,,,,3.0,3.0,,,,,,,,,,,,,,,
37,4173,Fiorentina,24601,Home,2017,2017-09-15 19:00:00,17,0,2.0,1.0,3.0,1.0,1.0,,,,,,,3.0,3.0,,,,,,,,,,,,,,,
39,3226,RB Leipzig,17681,Home,2017,2017-09-15 19:30:00,17,0,2.0,2.0,1.0,1.0,1.0,,,,,,,1.0,1.0,,,,,,,,,,,,,,,
41,7606,Galatasaray,45297,Home,2017,2017-09-15 20:00:00,17,0,2.0,0.0,3.0,1.0,1.0,,,,,,,3.0,3.0,,,,,,,,,,,,,,,
42,3030,Dijon,16523,Home,2017,2017-09-15 21:00:00,17,0,0.0,1.0,0.0,1.0,1.0,,,,,,,0.0,0.0,,,,,,,,,,,,,,,
45,4184,AS Roma,24600,Home,2017,2017-09-15 21:45:00,17,0,3.0,0.0,3.0,1.0,1.0,,,,,,,3.0,3.0,,,,,,,,,,,,,,,


In [182]:
team_match[team_match["TeamId"] == 7103].iloc[:, cols]

IndexError: positional indexers are out-of-bounds