# Imports

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from pprint import pprint

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

print(torch.__version__)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Parameters

In [None]:
# Data paths
# Root directory that FootballDataset walks to find the per-season CSV files.
data_path = '/kaggle/input/bundesliga-2010-2020'
# Directory intended for model files (not referenced in the visible part of this notebook).
model_dir = '/kaggle/models'

# Device the model is moved to via `model.to(device)`.
device = 'cpu'
#device = 'cuda'

# Dataset

In [None]:
###  Football Dataset Class  ###
class FootballDataset(Dataset):
    """Matchday-indexed dataset built from per-season CSV files.

    The data directory is walked recursively. For every directory whose name is
    the starting year of a season, the following files are loaded (first CSV
    column is used as the index):

    * ``matchdays.csv``      -- rows: home team, columns: away team, value: matchday
    * ``results.csv``        -- rows: home team, columns: away team, value: ``'h:a'``
    * ``market_values.csv``  -- rows: team, columns: matchday (as string)

    One dataset item corresponds to one matchday and contains the matches of
    that matchday (``md_0``) and of the following one (``md_1``).
    """

    # League format constants used to convert between indices and matchdays.
    MATCHDAYS_PER_SEASON = 34
    MATCHES_PER_MATCHDAY = 9

    def __init__(self, data_path):
        """Load all seasons found below ``data_path`` and derive summary data.

        Raises:
            FileNotFoundError: if no ``matchdays.csv`` was found.
        """
        self.data_path = data_path           # Path to the data
        self.df_md_dict = {}                 # Matchdays DataFrame per season (key: starting year)
        self.df_results_dict = {}            # Match results DataFrame per season (key: starting year)
        self.df_mv_dict = {}                 # Market value DataFrame per season (key: starting year)
        self.team_list_dict = {}             # Team list per season (key: starting year)

        # Load data
        self.__load_data__()
        if not self.df_md_dict:
            raise FileNotFoundError(f'No matchdays.csv files found below {data_path!r}')

        # Compute database characteristics
        self.year_list = sorted(self.df_md_dict)                # Available years (ascending)
        self.database_len = len(self)                           # Number of completed matchdays
        self.total_team_list = self.get_total_team_list()       # All teams, sorted alphabetically
        self.total_team_dict = self.get_total_team_dict()       # Team name -> index in total_team_list
        self.total_team_number = len(self.total_team_dict)      # Number of distinct teams

        message = (
        f'Dataset loaded:\n'
        f'Dataset contains {len(self.year_list)} seasons from {self.year_list[0]}/{self.year_list[0]+1} to {self.year_list[-1]}/{self.year_list[-1]+1} '
        f'with {self.database_len} matchdays and {self.total_team_number} teams.'
        )
        print(message)

    def __load_data__(self):
        """Read the known CSV files into the per-season dictionaries.

        Files with other names are ignored. The season year is taken from the
        name of the directory that contains the file; a non-numeric directory
        name for a known file raises ``ValueError``.
        """
        targets = {
            'matchdays.csv': self.df_md_dict,
            'results.csv': self.df_results_dict,
            'market_values.csv': self.df_mv_dict,
        }
        for dirname, _, filenames in os.walk(self.data_path):
            for filename in filenames:
                target = targets.get(filename)
                if target is None:
                    continue

                year = int(os.path.basename(os.path.normpath(dirname)))
                df = pd.read_csv(os.path.join(dirname, filename), index_col=0)
                target[year] = df

                # Only the matchday table is indexed by team on both axes, so
                # it is the single source for the season's team list.
                if filename == 'matchdays.csv':
                    self.team_list_dict[year] = df.index.tolist()

    @staticmethod
    def _lookup_year(table, year):
        """Return ``table[year]`` or raise a ``KeyError`` with a clear message."""
        try:
            return table[year]
        except KeyError:
            raise KeyError(f'No data for year {year} available') from None

    def get_matchdays(self, year):
        """Return the matchdays DataFrame of the season starting in ``year``."""
        return self._lookup_year(self.df_md_dict, year)

    def get_results(self, year):
        """Return the match results DataFrame of the season starting in ``year``."""
        return self._lookup_year(self.df_results_dict, year)

    def get_market_values(self, year):
        """Return the market value DataFrame of the season starting in ``year``."""
        return self._lookup_year(self.df_mv_dict, year)

    def get_team_list(self, year):
        """Return the list of teams of the season starting in ``year``."""
        return self._lookup_year(self.team_list_dict, year)

    def get_total_team_list(self):
        """Return all team names of all seasons, de-duplicated and sorted."""
        teams = set()
        for df_md in self.df_md_dict.values():
            teams.update(df_md.index.tolist())
        return sorted(teams)

    def get_total_team_dict(self):
        """Return a mapping from team name to its index in the sorted team list."""
        return {team: idx for idx, team in enumerate(self.get_total_team_list())}

    def get_year_list(self):
        """Return the available years in the database."""
        return self.year_list

    def get_match_list(self, year, matchday):
        """Return the ``(home_team, away_team)`` pairs played on ``matchday``.

        Pairs are ordered row by row of the matchdays table. The cells are
        compared directly instead of relying on ``DataFrame.stack`` dropping
        NaN entries, which differs between pandas versions.
        """
        df_md = self.get_matchdays(year)
        rows, cols = np.nonzero(df_md.to_numpy() == matchday)
        return [(df_md.index[r], df_md.columns[c]) for r, c in zip(rows, cols)]

    def get_club_market_value(self, team, year, matchday):
        """Return the market value of ``team`` at ``matchday`` of season ``year``."""
        df_mv = self.get_market_values(year)
        return df_mv.loc[team, str(matchday)]

    def get_match_result(self, year, home_team, away_team):
        """Return the raw result cell of ``home_team`` vs. ``away_team``."""
        return self.get_results(year).loc[home_team][away_team]

    def idx_to_year_md(self, idx):
        """Convert a dataset index into ``(year, matchday)`` (matchday is 1-based)."""
        year = self.year_list[idx // self.MATCHDAYS_PER_SEASON]
        matchday = idx % self.MATCHDAYS_PER_SEASON + 1
        return year, matchday

    def year_md_to_idx(self, year, matchday):
        """Convert ``(year, matchday)`` into the corresponding dataset index."""
        return self.year_list.index(year) * self.MATCHDAYS_PER_SEASON + matchday - 1

    def result_to_scores(self, match_result):
        """Convert a ``'h:a'`` result into goal differences for both teams.

        Returns ``(home - away, away - home)``. A result that is not a string
        (e.g. NaN for a match without a result) is treated as ``0:0``.
        """
        if isinstance(match_result, str):
            goals = match_result.strip().split(':')
        else:
            goals = [0, 0]

        home_goals, away_goals = int(goals[0]), int(goals[1])
        return home_goals - away_goals, away_goals - home_goals

    def match_to_match_info(self, match):
        """Unpack one match of a loaded sample and add the team indices.

        Returns ``(home_team, away_team, score_home, score_away, mv_home,
        mv_away, home_idx, away_idx)``.
        """
        # NOTE(review): assumes the sample was collated by a DataLoader with
        # batch size 1, i.e. names are 1-element sequences and numbers are
        # tensors -- confirm against the caller.
        home_team = match[0][0]
        away_team = match[1][0]
        score_home = match[2].float()
        score_away = match[3].float()
        mv_home = match[4].float()
        mv_away = match[5].float()
        home_idx = self.total_team_dict[home_team]
        away_idx = self.total_team_dict[away_team]

        return home_team, away_team, score_home, score_away, mv_home, mv_away, home_idx, away_idx

    def scores_to_result(self, score_home, score_away, draw_thresh=0.3):
        """Classify two scores as home win, draw or away win.

        Returns a one-hot tensor: ``[1,0,0]`` home win, ``[0,1,0]`` draw,
        ``[0,0,1]`` away win. Score differences within ``draw_thresh`` of
        zero count as a draw.
        """
        result = torch.zeros(3)
        score_diff = score_home - score_away

        if score_diff > draw_thresh:
            result[0] = 1
        elif score_diff < -draw_thresh:
            result[2] = 1
        else:
            result[1] = 1
        return result

    def __len__(self):
        """Return the number of completed matchdays.

        All seasons but the last are counted as complete; for the last season
        the number of available results is divided by the matches per matchday.
        """
        completed = (len(self.year_list) - 1) * self.MATCHDAYS_PER_SEASON
        played_matches = self.get_results(self.year_list[-1]).notna().sum().sum()
        # int() guarantees a plain Python integer, as required for __len__.
        return completed + int(round(played_matches / self.MATCHES_PER_MATCHDAY))

    def _load_matchday(self, year, matchday):
        """Return all matches of one matchday as
        ``[home_team, away_team, score_home, score_away, mv_home, mv_away]``."""
        matches = []
        for home_team, away_team in self.get_match_list(year, matchday):
            match_result = self.get_match_result(year, home_team, away_team)
            score_home, score_away = self.result_to_scores(match_result)
            mv_home = self.get_club_market_value(home_team, year, matchday)
            mv_away = self.get_club_market_value(away_team, year, matchday)
            matches.append([home_team, away_team, score_home, score_away, mv_home, mv_away])
        return matches

    def __getitem__(self, idx):
        """Return the sample for dataset index ``idx``.

        The sample is a dict with the match lists of the current matchday
        (``'md_0'``) and the next one (``'md_1'``) plus ``'year'`` and
        ``'matchday'`` of the current matchday.
        """
        # The sample also needs matchday idx+1, so the last index already
        # reaches past the completed matchdays; only a warning is printed.
        if idx >= self.database_len-1:
            print('Warning: Index ' + str(idx) + ' exceeded database length')

        year, matchday = self.idx_to_year_md(idx)

        sample = dict()
        sample['md_0'] = self._load_matchday(year, matchday)
        sample['md_1'] = self._load_matchday(*self.idx_to_year_md(idx+1))
        sample['year'] = year
        sample['matchday'] = matchday
        return sample

## Dataloader

In [None]:
# Build the dataset from the CSV files below `data_path` and wrap it in a
# loader that yields one matchday sample at a time in index order
# (batch size 1 and no shuffling are the DataLoader defaults, spelled out here).
dataset = FootballDataset(data_path)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# Model

In [None]:
### Football Prediction Model ###
# This model uses an LSTM followed by an MLP
class LSTM_MLP(torch.nn.Module):
    """Shared LSTM cell with one (hidden, cell) state row per team, plus an MLP head.

    Args:
        n_features: input size of the LSTM cell.
        n_cells: number of state rows (one per team index).
        n_hidden_dim: size of the hidden and cell state of each row.
    """

    def __init__(self, n_features, n_cells, n_hidden_dim):
        super().__init__()

        ## LSTM init
        self.n_features = n_features
        self.n_cells = n_cells
        self.n_hidden_dim = n_hidden_dim # number of hidden state dimension

        self.lstm_cell = nn.LSTMCell(self.n_features, self.n_hidden_dim)

        ## MLP init
        # The head consumes the concatenated hidden states of two teams and
        # outputs two values.
        self.dim_input = self.n_hidden_dim*2
        self.dim_output = 2

        self.layer_1 = nn.Linear(self.dim_input, self.dim_input)
        self.layer_2 = nn.Linear(self.dim_input, self.dim_input)
        self.layer_3 = nn.Linear(self.dim_input, self.dim_output)

        # Make forward() usable right after construction.
        self.lstm_init()

    def lstm_init(self):
        """Reset the hidden and cell states of all rows to zero.

        The states are created on the device and with the dtype of the model
        parameters, so call this again after moving the model.
        """
        ref = next(self.parameters())
        shape = (self.n_cells, self.n_hidden_dim)
        self.hidden_state = torch.zeros(shape, dtype=ref.dtype, device=ref.device, requires_grad=True)
        self.cell_state = torch.zeros(shape, dtype=ref.dtype, device=ref.device, requires_grad=True)

    def forward(self, home_input, away_input):
        """Update two state rows and predict two output values.

        Args:
            home_input: 1-D tensor; elements 0-1 are features, element 2 is a
                row index (stored as a number).
            away_input: 1-D tensor with the same layout.

        Returns:
            1-D tensor with ``dim_output`` (2) values.
        """
        # Detach hidden states and cell states of the LSTM so that the gradient
        # does not need to flow backwards through every iteration
        self.hidden_state.detach_()
        self.cell_state.detach_()

        # NOTE(review): the row index is read from the *other* argument's
        # element 2. In train() that element holds the index of the team's
        # opponent on the previous matchday, so `home_idx`/`away_idx` may not
        # be the teams of the upcoming match -- confirm the intended indexing.
        away_idx = int(home_input[2].item())
        home_idx = int(away_input[2].item())

        # LSTM input: the two features followed by the hidden state of the
        # row selected above.
        home_input = torch.cat((home_input[0:2], self.hidden_state[away_idx]), 0)
        away_input = torch.cat((away_input[0:2], self.hidden_state[home_idx]), 0)

        # Update the state of both rows. The previous states are cloned so the
        # in-place write below does not touch tensors the graph still needs.
        new_hidden, new_cell = self.lstm_cell(
            home_input.view(1, -1),
            (self.hidden_state[home_idx].clone().view(1, -1),
             self.cell_state[home_idx].clone().view(1, -1)),
        )
        self.hidden_state[home_idx] = new_hidden
        self.cell_state[home_idx] = new_cell

        new_hidden, new_cell = self.lstm_cell(
            away_input.view(1, -1),
            (self.hidden_state[away_idx].clone().view(1, -1),
             self.cell_state[away_idx].clone().view(1, -1)),
        )
        self.hidden_state[away_idx] = new_hidden
        self.cell_state[away_idx] = new_cell

        # Predict from the two updated hidden states.
        # NOTE(review): no activation between the linear layers (a ReLU call
        # was commented out in the original), so the head is purely linear.
        input_tensor = torch.cat((self.hidden_state[home_idx], self.hidden_state[away_idx]), 0)

        out = self.layer_1(input_tensor)
        out = self.layer_2(out)
        out = self.layer_3(out)

        return out

### Create the model and move it to the configured device

In [None]:
# Model parameters
# Model parameters
n_hidden_dim = 10                     # Size of the LSTM hidden state
n_features = n_hidden_dim + 2         # LSTM input size: score, market value and a hidden state
n_cells = dataset.total_team_number   # One LSTM state row per team in the dataset

# Build the network and move it to the configured device in one step
model = LSTM_MLP(n_features, n_cells, n_hidden_dim).to(device)

# Adam with an exponentially decaying learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

# Summed absolute error between predicted and target scores
criterion = nn.L1Loss(reduction='sum')

print(model)

# Model training and prediction

In [None]:
def train(model, optimizer, scheduler, sample, printing=False):
    """Run one optimisation step on a single dataset sample.

    The matches of the current matchday (``sample['md_0']``) are turned into
    per-team model inputs; the model then predicts every match of the next
    matchday (``sample['md_1']``) and is updated on the summed loss.

    Uses the module-level ``dataset`` (team lookup, result classification) and
    ``criterion`` (loss function).

    Args:
        model: network providing ``lstm_init()``, ``n_cells`` and a
            ``forward(home_input, away_input)`` call.
        optimizer: optimizer stepped once per call.
        scheduler: learning-rate scheduler stepped once per call.
        sample: sample dict as produced by the DataLoader.
        printing: if true, print one line per predicted match.

    Returns:
        Tuple ``(mean_loss, accuracy)`` over the matches of ``md_1``; both are
        ``0.0`` when ``md_1`` contains no matches (no update is performed).
    """
    # NOTE(review): the LSTM states are reset on every call, so no state is
    # carried over from one matchday sample to the next -- confirm this is
    # intended.
    model.lstm_init()
    model.train()
    optimizer.zero_grad()

    # NOTE(review): anomaly detection is a global debugging switch that slows
    # down autograd; consider enabling it only while debugging.
    torch.autograd.set_detect_anomaly(True)

    # Create the inputs on the same device as the model parameters.
    model_device = next(model.parameters()).device

    ## Prepare input tensor with dim [n_cells x 3]
    # For each club the tensor holds the recent match score, the market value
    # and the index of the opponent of the current matchday
    score_tensor = torch.zeros(model.n_cells, 1, dtype=torch.float32, device=model_device)
    mv_tensor = torch.zeros(model.n_cells, 1, dtype=torch.float32, device=model_device)
    opponent_idx_tensor = torch.zeros(model.n_cells, 1, device=model_device)

    for match in sample['md_0']:
        _, _, score_home, score_away, mv_home, mv_away, home_idx, away_idx = dataset.match_to_match_info(match)

        score_tensor[home_idx] = score_home
        score_tensor[away_idx] = score_away

        mv_tensor[home_idx] = mv_home
        mv_tensor[away_idx] = mv_away

        opponent_idx_tensor[home_idx] = away_idx
        opponent_idx_tensor[away_idx] = home_idx

    lstm_input_tensor = torch.cat((score_tensor, mv_tensor, opponent_idx_tensor), 1)

    # Nothing to predict: avoid calling backward() on an int and dividing by zero.
    next_matches = sample['md_1']
    if len(next_matches) == 0:
        return 0.0, 0.0

    ## Predict match results of the next matchday
    total_loss = 0
    accuracy = 0
    for match in next_matches:
        home_team, away_team, score_home, score_away, mv_home, mv_away, home_idx, away_idx = dataset.match_to_match_info(match)

        # Predict match result
        score_pred = model(lstm_input_tensor[home_idx], lstm_input_tensor[away_idx])

        # Calculate loss; the target is the doubled (home, away) score pair
        score = torch.cat((score_home, score_away), 0).to(model_device)
        score = torch.mul(score, 2)
        pred_loss = criterion(score_pred, score)
        total_loss += pred_loss

        # Classify the match result (home win, draw, away win)
        result = dataset.scores_to_result(score_home, score_away)
        result_pred = dataset.scores_to_result(score_pred[0], score_pred[1])

        # Track match result classification accuracy
        pred_corr = False
        if torch.argmax(result_pred) == torch.argmax(result):
            accuracy += 1
            pred_corr = True

        # Print predicted match results
        if printing:
            message = (
                f'{score_pred[0].item():.3f}, {score_pred[1].item():.3f},\t{score_home.item():.0f}, {score_away.item():.0f},'
                f'\t[{result_pred[0].item():.0f} {result_pred[1].item():.0f} {result_pred[2].item():.0f}]'
                f'\t[{result[0].item():.0f} {result[1].item():.0f} {result[2].item():.0f}]'
                f'\t{pred_corr}'
                f'\t{pred_loss.item():.2f},'
                f'\t{home_team}\t\t{away_team:>10}'
            )
            print(message)

    total_loss.backward()
    optimizer.step()
    scheduler.step()

    mean_loss = total_loss  / len(next_matches)
    accuracy /= len(next_matches)

    return mean_loss.item(), accuracy

# Training

In [None]:
# Save mean average accuracy per season
# Mean accuracy per season (key: starting year of the season)
season_acc_dict = {year: 0 for year in dataset.year_list}

# Number of matchday samples per season, derived from the dataset instead of
# hard-coding the length of the (possibly unfinished) last season.
season_md_count = {year: 0 for year in dataset.year_list}
for sample_idx in range(len(dataloader)):
    season_md_count[dataset.idx_to_year_md(sample_idx)[0]] += 1

# Loop through the whole dataset
for idx, sample in tqdm(enumerate(dataloader), total=len(dataloader)):
    year, matchday = dataset.idx_to_year_md(idx)

    # Model training and prediction
    curr_loss, curr_acc = train(model, optimizer, scheduler, sample, printing=True)

    # Track season accuracy (each sample contributes its share of the season mean)
    season_acc_dict[year] += (curr_acc/season_md_count[year])

    # The prediction belongs to the matchday after the current one
    year, matchday = dataset.idx_to_year_md(idx+1)
    print(f'Iteration {idx}    Prediction of matchday {matchday} ({year})    Loss.:  {curr_loss:.2f}    Acc.:  {curr_acc:.2f}')

print('\n Overall acc. of all seasons:')
pprint(season_acc_dict)