# 0. Load dataset and preprocess 

### Imports

In [1]:
import time

import pandas as pd  # 1.5.3

### Function to load raw training dataset 

- Converts the `Date` column to pandas datetime (parsed as day/month/year)
- Sorts the games in the DataFrame from oldest to most recent
- Adds a new column, `Goals`: the total goals of each game (`HS` + `AS`)
- Drops invalid games, i.e. those with total goals < 0

In [5]:
def load_and_process_dataset(training_set_route):
    """Load the raw training set from disk and apply basic preprocessing.

    Steps performed, in order:
      1. Read the file with the pandas reader matching its extension.
      2. Parse the ``Date`` column as day/month/year into pandas datetimes.
      3. Sort the games from oldest to most recent.
      4. Add a ``Goals`` column equal to ``HS + AS``.
      5. Drop games whose ``Goals`` value is negative (invalid rows).

    Parameters
    ----------
    training_set_route : str or path-like
        Path to the dataset. The extension (matched case-insensitively)
        must be ``xlsx`` or ``parquet``.

    Returns
    -------
    pandas.DataFrame
        The processed games; the index produced by the reader is kept.

    Raises
    ------
    ValueError
        If the file extension is neither ``xlsx`` nor ``parquet``.
    """
    # One reader per supported extension, so the timing/printing logic below
    # exists only once instead of being repeated in every branch.
    readers = {"xlsx": pd.read_excel, "parquet": pd.read_parquet}

    # str() lets pathlib.Path routes through; lower() accepts e.g. ".XLSX".
    file_format = str(training_set_route).rsplit(".", 1)[-1].lower()
    if file_format not in readers:
        raise ValueError(
            f"File format '{file_format}' is not supported: expected xlsx or parquet."
        )

    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()
    training_set = readers[file_format](training_set_route)
    print(f"Dataset loaded in {round(time.perf_counter() - start_time, 3)} seconds.")

    # Order dataset by date from older to more recent matches
    training_set["Date"] = pd.to_datetime(training_set["Date"], format="%d/%m/%Y")
    training_set = training_set.sort_values(by=["Date"])
    # Total goals in the match = HS + AS
    training_set["Goals"] = training_set["HS"] + training_set["AS"]
    # Drop invalid games (with negative total goals)
    invalid_games = training_set[training_set["Goals"] < 0].index
    training_set = training_set.drop(invalid_games)

    return training_set

In [6]:
# Load and preprocess the raw training set from the Excel file; the function
# prints how long the read took.
training_set = load_and_process_dataset("datasets/raw/TrainingSet_2023_02_08.xlsx")
# Bare last expression so the notebook renders the processed DataFrame.
training_set

Dataset loaded in 27.588 seconds.


Unnamed: 0,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL,Goals
7102,00-01,CHN1,2000-03-19,Dalian Haichang,Beijing Guoan,2,0,2,W,2
7103,00-01,CHN1,2000-03-19,Qingdao,Shanghai Greenland,1,1,0,D,2
7108,00-01,CHN1,2000-03-19,Xiamen Xiaxin,Liaoning,3,0,3,W,3
7106,00-01,CHN1,2000-03-19,Shenzhen FC,Sichuan Guancheng,0,0,0,D,0
7105,00-01,CHN1,2000-03-19,Guangzhou Rich and Force,Chongqing Lifan,0,0,0,D,0
...,...,...,...,...,...,...,...,...,...,...
299041,22-23,TUN1,2023-02-08,EO Sidi Bouzid,ES Metlaoui,3,4,-1,L,7
299040,22-23,TUN1,2023-02-08,ES Sahel,CA Bizertin,0,2,-2,L,2
299039,22-23,TUN1,2023-02-08,US Ben Guerdane,Soliman,2,2,0,D,4
299037,22-23,TUN1,2023-02-08,Stade Tunisien,US Tataouine,2,1,1,W,3


In [7]:
# Much faster !
training_set = load_and_process_dataset("datasets/raw/training_set.parquet")
training_set

Dataset loaded in 0.141 seconds.


Unnamed: 0,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL,Goals
7102,00-01,CHN1,2000-03-19,Dalian Haichang,Beijing Guoan,2,0,2,W,2
7103,00-01,CHN1,2000-03-19,Qingdao,Shanghai Greenland,1,1,0,D,2
7108,00-01,CHN1,2000-03-19,Xiamen Xiaxin,Liaoning,3,0,3,W,3
7106,00-01,CHN1,2000-03-19,Shenzhen FC,Sichuan Guancheng,0,0,0,D,0
7105,00-01,CHN1,2000-03-19,Guangzhou Rich and Force,Chongqing Lifan,0,0,0,D,0
...,...,...,...,...,...,...,...,...,...,...
299041,22-23,TUN1,2023-02-08,EO Sidi Bouzid,ES Metlaoui,3,4,-1,L,7
299040,22-23,TUN1,2023-02-08,ES Sahel,CA Bizertin,0,2,-2,L,2
299039,22-23,TUN1,2023-02-08,US Ben Guerdane,Soliman,2,2,0,D,4
299037,22-23,TUN1,2023-02-08,Stade Tunisien,US Tataouine,2,1,1,W,3


### Save to processed datasets directory

In [8]:
# Write the processed DataFrame to the processed-datasets directory as Parquet.
# NOTE(review): assumes "datasets/processed/" already exists — confirm before a fresh run.
training_set.to_parquet("datasets/processed/training_set_processed.parquet")