In [66]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import accuracy_score
#import xgboost as xgb
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [5]:
file_path = "C:/Users/samar/OneDrive/Desktop/mathapp/APST/dataChallenge"

In [6]:
away_player_train = file_path + "/Train_Data/train_away_player_statistics_df.csv"
away_team_train = file_path + "/Train_Data/train_away_team_statistics_df.csv"
home_player_train = file_path + "/Train_Data/train_home_player_statistics_df.csv"
home_team_train = file_path + "/Train_Data/train_home_team_statistics_df.csv"

In [7]:
away_player_test = file_path + "/Test_Data/test_away_player_statistics_df.csv"
away_team_test = file_path + "/Test_Data/test_away_team_statistics_df.csv"
home_player_test = file_path + "/Test_Data/test_home_player_statistics_df.csv"
home_team_test = file_path + "/Test_Data/test_home_team_statistics_df.csv"

In [8]:
Y_train_supp = file_path + "/benchmark_and_extras/Y_train_supp.csv"
Y_train = file_path + "/Y_train.csv"
Y_test_random = file_path + "/Y_test_random.csv"

Challenge context

Over the last two decades, professional sports around the world have adapted towards a data-driven approach to their decision-making. Sports analytics are part of live broadcasts, fantasy sports, and every-day discussions. This growth has been fueled by an exponential evolution in sports data.

Data science and machine learning can be useful to tackle the growing field of sports analytics. It can be used by fantasy league players and professional gamblers alike to make better informed decisions. Sports betting websites have become quite sophisticated in this area in the last few years. Models can also be used by managers of professional sports teams and recruiters to build rosters and strategically deploy players in a way that increases the team’s chance of winning.

Football has been at the heart of the sports analytics revolution. All types of statistics, both historical and real-time, are available. This challenge leverages Football data obtained from Sportmonks, a top-tier sports data provider widely used to enhance various online applications and websites. For additional details, feel free to explore sportmonks.com.

Feel free to visit and register to our dedicated forum at challengedata.qube-rt.com for more information about the challenge, the data and QRT.

Challenge goals

As this year’s QRT data challenge, we propose a match result prediction challenge. You will be provided with real historical data at the player, team and league level, and be asked to predict which team wins, or if there is a draw.

We have data for many leagues around the world and at different divisions. Your goal is to build a rich predictive models that can work in any football league regardless of competitive level or geographical location.

Data description

We provide data at the team and player level for dozens of football leagues.

The data comes packed in two zip files, X_train.zip, and X_test.zip, as well as two csv files Y_train.csv, and Y_train_supp.csv

The zip files contain the input data, which is divided into 4 csv files. The data is separated into HOME and AWAY team statistics, which is aggregated at the team and player level. All statistics come from real historical matches. They are summaries of the last 5 games prior to the match, as well as season-to-date statistics of the game being predicted.

The ID column links tables in X_train, with Y_train and Y_train_supp. The same holds true for the test data.

Input team data sets comprise the following 3 identifier columns:

ID, LEAGUE and TEAM_NAME (note that LEAGUE and TEAM_NAME are not included in the test data)
The following 25 statistics, which are aggregated by sum, average and standard deviation.

'TEAM_ATTACKS'
'TEAM_BALL_POSSESSION'
'TEAM_BALL_SAFE'
'TEAM_CORNERS'
'TEAM_DANGEROUS_ATTACKS'
'TEAM_FOULS'
'TEAM_GAME_DRAW'
'TEAM_GAME_LOST'
'TEAM_GAME_WON'
'TEAM_GOALS'
'TEAM_INJURIES'
'TEAM_OFFSIDES'
'TEAM_PASSES'
'TEAM_PENALTIES'
'TEAM_REDCARDS'
'TEAM_SAVES'
'TEAM_SHOTS_INSIDEBOX'
'TEAM_SHOTS_OFF_TARGET'
'TEAM_SHOTS_ON_TARGET',
'TEAM_SHOTS_OUTSIDEBOX'
'TEAM_SHOTS_TOTAL'
'TEAM_SUBSTITUTIONS'
'TEAM_SUCCESSFUL_PASSES'
'TEAM_SUCCESSFUL_PASSES_PERCENTAGE'
'TEAM_YELLOWCARDS'
Input player data sets comprise the following 3 identifier columns:

ID, LEAGUE and TEAM_NAME, POSITION and PLAYER_NAME (note that LEAGUE, TEAM_NAME, and PLAYER_NAME are not included in the test data)
52 statistics, which are aggregated by sum, average and standard deviation. They are similar to the team statistics though more fine-grained.

Output data sets are composed of 4 columns:

ID: Unique row identifier - corresponding to the input identifiers,
HOME_WINS,
DRAW,
AWAY_WINS,
The target score is the accuracy of prediction for the vector [HOME_WINS, DRAW,AWAY_WINS], for which there are three possible choices, [1,0,0]. [0,1,0] and [0,0,1].

All variables have been standardized and team/players/and league names have been removed from the test set. We have provided as much data in the train set as possible for your convenience. We expect from you to not use any external data, which can lead to disqualification.

An example of submission file containing random predictions is provided - see also the notebook in the supplementary material section for the benchmark output.

We have included other alternate training targets, such as GOAL_DIFF_HOME_AWAY, which is the difference of goals between the HOME and AWAY team, in the Y_train_supp file in case you want to train richer models with different targets.

Disclaimer: The data provided is exclusively intended for use in this challenge, and any usage of this dataset for other purposes is strictly prohibited. General terms or service are applicable: terms of service

## Read  team train data

In [14]:
#Read  team train data

train_home_team_statistics_df = pd.read_csv(home_team_train, index_col=0)
train_away_team_statistics_df = pd.read_csv(away_team_train, index_col=0)

train_scores = pd.read_csv(Y_train, index_col=0)

train_home = train_home_team_statistics_df.iloc[:,2:]
train_away = train_away_team_statistics_df.iloc[:,2:]

train_home.columns = 'HOME_' + train_home.columns
train_away.columns = 'AWAY_' + train_away.columns

train_data =  pd.concat([train_home,train_away],join='inner',axis=1)
train_scores = train_scores.loc[train_data.index]

train_data = train_data.replace({np.inf:np.nan,-np.inf:np.nan})

In [15]:
train_home_team_statistics_df.head()

Unnamed: 0_level_0,LEAGUE,TEAM_NAME,TEAM_SHOTS_TOTAL_season_sum,TEAM_SHOTS_INSIDEBOX_season_sum,TEAM_SHOTS_OFF_TARGET_season_sum,TEAM_SHOTS_ON_TARGET_season_sum,TEAM_SHOTS_OUTSIDEBOX_season_sum,TEAM_PASSES_season_sum,TEAM_SUCCESSFUL_PASSES_season_sum,TEAM_SAVES_season_sum,...,TEAM_YELLOWCARDS_5_last_match_std,TEAM_REDCARDS_5_last_match_std,TEAM_OFFSIDES_5_last_match_std,TEAM_ATTACKS_5_last_match_std,TEAM_PENALTIES_5_last_match_std,TEAM_SUBSTITUTIONS_5_last_match_std,TEAM_BALL_SAFE_5_last_match_std,TEAM_DANGEROUS_ATTACKS_5_last_match_std,TEAM_INJURIES_5_last_match_std,TEAM_GOALS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Ligue 1,Toulouse,3.0,2.0,5.0,2.0,1.0,2.0,2.0,5.0,...,3.0,0.0,6.0,0.0,10.0,8.0,7.0,2.0,4.0,3.0
1,Ligue 2,Brest,6.0,8.0,3.0,6.0,5.0,8.0,7.0,10.0,...,4.0,0.0,4.0,3.0,10.0,0.0,1.0,2.0,8.0,4.0
2,Serie A,Sampdoria,4.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0,...,4.0,5.0,6.0,3.0,6.0,7.0,2.0,3.0,2.0,4.0
3,League One,Coventry City,7.0,5.0,5.0,6.0,6.0,9.0,9.0,2.0,...,4.0,0.0,1.0,8.0,8.0,5.0,5.0,5.0,,6.0
4,Premier League,Wolverhampton Wanderers,3.0,3.0,2.0,3.0,4.0,4.0,3.0,4.0,...,1.0,0.0,2.0,5.0,8.0,7.0,2.0,6.0,4.0,4.0


In [16]:
train_data.head()

Unnamed: 0_level_0,HOME_TEAM_SHOTS_TOTAL_season_sum,HOME_TEAM_SHOTS_INSIDEBOX_season_sum,HOME_TEAM_SHOTS_OFF_TARGET_season_sum,HOME_TEAM_SHOTS_ON_TARGET_season_sum,HOME_TEAM_SHOTS_OUTSIDEBOX_season_sum,HOME_TEAM_PASSES_season_sum,HOME_TEAM_SUCCESSFUL_PASSES_season_sum,HOME_TEAM_SAVES_season_sum,HOME_TEAM_CORNERS_season_sum,HOME_TEAM_FOULS_season_sum,...,AWAY_TEAM_YELLOWCARDS_5_last_match_std,AWAY_TEAM_REDCARDS_5_last_match_std,AWAY_TEAM_OFFSIDES_5_last_match_std,AWAY_TEAM_ATTACKS_5_last_match_std,AWAY_TEAM_PENALTIES_5_last_match_std,AWAY_TEAM_SUBSTITUTIONS_5_last_match_std,AWAY_TEAM_BALL_SAFE_5_last_match_std,AWAY_TEAM_DANGEROUS_ATTACKS_5_last_match_std,AWAY_TEAM_INJURIES_5_last_match_std,AWAY_TEAM_GOALS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,2.0,5.0,2.0,1.0,2.0,2.0,5.0,3.0,6.0,...,5.0,5.0,4.0,0.0,6.0,8.0,4.0,3.0,2.0,3.0
1,6.0,8.0,3.0,6.0,5.0,8.0,7.0,10.0,6.0,8.0,...,0.0,0.0,3.0,1.0,8.0,4.0,10.0,0.0,5.0,3.0
2,4.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0,2.0,7.0,...,6.0,10.0,4.0,4.0,0.0,8.0,3.0,0.0,9.0,6.0
3,7.0,5.0,5.0,6.0,6.0,9.0,9.0,2.0,2.0,0.0,...,0.0,0.0,1.0,2.0,0.0,5.0,6.0,3.0,,2.0
4,3.0,3.0,2.0,3.0,4.0,4.0,3.0,4.0,4.0,7.0,...,1.0,0.0,4.0,4.0,9.0,4.0,1.0,4.0,6.0,5.0


In [17]:
train_scores.head()

Unnamed: 0_level_0,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,1
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0


## Read player train data

In [None]:

train_home_player_statistics_df = pd.read_csv(home_player_train, index_col=0)
train_away_player_statistics_df = pd.read_csv(away_player_train, index_col=0)

train_player_home = train_home_player_statistics_df.iloc[:,:]
train_player_away = train_away_player_statistics_df.iloc[:,:]

train_player_home.columns = 'HOME_' + train_player_home.columns
train_player_away.columns = 'AWAY_' + train_player_away.columns

train_player_data = pd.merge(
    train_player_home,
    train_player_away,
    left_on=['HOME_TEAM_NAME', 'HOME_PLAYER_NAME'],
    right_on=['HOME_TEAM_NAME', 'AWAY_PLAYER_NAME'],
    how='inner'
)
train_player_data = train_player_data.replace({np.inf:np.nan,-np.inf:np.nan})

# Group by team and sum the player points to predict team performance
team_performance = train_player_data.groupby(['ID']).sum()

In [57]:
train_player_data.head()

Unnamed: 0_level_0,HOME_LEAGUE,HOME_TEAM_NAME,HOME_POSITION,HOME_PLAYER_NAME,HOME_PLAYER_ACCURATE_CROSSES_season_sum,HOME_PLAYER_ACCURATE_PASSES_season_sum,HOME_PLAYER_AERIALS_WON_season_sum,HOME_PLAYER_ASSISTS_season_sum,HOME_PLAYER_BIG_CHANCES_CREATED_season_sum,HOME_PLAYER_BIG_CHANCES_MISSED_season_sum,...,AWAY_PLAYER_STARTING_LINEUP_5_last_match_std,AWAY_PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,AWAY_PLAYER_TACKLES_5_last_match_std,AWAY_PLAYER_TOTAL_CROSSES_5_last_match_std,AWAY_PLAYER_TOTAL_DUELS_5_last_match_std,AWAY_PLAYER_YELLOWCARDS_5_last_match_std,AWAY_PLAYER_PUNCHES_5_last_match_std,AWAY_PLAYER_LONG_BALLS_5_last_match_std,AWAY_PLAYER_LONG_BALLS_WON_5_last_match_std,AWAY_PLAYER_SHOTS_OFF_TARGET_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
232,Serie A,Grêmio,midfielder,Douglas,9.0,21.0,5.0,9.0,7.0,6.0,...,0.0,0.0,16.0,0.0,33.0,0.0,,,,
303,Serie A,Grêmio,midfielder,Patrick,10.0,5.0,0.0,0.0,0.0,0.0,...,0.0,37.0,46.0,30.0,56.0,50.0,,,,
455,Serie A,Juventus,defender,Danilo,13.0,99.0,27.0,37.0,25.0,0.0,...,0.0,0.0,15.0,8.0,31.0,0.0,0.0,,,
723,Liga Portugal,Santa Clara,midfielder,Costinha,12.0,13.0,5.0,25.0,15.0,0.0,...,63.0,30.0,25.0,26.0,63.0,0.0,0.0,,,
727,Liga Portugal,Sporting Braga,attacker,Paulinho,11.0,18.0,37.0,50.0,45.0,100.0,...,77.0,58.0,39.0,18.0,50.0,0.0,,,,


In [47]:
team_performance.head()

Unnamed: 0_level_0,HOME_PLAYER_ACCURATE_CROSSES_season_sum,HOME_PLAYER_ACCURATE_PASSES_season_sum,HOME_PLAYER_AERIALS_WON_season_sum,HOME_PLAYER_ASSISTS_season_sum,HOME_PLAYER_BIG_CHANCES_CREATED_season_sum,HOME_PLAYER_BIG_CHANCES_MISSED_season_sum,HOME_PLAYER_BLOCKED_SHOTS_season_sum,HOME_PLAYER_CAPTAIN_season_sum,HOME_PLAYER_CLEARANCES_season_sum,HOME_PLAYER_CLEARANCE_OFFLINE_season_sum,...,AWAY_PLAYER_STARTING_LINEUP_5_last_match_std,AWAY_PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,AWAY_PLAYER_TACKLES_5_last_match_std,AWAY_PLAYER_TOTAL_CROSSES_5_last_match_std,AWAY_PLAYER_TOTAL_DUELS_5_last_match_std,AWAY_PLAYER_YELLOWCARDS_5_last_match_std,AWAY_PLAYER_PUNCHES_5_last_match_std,AWAY_PLAYER_LONG_BALLS_5_last_match_std,AWAY_PLAYER_LONG_BALLS_WON_5_last_match_std,AWAY_PLAYER_SHOTS_OFF_TARGET_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
232,9.0,21.0,5.0,9.0,7.0,6.0,0.0,17.0,0.0,0.0,...,0.0,0.0,16.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0
303,10.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,37.0,46.0,30.0,56.0,50.0,0.0,0.0,0.0,0.0
455,13.0,99.0,27.0,37.0,25.0,0.0,21.0,0.0,38.0,0.0,...,0.0,0.0,15.0,8.0,31.0,0.0,0.0,0.0,0.0,0.0
723,12.0,13.0,5.0,25.0,15.0,0.0,0.0,0.0,3.0,0.0,...,63.0,30.0,25.0,26.0,63.0,0.0,0.0,0.0,0.0,0.0
727,11.0,18.0,37.0,50.0,45.0,100.0,3.0,0.0,7.0,0.0,...,77.0,58.0,39.0,18.0,50.0,0.0,0.0,0.0,0.0,0.0


In [81]:

home_players = pd.read_csv(home_player_train)
away_players = pd.read_csv(away_player_train)

# Aggregate player stats by team for each match
home_team_stats = home_players.groupby('ID').mean().fillna(0)
away_team_stats = away_players.groupby('ID').mean().fillna(0)

In [82]:
# Reset indices and merge datasets
home_team_stats.reset_index(inplace=True)
away_team_stats.reset_index(inplace=True)

# Merge on match ID
team_comparison = pd.merge(home_team_stats, away_team_stats, on='ID', suffixes=('_home', '_away'))

In [83]:
team_comparison.head()

Unnamed: 0,ID,PLAYER_ACCURATE_CROSSES_season_sum_home,PLAYER_ACCURATE_PASSES_season_sum_home,PLAYER_AERIALS_WON_season_sum_home,PLAYER_ASSISTS_season_sum_home,PLAYER_BIG_CHANCES_CREATED_season_sum_home,PLAYER_BIG_CHANCES_MISSED_season_sum_home,PLAYER_BLOCKED_SHOTS_season_sum_home,PLAYER_CAPTAIN_season_sum_home,PLAYER_CLEARANCES_season_sum_home,...,PLAYER_STARTING_LINEUP_5_last_match_std_away,PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std_away,PLAYER_TACKLES_5_last_match_std_away,PLAYER_TOTAL_CROSSES_5_last_match_std_away,PLAYER_TOTAL_DUELS_5_last_match_std_away,PLAYER_YELLOWCARDS_5_last_match_std_away,PLAYER_PUNCHES_5_last_match_std_away,PLAYER_LONG_BALLS_5_last_match_std_away,PLAYER_LONG_BALLS_WON_5_last_match_std_away,PLAYER_SHOTS_OFF_TARGET_5_last_match_std_away
0,0,10.222222,21.333333,15.277778,7.777778,7.222222,6.166667,9.222222,45.5,16.722222,...,25.117647,23.764706,24.529412,15.941176,32.470588,39.529412,0.0,0.0,0.0,0.0
1,1,9.222222,31.555556,14.555556,16.666667,10.277778,9.0,11.888889,50.0,17.166667,...,50.555556,11.722222,24.388889,15.055556,32.333333,29.166667,0.0,0.0,0.0,0.0
2,2,8.434783,16.608696,15.347826,2.826087,3.652174,3.869565,8.826087,3.333333,12.826087,...,25.272727,18.681818,29.727273,10.681818,33.545455,37.545455,0.0,0.0,0.0,0.0
3,3,8.888889,34.055556,15.833333,12.0,14.444444,8.722222,7.444444,19.75,19.333333,...,16.625,23.875,23.0,9.75,30.375,20.5625,0.0,0.0,0.0,0.0
4,4,9.722222,25.888889,15.111111,8.166667,9.5,10.666667,9.611111,100.0,17.555556,...,26.833333,9.166667,19.055556,10.944444,22.666667,21.388889,0.0,0.0,0.0,0.0


In [84]:
for column_name in team_comparison.columns:
    if column_name.endswith('_home'):  # Ensure we're processing only '_home' columns
        prefix = column_name[:-5]  # Remove '_home' to get the base column name
        team_comparison[prefix + '_diff'] = (
            team_comparison[column_name] - team_comparison[prefix + '_away']
        )

In [85]:
team_comparison.head()

Unnamed: 0,ID,PLAYER_ACCURATE_CROSSES_season_sum_home,PLAYER_ACCURATE_PASSES_season_sum_home,PLAYER_AERIALS_WON_season_sum_home,PLAYER_ASSISTS_season_sum_home,PLAYER_BIG_CHANCES_CREATED_season_sum_home,PLAYER_BIG_CHANCES_MISSED_season_sum_home,PLAYER_BLOCKED_SHOTS_season_sum_home,PLAYER_CAPTAIN_season_sum_home,PLAYER_CLEARANCES_season_sum_home,...,PLAYER_STARTING_LINEUP_5_last_match_std_diff,PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std_diff,PLAYER_TACKLES_5_last_match_std_diff,PLAYER_TOTAL_CROSSES_5_last_match_std_diff,PLAYER_TOTAL_DUELS_5_last_match_std_diff,PLAYER_YELLOWCARDS_5_last_match_std_diff,PLAYER_PUNCHES_5_last_match_std_diff,PLAYER_LONG_BALLS_5_last_match_std_diff,PLAYER_LONG_BALLS_WON_5_last_match_std_diff,PLAYER_SHOTS_OFF_TARGET_5_last_match_std_diff
0,0,10.222222,21.333333,15.277778,7.777778,7.222222,6.166667,9.222222,45.5,16.722222,...,11.529412,0.294118,0.823529,-1.235294,6.705882,2.235294,0.0,0.0,0.0,0.0
1,1,9.222222,31.555556,14.555556,16.666667,10.277778,9.0,11.888889,50.0,17.166667,...,-47.055556,-1.722222,-10.777778,-6.388889,-10.277778,-9.333333,0.0,0.0,0.0,0.0
2,2,8.434783,16.608696,15.347826,2.826087,3.652174,3.869565,8.826087,3.333333,12.826087,...,28.363636,-5.681818,-5.772727,0.772727,1.636364,-0.954545,0.0,0.0,0.0,0.0
3,3,8.888889,34.055556,15.833333,12.0,14.444444,8.722222,7.444444,19.75,19.333333,...,3.208333,-15.041667,-3.888889,7.305556,-10.763889,-5.006944,0.0,0.0,0.0,0.0
4,4,9.722222,25.888889,15.111111,8.166667,9.5,10.666667,9.611111,100.0,17.555556,...,-7.480392,10.068627,5.297386,4.46732,-0.54902,-6.800654,0.0,0.0,0.0,0.0


In [86]:
# Load labels
labels = pd.read_csv(Y_train)

# Merge with features
data_with_labels = pd.merge(team_comparison, labels, on='ID')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split data
X = data_with_labels.drop(['ID', 'HOME_WINS', 'DRAW', 'AWAY_WINS'], axis=1)
y = data_with_labels[['HOME_WINS', 'DRAW', 'AWAY_WINS']].values.argmax(axis=1)  # Convert to single-label

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model # Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))

NameError: name 'class_weights' is not defined

In [89]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, rf_preds)
print("Accuracy:", rf_accuracy)

Accuracy: 0.48598130841121495


In [None]:
#accuracy is high we have to amelorate it

In [90]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter Tuning, Example for Random Forest
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
}
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5)
grid_search_rf.fit(X_train, y_train)
print(f"Best Random Forest Params: {grid_search_rf.best_params_}")

Best Random Forest Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
# Cross-validation

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(GradientBoostingClassifier(), X, y, cv=5)
print(f"Cross-validation Accuracy: {cv_scores.mean()}")