In [45]:
import pandas as pd
import numpy as np
import glob
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV




Data is present in Epl/data/ and seperate features required are taken to reduce the difficulties in cleaning..empty coloumns if any are filled with NaN.

In [6]:
# Load only Premier League CSVs (E0)
files = glob.glob("C:/Epl/data/E0*.csv")
print(files)
features=["Date","HomeTeam","AwayTeam","HTHG","HTAG","HS","AS","HST","AST","HF","AF","HC","AC","HY","AY","HR","AR"]
target ="FTR"
dfs = []
for f in files:
    df = pd.read_csv(f, encoding='cp1252', on_bad_lines='skip')
    for col in features:
        if col not in df.columns:
         df[col] = pd.NA
    df = df[features + [target]]
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")
print("Shape:", data.shape)
print(data.head())


['C:/Epl/data\\E02000.csv', 'C:/Epl/data\\E02001.csv', 'C:/Epl/data\\E02002.csv', 'C:/Epl/data\\E02003.csv', 'C:/Epl/data\\E02004.csv', 'C:/Epl/data\\E02005.csv', 'C:/Epl/data\\E02006.csv', 'C:/Epl/data\\E02007.csv', 'C:/Epl/data\\E02008.csv', 'C:/Epl/data\\E02009.csv', 'C:/Epl/data\\E02010.csv', 'C:/Epl/data\\E02011.csv', 'C:/Epl/data\\E02012.csv', 'C:/Epl/data\\E02013.csv', 'C:/Epl/data\\E02014.csv', 'C:/Epl/data\\E02015.csv', 'C:/Epl/data\\E02016.csv', 'C:/Epl/data\\E02017.csv', 'C:/Epl/data\\E02018.csv', 'C:/Epl/data\\E02019.csv', 'C:/Epl/data\\E02020.csv', 'C:/Epl/data\\E02021.csv', 'C:/Epl/data\\E02022.csv', 'C:/Epl/data\\E02023.csv', 'C:/Epl/data\\E02024.csv']
Shape: (9411, 18)
        Date  HomeTeam       AwayTeam  HTHG  HTAG    HS    AS   HST  AST  \
0 2000-08-19  Charlton       Man City   2.0   0.0  17.0   8.0  14.0  4.0   
1 2000-08-19   Chelsea       West Ham   1.0   0.0  17.0  12.0  10.0  5.0   
2 2000-08-19  Coventry  Middlesbrough   1.0   1.0   6.0  16.0   3.0  9.0   
3 

  data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")


Now that data is ready we have to clean it by checking for duplicates and null values.

In [7]:
print(data.isnull().sum())
print((data.isnull().mean()*100).round(2))

Date        1
HomeTeam    1
AwayTeam    1
HTHG        1
HTAG        1
HS          1
AS          1
HST         1
AST         1
HF          1
AF          1
HC          1
AC          1
HY          1
AY          1
HR          1
AR          1
FTR         1
dtype: int64
Date        0.01
HomeTeam    0.01
AwayTeam    0.01
HTHG        0.01
HTAG        0.01
HS          0.01
AS          0.01
HST         0.01
AST         0.01
HF          0.01
AF          0.01
HC          0.01
AC          0.01
HY          0.01
AY          0.01
HR          0.01
AR          0.01
FTR         0.01
dtype: float64


This shows that the dataset has only a very few negligable output there is 1 missing value and that missing value exists in all features so we could drop that row.

In [8]:
data = data.dropna()
print("After dropping missing:",data.shape)
print("Remaining NaN:",data.isnull().sum().sum())

After dropping missing: (9410, 18)
Remaining NaN: 0


In [9]:
duplicates = data.duplicated().sum()
print("Number of duplicated rows:",duplicates)
data.to_csv("match_winner_cleaned.csv",index = False)

Number of duplicated rows: 0


In [10]:
categorical_col = df.select_dtypes(include ='object').columns
encoders = {}
for c in categorical_col:
    le = LabelEncoder()
    df[c]=le.fit_transform(df[c])
    encoders[c]=le
print(df.head())

   Date  HomeTeam  AwayTeam  HTHG  HTAG  HS  AS  HST  AST  HF  AF  HC  AC  HY  \
0    56        13         8     0     0  14  10    5    2  12  10   7   8   2   
1    58         9        11     0     0   7  18    2    5   9  18   2  10   3   
2    58         0        19     1     0  18   9    6    3  17  14   8   2   2   
3    58         7         4     0     1   9  10    1    5   8   8   1   5   1   
4    58        14        16     1     0   3  19    1    4  15  16   3  12   2   

   AY  HR  AR  FTR  
0   3   0   0    2  
1   1   0   0    0  
2   2   0   0    2  
3   1   1   0    0  
4   4   1   0    2  


I have done label encoding for all categorical coloumns and the encoder function is also saved in encoders[] to decode for further use

In [11]:
#splitting data into label and feature input

y = df['FTR']

X = df.drop('FTR', axis=1)


In [43]:

# Split data: 80% training, 20% testing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Stratified split to preserve class ratios
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_split, y_train_split)
X_train_res = X_train_res.astype(np.float32)
y_train_res = y_train_res.astype(np.int64)
print(X_train_res.shape)
print(y_train_res.shape)




print("Class distribution after SMOTE:", dict(zip(*np.unique(y_train_res, return_counts=True))))


(372, 17)
(372,)
Class distribution after SMOTE: {np.int64(0): np.int64(124), np.int64(1): np.int64(124), np.int64(2): np.int64(124)}


Random state fixed as 42 for accuracy and hyperparameter tuning and shufling is done 

In [46]:
weights = compute_sample_weight(class_weight='balanced', y=y_train_res)
model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    objective='multi:softprob',  # for 3-class classification
    num_class=3,
    eval_metric='mlogloss',
    random_state = 42
)
param_grid = {
    'n_estimators': [500, 700, 1000],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',   # macro F1 helps balance all classes
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_res, y_train_res)
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)
#model.fit(X_train, y_train,sample_weight=weights)
#y_pred = model.predict(X_test)

AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [40]:
from sklearn.metrics import f1_score

# Get predicted probabilities
y_proba = best_model.predict_proba(X_test_split)

# Custom thresholds per class
thresholds = [0.5, 0.5, 0.5]  # start with 0.5 for all classes

def predict_with_thresholds(probs, thresholds):
    preds = []
    for p in probs:
        # Assign class if probability > threshold, else take max
        assigned = [i for i, prob in enumerate(p) if prob >= thresholds[i]]
        if assigned:
            preds.append(assigned[0])
        else:
            preds.append(np.argmax(p))
    return np.array(preds)

y_pred = predict_with_thresholds(y_proba, thresholds)

print("F1 score macro:", f1_score(y_test_split, y_pred, average='macro'))

NameError: name 'best_model' is not defined

In [35]:


# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Full classification report
print(classification_report(y_test, y_pred))


Accuracy: 0.618421052631579
              precision    recall  f1-score   support

           0       0.77      0.65      0.71        26
           1       0.38      0.32      0.34        19
           2       0.63      0.77      0.70        31

    accuracy                           0.62        76
   macro avg       0.59      0.58      0.58        76
weighted avg       0.62      0.62      0.61        76

