In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [182]:
# Load the match data from CSV and preview the first five rows.
DATA_FILE = "Match_winner.csv"
df = pd.read_csv(DATA_FILE)
df.head(n=5)

Unnamed: 0,Season,MatchDate,HomeTeam,AwayTeam,FullTimeHomeGoals,FullTimeAwayGoals,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,...,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeYellowCards,AwayYellowCards,HomeRedCards,AwayRedCards
0,2000/01,2000-08-19,Charlton,Man City,4,0,H,2,0,H,...,14,4,6,6,13,12,1,2,0,0
1,2000/01,2000-08-19,Chelsea,West Ham,4,2,H,1,0,H,...,10,5,7,7,19,14,1,2,0,0
2,2000/01,2000-08-19,Coventry,Middlesbrough,1,3,A,1,1,D,...,3,9,8,4,15,21,5,3,1,0
3,2000/01,2000-08-19,Derby,Southampton,2,2,D,1,2,A,...,4,6,5,8,11,13,1,1,0,0
4,2000/01,2000-08-19,Leeds,Everton,2,0,H,2,0,H,...,8,6,6,4,21,20,1,3,0,0


In [183]:
# List the column labels of the loaded frame.
df.columns

Index(['Season', 'MatchDate', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals',
       'FullTimeAwayGoals', 'FullTimeResult', 'HalfTimeHomeGoals',
       'HalfTimeAwayGoals', 'HalfTimeResult', 'HomeShots', 'AwayShots',
       'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners',
       'HomeFouls', 'AwayFouls', 'HomeYellowCards', 'AwayYellowCards',
       'HomeRedCards', 'AwayRedCards'],
      dtype='object')

In [184]:
# Structural overview: dtypes and non-null counts per column (info() prints directly).
df.info()
# Summary statistics for the numeric columns. The previous `df.describe` had no
# parentheses, so it only referenced the method and never computed or showed
# anything; it is now called and placed last so the notebook renders the table.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9380 entries, 0 to 9379
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Season             9380 non-null   object
 1   MatchDate          9380 non-null   object
 2   HomeTeam           9380 non-null   object
 3   AwayTeam           9380 non-null   object
 4   FullTimeHomeGoals  9380 non-null   int64 
 5   FullTimeAwayGoals  9380 non-null   int64 
 6   FullTimeResult     9380 non-null   object
 7   HalfTimeHomeGoals  9380 non-null   int64 
 8   HalfTimeAwayGoals  9380 non-null   int64 
 9   HalfTimeResult     9380 non-null   object
 10  HomeShots          9380 non-null   int64 
 11  AwayShots          9380 non-null   int64 
 12  HomeShotsOnTarget  9380 non-null   int64 
 13  AwayShotsOnTarget  9380 non-null   int64 
 14  HomeCorners        9380 non-null   int64 
 15  AwayCorners        9380 non-null   int64 
 16  HomeFouls          9380 non-null   int64 


In [185]:
# Remove the columns that the analysis does not use. What remains are the two
# team columns, the full-time result and the three half-time columns.
columns_to_drop = [
    'Season', 'MatchDate',
    'FullTimeHomeGoals', 'FullTimeAwayGoals',
    'HomeShots', 'AwayShots',
    'HomeShotsOnTarget', 'AwayShotsOnTarget',
    'HomeCorners', 'AwayCorners',
    'HomeFouls', 'AwayFouls',
    'HomeYellowCards', 'AwayYellowCards',
    'HomeRedCards', 'AwayRedCards',
]
df = df.drop(columns=columns_to_drop)

In [186]:
# Show the column labels that are left in the frame.
remaining_columns = df.columns
print(remaining_columns)

Index(['HomeTeam', 'AwayTeam', 'FullTimeResult', 'HalfTimeHomeGoals',
       'HalfTimeAwayGoals', 'HalfTimeResult'],
      dtype='object')


In [187]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,HomeTeam,AwayTeam,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult
0,Charlton,Man City,H,2,0,H
1,Chelsea,West Ham,H,1,0,H
2,Coventry,Middlesbrough,A,1,1,D
3,Derby,Southampton,D,1,2,A
4,Leeds,Everton,H,2,0,H


In [188]:
# Data-quality check: missing values per column, then the count of fully
# duplicated rows.
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()
print(missing_per_column)
print(duplicate_row_count)

HomeTeam             0
AwayTeam             0
FullTimeResult       0
HalfTimeHomeGoals    0
HalfTimeAwayGoals    0
HalfTimeResult       0
dtype: int64
2704


In [189]:
# Remove rows that are exact duplicates of an earlier row.
# NOTE(review): Season and MatchDate are no longer in df at this point, so rows
# flagged as duplicates may be different matches that merely share teams,
# half-time score and results rather than repeated records -- confirm that
# discarding them is intended.
df = df.drop_duplicates()

In [190]:
# Re-count fully duplicated rows after de-duplication.
duplicates_left = df.duplicated().sum()
print(duplicates_left)

0


In [191]:
# Label-encode the two result columns as integer codes.
# The same encoder object is re-fitted for each column, so after the loop `le`
# holds the classes of the last column processed.
le = LabelEncoder()
result_columns = ['FullTimeResult', 'HalfTimeResult']
for col in result_columns:
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values


In [192]:
# Display the first rows after label encoding; the two result columns now hold
# integer codes instead of letters.
df.head()

Unnamed: 0,HomeTeam,AwayTeam,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult
0,Charlton,Man City,2,2,0,2
1,Chelsea,West Ham,2,1,0,2
2,Coventry,Middlesbrough,0,1,1,1
3,Derby,Southampton,1,1,2,0
4,Leeds,Everton,2,2,0,2


In [193]:
# One-hot encode the team columns. Team names are nominal categories with no
# hierarchy or natural order, so one-hot encoding is preferred over integer labels.
team_prefixes = {'HomeTeam': 'Home', 'AwayTeam': 'Away'}
df = pd.get_dummies(df, columns=list(team_prefixes), prefix=team_prefixes)

In [194]:
# Preview the first rows after one-hot encoding the team columns.
df.head()

Unnamed: 0,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,Home_Arsenal,Home_Aston Villa,Home_Birmingham,Home_Blackburn,Home_Blackpool,Home_Bolton,...,Away_Southampton,Away_Stoke,Away_Sunderland,Away_Swansea,Away_Tottenham,Away_Watford,Away_West Brom,Away_West Ham,Away_Wigan,Away_Wolves
0,2,2,0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,1,0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,0,1,1,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,1,2,0,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,2,2,0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [195]:
# Display the encoded dataset (the notebook shows a truncated view of the rows
# and columns rather than every value).
df

Unnamed: 0,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,Home_Arsenal,Home_Aston Villa,Home_Birmingham,Home_Blackburn,Home_Blackpool,Home_Bolton,...,Away_Southampton,Away_Stoke,Away_Sunderland,Away_Swansea,Away_Tottenham,Away_Watford,Away_West Brom,Away_West Ham,Away_Wigan,Away_Wolves
0,2,2,0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,1,0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,0,1,1,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,1,2,0,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,2,2,0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9367,1,1,0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9369,0,0,1,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9372,1,2,1,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9374,0,1,0,2,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [196]:
# Pairwise Pearson correlation between the numeric columns of the encoded frame.
corr = df.corr(method='pearson', numeric_only=True)
print(corr)

                   FullTimeResult  HalfTimeHomeGoals  HalfTimeAwayGoals  \
FullTimeResult           1.000000           0.419178          -0.387178   
HalfTimeHomeGoals        0.419178           1.000000          -0.046392   
HalfTimeAwayGoals       -0.387178          -0.046392           1.000000   
HalfTimeResult           0.545820           0.664657          -0.634746   
Home_Arsenal             0.090480           0.068480          -0.023007   
...                           ...                ...                ...   
Away_Watford             0.031311           0.011519          -0.020107   
Away_West Brom           0.031611           0.016027          -0.041715   
Away_West Ham            0.001871           0.014969           0.011039   
Away_Wigan               0.021204          -0.013037          -0.022229   
Away_Wolves              0.014129           0.013936          -0.020386   

                   HalfTimeResult  Home_Arsenal  Home_Aston Villa  \
FullTimeResult           0.545

In [197]:
# Rank every column by its correlation with the target, largest value first.
target_column_corr = corr.loc[:, "FullTimeResult"]
target_corr = target_column_corr.sort_values(ascending=False)
print("Correlation with FullTimeResult:\n", target_corr)

Correlation with FullTimeResult:
 FullTimeResult       1.000000
HalfTimeResult       0.545820
HalfTimeHomeGoals    0.419178
Home_Arsenal         0.090480
Home_Liverpool       0.089593
                       ...   
Away_Liverpool      -0.070176
Away_Chelsea        -0.080624
Away_Man United     -0.083096
Away_Arsenal        -0.090965
HalfTimeAwayGoals   -0.387178
Name: FullTimeResult, Length: 96, dtype: float64


In [198]:
# List the column labels of the encoded frame (target, half-time columns and
# the Home_/Away_ dummy columns).
df.columns

Index(['FullTimeResult', 'HalfTimeHomeGoals', 'HalfTimeAwayGoals',
       'HalfTimeResult', 'Home_Arsenal', 'Home_Aston Villa', 'Home_Birmingham',
       'Home_Blackburn', 'Home_Blackpool', 'Home_Bolton', 'Home_Bournemouth',
       'Home_Bradford', 'Home_Brentford', 'Home_Brighton', 'Home_Burnley',
       'Home_Cardiff', 'Home_Charlton', 'Home_Chelsea', 'Home_Coventry',
       'Home_Crystal Palace', 'Home_Derby', 'Home_Everton', 'Home_Fulham',
       'Home_Huddersfield', 'Home_Hull', 'Home_Ipswich', 'Home_Leeds',
       'Home_Leicester', 'Home_Liverpool', 'Home_Luton', 'Home_Man City',
       'Home_Man United', 'Home_Middlesbrough', 'Home_Newcastle',
       'Home_Norwich', 'Home_Nott'm Forest', 'Home_Portsmouth', 'Home_QPR',
       'Home_Reading', 'Home_Sheffield United', 'Home_Southampton',
       'Home_Stoke', 'Home_Sunderland', 'Home_Swansea', 'Home_Tottenham',
       'Home_Watford', 'Home_West Brom', 'Home_West Ham', 'Home_Wigan',
       'Home_Wolves', 'Away_Arsenal', 'Away_Aston V

In [199]:
# Show the frame without the three half-time columns.
# NOTE(review): the result of drop() is only displayed, never assigned, so df
# itself still contains HalfTimeHomeGoals, HalfTimeAwayGoals and HalfTimeResult.
# If these columns were meant to be excluded from the model features, this must
# be `df = df.drop(...)` -- confirm the intent.
df.drop(['HalfTimeHomeGoals', 'HalfTimeAwayGoals','HalfTimeResult'],axis=1)

Unnamed: 0,FullTimeResult,Home_Arsenal,Home_Aston Villa,Home_Birmingham,Home_Blackburn,Home_Blackpool,Home_Bolton,Home_Bournemouth,Home_Bradford,Home_Brentford,...,Away_Southampton,Away_Stoke,Away_Sunderland,Away_Swansea,Away_Tottenham,Away_Watford,Away_West Brom,Away_West Ham,Away_Wigan,Away_Wolves
0,2,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,2,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9367,1,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
9369,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9372,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9374,0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [200]:
# Target: the encoded full-time result. Features: every other column of df,
# cast to int so the boolean dummy columns become 0/1.
target_name = 'FullTimeResult'
x = df.drop(columns=target_name).astype(int)
y = df[target_name]

In [201]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both splits; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [202]:
# Logistic-regression classifier for the three-class full-time result.
# `multi_class='multinomial'` was removed from the call: the parameter is
# deprecated in recent scikit-learn releases (it emits a FutureWarning and is
# scheduled for removal), and with the lbfgs solver a target with three classes
# is already fitted as a multinomial model, so the trained model is unchanged.
# max_iter is raised to 1000 to give lbfgs more iterations to converge.
Winner_Classification = LogisticRegression(solver='lbfgs', max_iter=1000)
Winner_Classification.fit(x_train, y_train)



In [203]:
# Evaluate on the held-out split: confusion matrix first, then the per-class
# precision / recall / F1 report.
y_pred = Winner_Classification.predict(x_test)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(matrix)
print(report)

[[259  48 102]
 [116  52 179]
 [ 60  55 465]]
              precision    recall  f1-score   support

           0       0.60      0.63      0.61       409
           1       0.34      0.15      0.21       347
           2       0.62      0.80      0.70       580

    accuracy                           0.58      1336
   macro avg       0.52      0.53      0.51      1336
weighted avg       0.54      0.58      0.55      1336

