In [100]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from google.colab import files

import joblib

In [101]:
df = pd.read_csv("Match_winner.csv")
df.head()

Unnamed: 0,Season,MatchDate,HomeTeam,AwayTeam,FullTimeHomeGoals,FullTimeAwayGoals,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,...,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeYellowCards,AwayYellowCards,HomeRedCards,AwayRedCards
0,2000/01,19-08-2000,Charlton,Man City,4,0,H,2,0,H,...,14,4,6,6,13,12,1,2,0,0
1,2000/01,19-08-2000,Chelsea,West Ham,4,2,H,1,0,H,...,10,5,7,7,19,14,1,2,0,0
2,2000/01,19-08-2000,Coventry,Middlesbrough,1,3,A,1,1,D,...,3,9,8,4,15,21,5,3,1,0
3,2000/01,19-08-2000,Derby,Southampton,2,2,D,1,2,A,...,4,6,5,8,11,13,1,1,0,0
4,2000/01,19-08-2000,Leeds,Everton,2,0,H,2,0,H,...,8,6,6,4,21,20,1,3,0,0


In [102]:
df.columns

Index(['Season', 'MatchDate', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals',
       'FullTimeAwayGoals', 'FullTimeResult', 'HalfTimeHomeGoals',
       'HalfTimeAwayGoals', 'HalfTimeResult', 'HomeShots', 'AwayShots',
       'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners',
       'HomeFouls', 'AwayFouls', 'HomeYellowCards', 'AwayYellowCards',
       'HomeRedCards', 'AwayRedCards'],
      dtype='object')

In [103]:
df.describe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9380 entries, 0 to 9379
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Season             9380 non-null   object
 1   MatchDate          9380 non-null   object
 2   HomeTeam           9380 non-null   object
 3   AwayTeam           9380 non-null   object
 4   FullTimeHomeGoals  9380 non-null   int64 
 5   FullTimeAwayGoals  9380 non-null   int64 
 6   FullTimeResult     9380 non-null   object
 7   HalfTimeHomeGoals  9380 non-null   int64 
 8   HalfTimeAwayGoals  9380 non-null   int64 
 9   HalfTimeResult     9380 non-null   object
 10  HomeShots          9380 non-null   int64 
 11  AwayShots          9380 non-null   int64 
 12  HomeShotsOnTarget  9380 non-null   int64 
 13  AwayShotsOnTarget  9380 non-null   int64 
 14  HomeCorners        9380 non-null   int64 
 15  AwayCorners        9380 non-null   int64 
 16  HomeFouls          9380 non-null   int64 


In [104]:
# Dropping unnecessary columns that are not required for the analysis
df=df.drop(['Season', 'MatchDate','FullTimeHomeGoals','FullTimeAwayGoals','HalfTimeResult'],axis=1)

In [105]:
# Print the remaining column names
print(df.columns)

Index(['HomeTeam', 'AwayTeam', 'FullTimeResult', 'HalfTimeHomeGoals',
       'HalfTimeAwayGoals', 'HomeShots', 'AwayShots', 'HomeShotsOnTarget',
       'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners', 'HomeFouls',
       'AwayFouls', 'HomeYellowCards', 'AwayYellowCards', 'HomeRedCards',
       'AwayRedCards'],
      dtype='object')


In [106]:
df.head()

Unnamed: 0,HomeTeam,AwayTeam,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HomeShots,AwayShots,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeYellowCards,AwayYellowCards,HomeRedCards,AwayRedCards
0,Charlton,Man City,H,2,0,17,8,14,4,6,6,13,12,1,2,0,0
1,Chelsea,West Ham,H,1,0,17,12,10,5,7,7,19,14,1,2,0,0
2,Coventry,Middlesbrough,A,1,1,6,16,3,9,8,4,15,21,5,3,1,0
3,Derby,Southampton,D,1,2,6,13,4,6,5,8,11,13,1,1,0,0
4,Leeds,Everton,H,2,0,17,12,8,6,6,4,21,20,1,3,0,0


In [107]:
#Check again for missing values and duplicate values
print(df.isnull().sum())
print(df.duplicated().sum())

HomeTeam             0
AwayTeam             0
FullTimeResult       0
HalfTimeHomeGoals    0
HalfTimeAwayGoals    0
HomeShots            0
AwayShots            0
HomeShotsOnTarget    0
AwayShotsOnTarget    0
HomeCorners          0
AwayCorners          0
HomeFouls            0
AwayFouls            0
HomeYellowCards      0
AwayYellowCards      0
HomeRedCards         0
AwayRedCards         0
dtype: int64
0


In [108]:
df = df.drop_duplicates()

In [109]:
print(df.duplicated().sum())

0


In [110]:
#One hot encoding
df= pd.get_dummies(df, columns=['HomeTeam', 'AwayTeam',], prefix=['Home', 'Away'])

In [111]:
df.head()

Unnamed: 0,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HomeShots,AwayShots,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,...,Away_Southampton,Away_Stoke,Away_Sunderland,Away_Swansea,Away_Tottenham,Away_Watford,Away_West Brom,Away_West Ham,Away_Wigan,Away_Wolves
0,H,2,0,17,8,14,4,6,6,13,...,False,False,False,False,False,False,False,False,False,False
1,H,1,0,17,12,10,5,7,7,19,...,False,False,False,False,False,False,False,True,False,False
2,A,1,1,6,16,3,9,8,4,15,...,False,False,False,False,False,False,False,False,False,False
3,D,1,2,6,13,4,6,5,8,11,...,True,False,False,False,False,False,False,False,False,False
4,H,2,0,17,12,8,6,6,4,21,...,False,False,False,False,False,False,False,False,False,False


In [112]:
#Label_Encoding
le = LabelEncoder()
for col in ['FullTimeResult']:
    df[col] = le.fit_transform(df[col])


In [113]:
#display dataset after applying label encoding
df.head()

Unnamed: 0,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HomeShots,AwayShots,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,...,Away_Southampton,Away_Stoke,Away_Sunderland,Away_Swansea,Away_Tottenham,Away_Watford,Away_West Brom,Away_West Ham,Away_Wigan,Away_Wolves
0,2,2,0,17,8,14,4,6,6,13,...,False,False,False,False,False,False,False,False,False,False
1,2,1,0,17,12,10,5,7,7,19,...,False,False,False,False,False,False,False,True,False,False
2,0,1,1,6,16,3,9,8,4,15,...,False,False,False,False,False,False,False,False,False,False
3,1,1,2,6,13,4,6,5,8,11,...,True,False,False,False,False,False,False,False,False,False
4,2,2,0,17,12,8,6,6,4,21,...,False,False,False,False,False,False,False,False,False,False


###-->I considered both encoding because with label encoding the model cannont predict based history between teams but now by applying onehot encoding on home team and away team the model can predict based head to head results.**

In [114]:
#Checking correlation
corr = df.corr(numeric_only=True)
print(corr)

                   FullTimeResult  HalfTimeHomeGoals  HalfTimeAwayGoals  \
FullTimeResult           1.000000           0.432693          -0.422488   
HalfTimeHomeGoals        0.432693           1.000000          -0.049855   
HalfTimeAwayGoals       -0.422488          -0.049855           1.000000   
HomeShots                0.224808           0.120792          -0.047903   
AwayShots               -0.257528          -0.036827           0.174038   
...                           ...                ...                ...   
Away_Watford             0.035214           0.018854          -0.015936   
Away_West Brom           0.039997           0.025114          -0.040995   
Away_West Ham            0.024966           0.015304          -0.010427   
Away_Wigan               0.021978          -0.008714          -0.018203   
Away_Wolves              0.020800           0.019353          -0.022620   

                   HomeShots  AwayShots  HomeShotsOnTarget  AwayShotsOnTarget  \
FullTimeResult    

In [115]:
target_corr = corr["FullTimeResult"].sort_values(ascending=False)
print("Correlation with FullTimeResult:\n", target_corr)

Correlation with FullTimeResult:
 FullTimeResult       1.000000
HalfTimeHomeGoals    0.432693
HomeShotsOnTarget    0.323645
HomeShots            0.224808
Home_Liverpool       0.103938
                       ...   
HomeYellowCards     -0.118126
HomeRedCards        -0.130704
AwayShots           -0.257528
AwayShotsOnTarget   -0.321245
HalfTimeAwayGoals   -0.422488
Name: FullTimeResult, Length: 107, dtype: float64


In [116]:
df

Unnamed: 0,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HomeShots,AwayShots,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,...,Away_Southampton,Away_Stoke,Away_Sunderland,Away_Swansea,Away_Tottenham,Away_Watford,Away_West Brom,Away_West Ham,Away_Wigan,Away_Wolves
0,2,2,0,17,8,14,4,6,6,13,...,False,False,False,False,False,False,False,False,False,False
1,2,1,0,17,12,10,5,7,7,19,...,False,False,False,False,False,False,False,True,False,False
2,0,1,1,6,16,3,9,8,4,15,...,False,False,False,False,False,False,False,False,False,False
3,1,1,2,6,13,4,6,5,8,11,...,True,False,False,False,False,False,False,False,False,False
4,2,2,0,17,12,8,6,6,4,21,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9375,2,2,1,12,14,6,5,7,4,8,...,False,False,False,False,False,False,False,False,False,False
9376,1,1,0,5,13,2,5,1,4,15,...,False,False,False,False,False,False,False,False,False,False
9377,1,1,1,11,7,2,2,1,3,18,...,False,False,False,False,True,False,False,False,False,False
9378,2,1,0,17,11,7,2,3,6,10,...,False,False,False,False,False,False,False,False,False,False


In [117]:
df.columns

Index(['FullTimeResult', 'HalfTimeHomeGoals', 'HalfTimeAwayGoals', 'HomeShots',
       'AwayShots', 'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners',
       'AwayCorners', 'HomeFouls',
       ...
       'Away_Southampton', 'Away_Stoke', 'Away_Sunderland', 'Away_Swansea',
       'Away_Tottenham', 'Away_Watford', 'Away_West Brom', 'Away_West Ham',
       'Away_Wigan', 'Away_Wolves'],
      dtype='object', length=107)

In [118]:
y = df['FullTimeResult']
X = df.drop('FullTimeResult', axis=1).astype(int)

In [119]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [120]:
# Compute class counts
from collections import Counter

counter = Counter(y_train)
total = sum(counter.values())
# Assign higher weight to minority class
weights = {
    0: total / (3 * counter[0]),  # Home
    1: total / (2.5 * counter[1]),  # Away
    2: total / (7 * counter[2])   # Draw
}

print("Class Weights:", weights)

log_reg = LogisticRegression(
    max_iter=500,
    penalty='l1',
    solver='liblinear',
    C=0.1,
    class_weight=weights
)

log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)


Class Weights: {0: 1.1221773590548827, 1: 1.6321914083741165, 2: 0.3119906868451688}


In [121]:
print(pd.Series(y_pred).value_counts())


2    634
0    628
1    614
Name: count, dtype: int64


In [122]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.6321961620469083

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.75      0.70       539
           1       0.40      0.52      0.45       474
           2       0.84      0.62      0.71       863

    accuracy                           0.63      1876
   macro avg       0.63      0.63      0.62      1876
weighted avg       0.67      0.63      0.64      1876


Confusion Matrix:
 [[406 114  19]
 [145 247  82]
 [ 77 253 533]]


In [123]:
joblib.dump(log_reg, "logistic_regression_model.pkl")
files.download("logistic_regression_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>