# Machine Learning Model to Predict Outcomes of LaLiga Soccer Matches

In [20]:
import pandas as pd
import numpy as np

## Load in LaLiga data from 2023. 
Resource: fbref.com

In [22]:
fileName="LaLiga-Data-22to23.xlsx"

df = pd.read_excel(fileName)

In [3]:
df = df.dropna(how='all')

## Change MatchResult from Text Field to Int
These are the three categories our model will place our data points into

In [5]:
matchResults = []
resultsDict = {"homeWin":2, "draw":1,'awayWin':0}

for i in range(len(df['Score'])):
    
    scoreString = df.iloc[i,6]
    
    if type(scoreString) == str:
        if int(scoreString[0]) > int(scoreString[2]):
            matchResults.append(resultsDict["homeWin"])
        elif int(scoreString[0]<scoreString[2]):
            matchResults.append(resultsDict["awayWin"])
        else:
            matchResults.append(resultsDict["draw"])
    else:
        matchResults.append(None)
    


df["matchResults"]=matchResults

## Convert Team Names into Position at End of Year
If we are trying to predict the outcome of a current game we will use the predicted final standings

In [7]:
homePos = []
awayPos = []

teamPosDict = {"Barcelona":1, "Real Madrid": 2, "Atlético Madrid": 3, "Real Sociedad":4, "Villarreal":5,
               "Betis":6, "Osasuna":7,"Athletic Club":8,"Mallorca":9, "Girona":10, "Rayo Vallecano":11,
              "Sevilla":12, "Celta Vigo":13, "Cádiz":14, "Getafe":15, "Valencia":16, "Almería":17,
              "Valladolid":18, "Espanyol":19,"Elche":20 }

for i in range(len(df['Home'])):
    if df.iloc[i,4] in teamPosDict:
        homePos.append(teamPosDict[df.iloc[i,4]])
    else:
        homePos.append(None)
    
    if df.iloc[i,8] in teamPosDict:
        awayPos.append(teamPosDict[df.iloc[i,8]])
    else:
        awayPos.append(None)
    
    

df["Home Team Position"] = homePos
df["Away Team Position"] = awayPos

## Define Two Functions to Find Average Goals For and Against Each Team
This will allow us to fill null values and early games with mean data

In [23]:
def findMeanGoalsFor(team):
    totalGoalsFor = 0
 
    totalGames = 0
    for i in range(len(df["Home"])):
        if df.iloc[i,4] == team:
            totalGames += 1
            totalGoalsFor += int(df.iloc[i,6][0])
        elif df.iloc[i,8] == team:
            totalGames += 1
            totalGoalsFor += int(df.iloc[i,6][2])
    if totalGames > 0:
        return totalGoalsFor/totalGames
    else:
        return "ISSUE"

def findMeanGoalsAgainst(team):
    totalGoalsAgainst = 0
    
    totalGames = 0
    for i in range(len(df["Home"])):
        if df.iloc[i,4] == team:
            totalGames += 1
            totalGoalsAgainst += int(df.iloc[i,6][2])
        elif df.iloc[i,8] == team:
            totalGames += 1
            totalGoalsAgainst += int(df.iloc[i,6][0])
    if totalGames > 0:
        return totalGoalsAgainst/totalGames
    else:
        return "ISSUE"

    

## Add in four metrics that reflect how many goals a team has scored and allowed in last three games
Four because we have Home and Away teams

In [24]:
goalsForLast3Home = []
goalsAgainstLast3Home = []
goalsForLast3Away = []
goalsAgainstLast3Away = []

for i in range(len(df["Home"])):
    if df.iloc[i,0]<= 3.0:
        goalsForLast3Home.append(0)
        goalsAgainstLast3Home.append(0)
        goalsForLast3Away.append(0)
        goalsAgainstLast3Away.append(0)
    else:
        homeTeam = df.iloc[i,4]
        awayTeam = df.iloc[i,8]
        
        tempHomeForLast3 = []
        tempAwayForLast3 = []
        
        tempHomeAgainstLast3 = []
        tempAwayAgainstLast3 = []
        
        j = i
        for num in range(len(df["Home"])):
            j -= 1
            if (df.iloc[j,4] == homeTeam and len(tempHomeForLast3) < 3):
                tempHomeForLast3.append(int(df.iloc[j,6][0]))
                tempHomeAgainstLast3.append(int(df.iloc[j,6][2]))
            elif (df.iloc[j,8] == homeTeam and len(tempHomeForLast3) < 3):
                tempHomeForLast3.append(int(df.iloc[j,6][2]))
                tempHomeAgainstLast3.append(int(df.iloc[j,6][0]))
            elif (df.iloc[j,4] == awayTeam and len(tempAwayForLast3)<3):
                tempAwayForLast3.append(int(df.iloc[j,6][0]))
                tempAwayAgainstLast3.append(int(df.iloc[j,6][2]))
            elif (df.iloc[j,4] == awayTeam and len(tempAwayForLast3)<3):
                tempAwayForLast3.append(int(df.iloc[j,6][2]))
                tempAwayAgainstLast3.append(int(df.iloc[j,6][0]))
            elif len(tempHomeForLast3) ==3 and len(tempAwayForLast3) ==3:
                break
        goalsForLast3Home.append(sum(tempHomeForLast3))
        goalsForLast3Away.append(sum(tempAwayForLast3))
        
        goalsAgainstLast3Home.append(sum(tempHomeAgainstLast3))
        goalsAgainstLast3Away.append(sum(tempAwayAgainstLast3))

df["Goals FOR L3 (Home)"] = goalsForLast3Home
df["Goals FOR L3 (Away)"] = goalsForLast3Away

df["Goals AGAINST L3 (Home)"] = goalsAgainstLast3Home
df["Goals AGAINST L3 (Away)"] = goalsAgainstLast3Away
        

## Replace null values with mean goals

In [10]:
for i in range(len(df["Home"])):
    if (df.iloc[i,17] + df.iloc[i,18] + df.iloc[i,19]+df.iloc[i,20]) == 0 and df.iloc[i,4] != "nan":
        df.iloc[i,17] = findMeanGoalsFor(df.iloc[i,4])
        df.iloc[i,18] = findMeanGoalsFor(df.iloc[i,8])
        df.iloc[i,19] = findMeanGoalsAgainst(df.iloc[i,4])
        df.iloc[i,20] = findMeanGoalsAgainst(df.iloc[i,8])

## Replace each day with number code

In [25]:
dayDict = {"Fri":5,"Thu":4,"Wed":3,"Tue":2,"Mon":1,"Sat":6,"Sun":7}

for i in range(len(df["Day"])):
    if df.iloc[i,1] in dayDict:
        df.iloc[i,1] = dayDict[df.iloc[i,1]]

## Remove any data containing null values

In [12]:
df = df.drop(["Match Report", "Notes","Venue","Referee","Date","Time"],axis=1)

df = df.dropna(how="any")

df = df.reset_index(drop=True)

In [13]:
pd.set_option("display.max_columns",None)
df.head()

Unnamed: 0,WK,Day,Home,xG,Score,xG.1,Away,Attendance,matchResults,Home Team Position,Away Team Position,Goals FOR L3 (Home),Goals FOR L3 (Away),Goals AGAINST L3 (Home),Goals AGAINST L3 (Away)
0,1.0,5,Osasuna,1.5,2–1,0.9,Sevilla,18536.0,2.0,7.0,12.0,0.973684,1.236842,1.105263,1.421053
1,1.0,6,Celta Vigo,0.4,2–2,1.1,Espanyol,13859.0,1.0,13.0,19.0,1.131579,1.368421,1.394737,1.815789
2,1.0,6,Valladolid,1.0,0–3,1.5,Villarreal,17543.0,0.0,18.0,5.0,0.868421,1.552632,1.657895,1.052632
3,1.0,6,Barcelona,1.9,0–0,0.5,Rayo Vallecano,81104.0,1.0,1.0,11.0,1.842105,1.184211,0.526316,1.394737
4,1.0,7,Cádiz,0.2,0–1,1.5,Real Sociedad,16570.0,0.0,14.0,4.0,0.789474,1.342105,1.394737,0.921053


## Instead of score, we will use home goals and away goals as metrics

In [14]:
homeGoals = []
awayGoals = []

for i in range(len(df["Score"])):
    homeGoals.append(int(df.iloc[i,4][0]))
    awayGoals.append(int(df.iloc[i,4][2]))


df["Home Goals"] = homeGoals
df["Away Goals"] = awayGoals

## Define features and target

In [15]:
target = df["matchResults"]

notFeatures = ["matchResults","Home","Away","Score","Home Goals","Away Goals","xG","xG.1"]
features = df.drop(notFeatures, axis=1)
        

In [16]:
from sklearn.model_selection import train_test_split

## Separate data into training and testing
Create and fit decision tree object

In [154]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

In [155]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)

dtc.score(X_test,y_test)

0.52

In [156]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)

knn.score(X_test, y_test)

0.41333333333333333

In [160]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

train_scaler = preprocessing.StandardScaler().fit(X_train)
test_scaler = preprocessing.StandardScaler().fit(X_test)

X_trainScaled = train_scaler.transform(X_train)
X_testScaled = test_scaler.transform(X_test)

lrg = LogisticRegression()
lrg.fit(X_trainScaled,y_train)

print(y_test)
print(lrg.predict(X_testScaled))

lrg.score(X_testScaled, y_test)

258    1.0
19     2.0
250    2.0
200    0.0
168    2.0
      ... 
47     2.0
335    2.0
127    0.0
248    2.0
261    2.0
Name: matchResults, Length: 75, dtype: float64
[2. 2. 2. 0. 1. 2. 1. 0. 2. 0. 2. 2. 0. 2. 0. 0. 2. 2. 1. 2. 2. 0. 2. 0.
 2. 2. 0. 2. 0. 2. 0. 2. 1. 1. 2. 0. 2. 2. 0. 0. 1. 2. 2. 2. 2. 2. 2. 2.
 0. 2. 0. 1. 0. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 0. 2. 2. 0. 0. 2. 0. 2.
 0. 2. 2.]


0.6133333333333333