<a href="https://colab.research.google.com/github/reedington/Premier-League-Predictions/blob/main/Premier_League.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1.0.0 Web Scraping the **PREMIER LEAGUE** data from: 
https://fbref.com using

1.   Requests
2.   Pandas
3.   BeautifulSoup(Mainly for the scraping of data from internet)
4.   Time 



In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import numpy as np

In [2]:
standings_url='https://fbref.com/en/comps/9/Premier-League-Stats'
years=list(range(2022,1999,-1))
all_matches=[]

IndexError: ignored

In [15]:
for year in years:
    data=requests.get(standings_url)
    soup=BeautifulSoup(data.text)
    standings_table=soup.select('table.stats_table')[0]
    teams_url=[standings.get('href') for standings in standings_table.find_all('a')]
    squad_links=[links for links in teams_url if '/squads/' in links]
    squad_links=[f'https://fbref.com{links}' for links in squad_links]
    previous_seasons=soup.select('a.prev')[0].get('href')
    standings_url=f'https://fbref.com{previous_seasons}'
    for squad in squad_links:
        squad_name= squad.split('/')[-1].replace('-Stats','').replace('-',' ')
        data=requests.get(squad)
        soup=BeautifulSoup(data.text)
        matches=pd.read_html(data.text,match='Scores & Fixtures')[0]
        links=[link.get('href') for link in soup.find_all('a')]
        links_updated=[value for value in links if value!=None]
        #Shooting Dataframe
        shooting_url=[link for link in links_updated if 'all_comps/shooting' in link]
        shooting_url=f'https://fbref.com{shooting_url[0]}'
        shooting_df=pd.read_html(shooting_url,match='Shooting')[0]
        shooting_df.columns=shooting_df.columns.droplevel()
        try:
            team_data=matches.merge(shooting_df[['Date','Sh','SoT','Dist','PK','PKatt']])
        except ValueError:
            continue
        team_data['Name']=squad_name
        team_data['Season']=year
        all_matches.append(team_data)
        time.sleep(1)

IndexError: ignored

In [None]:
match_df=pd.concat(all_matches,ignore_index=True)
match_df.to_csv('matches.csv') 

# 1.0.1 **Premier League Predictions**

Training and Fitting the Model to be able to predict either the teams Win(W),
Lose(L) or 
Draw(D) 

Importing the necessary **libraries** used for training,testing and predicting our data:

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
from sklearn.metrics import accuracy_score,precision_score,f1_score,confusion_matrix,classification_report
from sklearn.model_selection import RandomizedSearchCV


In [None]:
matches=pd.read_csv('matches.csv')
matches.drop('Unnamed: 0',axis=1,inplace=True)
matches=matches[matches['Comp']=='Premier League']

In [None]:
matches['Date']=matches['Date'].apply(pd.to_datetime)

In [None]:
#Manipulating the datasets
matches['venue_codes']=matches['Venue'].astype('category').cat.codes
matches['Opp_codes']=matches['Opponent'].astype('category').cat.codes
matches['team_codes']=matches['Name'].astype('category').cat.codes
matches['Name']=matches['Name'].replace('Stats',' ',regex=True)
matches['Result_codes']=matches['Result'].astype('category').cat.codes
matches['Day_codes']=matches['Date'].dt.day_of_week

matches['GF']=matches['GF'].replace('\((\d+)\)','',regex=True).astype(float)
matches['GF']=matches['GF'].astype(int)
matches['GA']=matches['GA'].replace('\((\d+)\)','',regex=True).astype(float)
matches['GA']=matches['GA'].astype(int)

#filling out missing data in each column used for training the data
matches['xG']=matches['xG'].fillna(np.mean(matches['xG']))
matches['SoT']=matches['SoT'].fillna(np.mean(matches['SoT']))
matches['xGA']=matches['xGA'].fillna(np.mean(matches['xGA']))
matches['Sh']=matches['Sh'].fillna(np.mean(matches['Sh']))
matches['Dist']=matches['Dist'].fillna(np.mean(matches['Dist']))
matches['PKatt']=matches['PKatt'].fillna(np.mean(matches['PKatt']))
matches['Attendance']=matches['Attendance'].fillna(np.mean(matches['Attendance']))
matches['Dist']=matches['Dist'].fillna(np.mean(matches['Dist']))
matches['Poss']=matches['Poss'].fillna(np.mean(matches['Poss']))



In [None]:
#The variables/features to train it on
#The predictions are the features we are training our model on to be able to predict our target varible 
predictions=['venue_codes','Opp_codes','team_codes','Day_codes','xG','Sh','SoT','xGA','Dist','PK','PKatt','Attendance','Poss']
target=['Result_codes']
#spiliting our data into train and test
train=matches[matches['Date']<'2022-01-01']
test=matches[matches['Date']>'2022-01-01']
# X is the Feature to train it on
X_train=train[predictions]
X_test=test[predictions]
# y is target varible
y_train=train[target]
y_test=test[target]

In [None]:
#Determining which model is best 
model={'Linear Regression': LinearRegression(),
      'K-Nearest-Neighbors': KNeighborsClassifier(),
      'Random-Forest-Classifier':RandomForestClassifier()}
def fit_and_score(model,X_train,X_test,y_train,y_test):
    '''
    To determine the accuracy between the 3 model for classifier
    Models: a dict of different Scitkit-Learn machine learning models
    X_train:training data(no labels)
    X_test: testing data (no labels)
    y_train: training labels
    y_test: test labels
    
    '''
    model_score={}
    np.random.seed(42)
    for name,estimators in model.items():
        
        OneVsRestClassifier(estimators.fit(X_train,np.ravel(y_train)))
        model_score[name]=OneVsRestClassifier(estimators.score(X_test,np.ravel(y_test)))
    return model_score

In [None]:
fit_and_score(model,X_train,X_test,y_train,y_test)

In [None]:
grid={'n_estimators':[100,200,500,1000,1200],
      'max_depth':[None,5,10,20,50,100],
      'max_features':['auto','sqrt','log2'],
      'min_samples_split':[2,4,6,8,10],
      'min_samples_leaf':[1,2,4],
      'max_leaf_nodes':[None,10,30,40,80,150],
      'max_samples':[None,50,60,85,120]}
np.random.seed(42)
clf=RandomForestClassifier(n_jobs=1)
rs_clf=RandomizedSearchCV(estimator=clf,
                         param_distributions=grid,
                         n_iter=10,
                         cv=5,
                         verbose=2)

In [None]:
OneVsRestClassifier(rs_clf.fit(X_train,np.ravel(y_train)))
print(f'The accuracy score is {rs_clf.score(X_test,np.ravel(y_test))*100:.2f}%')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
y_preds=rs_clf.predict(X_test)
def plot_conf_mat(y_test, y_preds):
    """
    Plots a confusion matrix using Seaborn's heatmap().
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(confusion_matrix(np.ravel(y_test), y_preds),
                     annot=True, # Annotate the boxes
                     cbar=False)
    plt.xlabel("Predicted label") # predictions go on the x-axis
    plt.ylabel("True label") # true labels go on the y-axis 
    
plot_conf_mat(y_test, y_preds)

In [None]:
from google.colab import drive
drive.mount('/content/drive')