In [168]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from datetime import datetime as dt
import itertools
import warnings
warnings.filterwarnings('ignore')

# reading csv files for different seasons
raw_data_1 = pd.read_csv('2018-19.csv')
raw_data_2 = pd.read_csv('2017-18.csv')
raw_data_3 = pd.read_csv('2016-17.csv')
raw_data_4 = pd.read_csv('2015-16.csv')

# Covert NaN to empty strings
raw_data_1 = pd.DataFrame(raw_data_1).fillna('')
raw_data_2 = pd.DataFrame(raw_data_2).fillna('')
raw_data_3 = pd.DataFrame(raw_data_3).fillna('')
raw_data_4 = pd.DataFrame(raw_data_4).fillna('')


# Parse data as time 
def parse_date(date):
    try:
        if date == '':
            return None
        else:
            return dt.strptime(date, '%d/%m/%y').date()
    except:
        return dt.strptime(date, '%d/%m/%Y').date()

raw_data_1.Date = raw_data_1.Date.apply(parse_date)    
raw_data_2.Date = raw_data_2.Date.apply(parse_date)    
raw_data_3.Date = raw_data_3.Date.apply(parse_date)       
raw_data_4.Date = raw_data_4.Date.apply(parse_date)

#Gets all the statistics related to gameplay
                      
columns_req = ['Date','HomeTeam','AwayTeam','FTHG','FTAG','FTR']

playing_statistics_1 = raw_data_1[columns_req]                      
playing_statistics_2 = raw_data_2[columns_req]
playing_statistics_3 = raw_data_3[columns_req]
playing_statistics_4 = raw_data_4[columns_req]

playing_statistics_4 = playing_statistics_4.dropna()

# Gets the goals scored agg arranged by teams and matchweek
def get_goals_scored(playing_stat):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').apply(lambda x: np.mean(x)).index:
        teams[i] = []
    
    # the value corresponding to keys is a list containing the match location.
    for i in range(len(playing_stat.index)):
        HTGS = playing_stat.iloc[i]['FTHG']
        ATGS = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGS)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGS)
    
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    try:
        GoalsScored = pd.DataFrame(data=teams, index = [i for i in range(1,39)]).T
        GoalsScored[0] = 0
        # Aggregate to get uptil that point
        for i in range(2,39):
            GoalsScored[i] = GoalsScored[i] + GoalsScored[i-1]
        return GoalsScored
    except:
        GoalsScored = pd.DataFrame(data=teams, index = [i for i in range(1,34)]).T
        GoalsScored[0] = 0
        # Aggregate to get uptil that point
        for i in range(2,34):
            GoalsScored[i] = GoalsScored[i] + GoalsScored[i-1]
        return GoalsScored



# Gets the goals conceded agg arranged by teams and matchweek
def get_goals_conceded(playing_stat):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').apply(lambda x: np.mean(x)).index:
        teams[i] = []
        
     # the value corresponding to keys is a list containing the match location.
    for i in range(len(playing_stat.index)):
        
        ATGC = playing_stat.iloc[i]['FTHG']
        HTGC = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGC)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGC)
    
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    try:
        GoalsConceded = pd.DataFrame(data=teams, index = [i for i in range(1,39)]).T
        GoalsConceded[0] = 0
        # Aggregate to get uptil that point
        for i in range(2,39):
            GoalsConceded[i] = GoalsConceded[i] + GoalsConceded[i-1]
        return GoalsConceded
    except:
        GoalsConceded = pd.DataFrame(data=teams, index = [i for i in range(1,34)]).T
        GoalsConceded[0] = 0
        # Aggregate to get uptil that point
        for i in range(2,34):
            GoalsConceded[i] = GoalsConceded[i] + GoalsConceded[i-1]
        return GoalsConceded
    

def get_gss(playing_stat):
    GC = get_goals_conceded(playing_stat)
    GS = get_goals_scored(playing_stat)
   
    j = 0
    HTGS = []
    ATGS = []
    HTGC = []
    ATGC = []

    for i in range(len(playing_stat.index)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTGS.append(GS.loc[ht][j])
        ATGS.append(GS.loc[at][j])
        HTGC.append(GC.loc[ht][j])
        ATGC.append(GC.loc[at][j])
        
        if ((i + 1)% 10) == 0:
            j = j + 1
        
    playing_stat['HTGS'] = HTGS
    playing_stat['ATGS'] = ATGS
    playing_stat['HTGC'] = HTGC
    playing_stat['ATGC'] = ATGC
    
    return playing_stat

# Apply to each dataset
playing_statistics_1 = get_gss(playing_statistics_1)
playing_statistics_2 = get_gss(playing_statistics_2)
playing_statistics_3 = get_gss(playing_statistics_3)
playing_statistics_4 = get_gss(playing_statistics_4)

# Get points from a game. As it is a league game, if its a win, its 3 points for the winning team 
# and 0 points for the losing team, if its a draw, its 1 points to each team.
def get_points(result):
    if result == 'W':
        return 3
    elif result == 'D':
        return 1
    else:
        return 0

# accumulates points up for teams from match result which is passed in 
def get_cuml_points(matchres):
    matchres_points = matchres.applymap(get_points)
    try:
        for i in range(2,39):
            matchres_points[i] = matchres_points[i] + matchres_points[i-1]
    except:
        for i in range(2,34):
            matchres_points[i] = matchres_points[i] + matchres_points[i-1]
        
    matchres_points.insert(column =0, loc = 0, value = [0*i for i in range(20)])
    return matchres_points

# accumulates the results of the games
def get_matchres(playing_stat):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').apply(lambda x: np.mean(x)).index:
        teams[i] = []
        
    # the value corresponding to keys is a list containing the match result
    for i in range(len(playing_stat.index)):
        if playing_stat.iloc[i].FTR == 'H':
            teams[playing_stat.iloc[i].HomeTeam].append('W')
            teams[playing_stat.iloc[i].AwayTeam].append('L')
        elif playing_stat.iloc[i].FTR == 'A':
            teams[playing_stat.iloc[i].AwayTeam].append('W')
            teams[playing_stat.iloc[i].HomeTeam].append('L')
        else:
            teams[playing_stat.iloc[i].AwayTeam].append('D')
            teams[playing_stat.iloc[i].HomeTeam].append('D')
    try: 
        return pd.DataFrame(data=teams, index = [i for i in range(1,39)]).T
    except:
        return pd.DataFrame(data=teams, index = [i for i in range(1,34)]).T

# get total points of the season so far for each team
def get_agg_points(playing_stat):
    matchres = get_matchres(playing_stat)
    cum_pts = get_cuml_points(matchres)
    HTP = []
    ATP = []
    j = 0
    for i in range(len(playing_stat.index)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTP.append(cum_pts.loc[ht][j])
        ATP.append(cum_pts.loc[at][j])

        if ((i + 1)% 10) == 0:
            j = j + 1
            
    playing_stat['HTP'] = HTP
    playing_stat['ATP'] = ATP
    return playing_stat
    
# Apply to each dataset
playing_statistics_1 = get_agg_points(playing_statistics_1)
playing_statistics_2 = get_agg_points(playing_statistics_2)
playing_statistics_3 = get_agg_points(playing_statistics_3)
playing_statistics_4 = get_agg_points(playing_statistics_4)

# Gets the form of the teams in the season
def get_form(playing_stat,num):
    form = get_matchres(playing_stat)
    form_final = form.copy()
    try:
        for i in range(num,39):
            form_final[i] = ''
            j = 0
            while j < num:
                form_final[i] += form[i-j]
                j += 1   
    except:
        for i in range(num,34):
            form_final[i] = ''
            j = 0
            while j < num:
                form_final[i] += form[i-j]
                j += 1 
    return form_final

# Adds the results of the past n games to the df
def add_form(playing_stat,num):
    form = get_form(playing_stat,num)
    h = ['M' for i in range(num * 10)]  # since form is not available for n MW (n*10)
    a = ['M' for i in range(num * 10)]
    
    j = num
    for i in range((num*10),len(playing_stat.index)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        
        past = form.loc[ht][j]               # get past n results
        h.append(past[num-1])                    # 0 index is most recent
        
        past = form.loc[at][j]               # get past n results.
        a.append(past[num-1])                   # 0 index is most recent
        
        if ((i + 1)% 10) == 0:
            j = j + 1

    playing_stat['HM' + str(num)] = h                 
    playing_stat['AM' + str(num)] = a

    
    return playing_stat

# Adds the past 5 games of the home and away teams
def add_form_df(playing_statistics):
    playing_statistics = add_form(playing_statistics,1)
    playing_statistics = add_form(playing_statistics,2)
    playing_statistics = add_form(playing_statistics,3)
    playing_statistics = add_form(playing_statistics,4)
    playing_statistics = add_form(playing_statistics,5)
    return playing_statistics    
    
# Make changes to df
playing_statistics_1 = add_form_df(playing_statistics_1)
playing_statistics_2 = add_form_df(playing_statistics_2)
playing_statistics_3 = add_form_df(playing_statistics_3)
playing_statistics_4 = add_form_df(playing_statistics_4)

# Rearranging columns
cols = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HTP', 'ATP', 'HM1', 'HM2', 'HM3',
        'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5' ]

playing_statistics_1 = playing_statistics_1[cols]
playing_statistics_2 = playing_statistics_2[cols]
playing_statistics_3 = playing_statistics_3[cols]
playing_statistics_4 = playing_statistics_4[cols]

# Get Matchweek
def get_mw(playing_stat):
    j = 1
    MatchWeek = []
    for i in range(len(playing_stat.index)):
        MatchWeek.append(j)
        if ((i + 1)% 10) == 0:
            j = j + 1
    playing_stat['MW'] = MatchWeek
    return playing_stat

playing_statistics_1 = get_mw(playing_statistics_1)
playing_statistics_2 = get_mw(playing_statistics_2)
playing_statistics_3 = get_mw(playing_statistics_3)
playing_statistics_4 = get_mw(playing_statistics_4)

# Starting to prepare final dataframe
playing_stat = pd.concat([playing_statistics_1,
                          playing_statistics_2,
                          playing_statistics_3,
                          playing_statistics_4], ignore_index = True)

# Gets the form points.
def get_form_points(string):
    sum = 0
    for letter in string:
        sum += get_points(letter)
    return sum

playing_stat['HTFormPtsStr'] = playing_stat['HM1'] + playing_stat['HM2'] + playing_stat['HM3'] + playing_stat['HM4'] + playing_stat['HM5']
playing_stat['ATFormPtsStr'] = playing_stat['AM1'] + playing_stat['AM2'] + playing_stat['AM3'] + playing_stat['AM4'] + playing_stat['AM5']

playing_stat['HTFormPts'] = playing_stat['HTFormPtsStr'].apply(get_form_points)
playing_stat['ATFormPts'] = playing_stat['ATFormPtsStr'].apply(get_form_points)

# Identify Win/Loss Streaks if any.
def get_3game_ws(string):
    if string[-3:] == 'WWW':
        return 1
    else:
        return 0
    
def get_5game_ws(string):
    if string == 'WWWWW':
        return 1
    else:
        return 0
    
def get_3game_ls(string):
    if string[-3:] == 'LLL':
        return 1
    else:
        return 0
    
def get_5game_ls(string):
    if string == 'LLLLL':
        return 1
    else:
        return 0
    
playing_stat['HTWinStreak3'] = playing_stat['HTFormPtsStr'].apply(get_3game_ws)
playing_stat['HTWinStreak5'] = playing_stat['HTFormPtsStr'].apply(get_5game_ws)
playing_stat['HTLossStreak3'] = playing_stat['HTFormPtsStr'].apply(get_3game_ls)
playing_stat['HTLossStreak5'] = playing_stat['HTFormPtsStr'].apply(get_5game_ls)

# Get Goal Difference
playing_stat['HTGD'] = playing_stat['HTGS'] - playing_stat['HTGC']
playing_stat['ATGD'] = playing_stat['ATGS'] - playing_stat['ATGC']

# Diff in points
playing_stat['DiffPts'] = playing_stat['HTP'] - playing_stat['ATP']
playing_stat['DiffFormPts'] = playing_stat['HTFormPts'] - playing_stat['ATFormPts']

# Scale DiffPts , DiffFormPts, HTGD, ATGD by Matchweek.
cols = ['HTGD','ATGD','DiffPts','DiffFormPts','HTP','ATP']
playing_stat.MW = playing_stat.MW.astype(float)

for col in cols:
    playing_stat[col] = playing_stat[col] / playing_stat.MW

def only_hw(string):
    if string == 'H':
        return 'H'
    else:
        return 'NH'
    
playing_stat['FTR'] = playing_stat.FTR.apply(only_hw)

# final dataset
playing_stat.to_csv("final_dataset_seriaA.csv")







In [169]:
# Read data and drop redundant column.
data = pd.read_csv('final_dataset_seriaA.csv')

# Remove first 3 matchweeks
data = data[data.MW > 3]

data.drop(['Unnamed: 0','HomeTeam', 'AwayTeam', 'Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
           'HTGS', 'ATGS', 'HTGC', 'ATGC','DiffPts','HTFormPts','ATFormPts',
           'HM4','HM5','AM4','AM5','HTLossStreak5','HTWinStreak5',
           'HTWinStreak3','HTLossStreak3'],1, inplace=True)

# Total number of matches.
n_matches = data.shape[0]

# Calculate number of features. -1 because we are saving one as the target variable (win/lose/draw)
n_features = data.shape[1] - 1

# Calculate matches won by home team.
n_homewins = len(data[data.FTR == 'H'])

# Calculate win rate for home team.
win_rate = (float(n_homewins) / (n_matches)) * 100

# Print the results
print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by home team: {}".format(n_homewins))
print("Win rate of home team: {:.2f}%".format(win_rate))

Total number of matches: 1350
Number of features: 11
Number of matches won by home team: 606
Win rate of home team: 44.89%


In [179]:
# Separate into feature set and target variable
#FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)
X_all = data.drop(['FTR'],1)
y_all = data['FTR']

# Standardising the data.
from sklearn.preprocessing import scale

#Center to the mean and component wise scale to unit variance.
cols = [['HTGD','ATGD','HTP','ATP']]
for col in cols:
    X_all[col] = scale(X_all[col])
    
#last 3 wins for both sides
X_all.HM1 = X_all.HM1.astype('str')
X_all.HM2 = X_all.HM2.astype('str')
X_all.HM3 = X_all.HM3.astype('str')
X_all.AM1 = X_all.AM1.astype('str')
X_all.AM2 = X_all.AM2.astype('str')
X_all.AM3 = X_all.AM3.astype('str')

# Preprocesses the football data and converts catagorical variables into dummy variables
def preprocess_features(X):
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Go through each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
                    
        # Collecting the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print(len(X_all.columns))
X_all.to_csv("final.csv")

23


In [176]:
from sklearn.metrics import f1_score
# Split the dataset into training and testing set, the test set is the 2018-19 season which is the current season,
# And I am try to predict a game from the current season. The rest of the data which includes other seasons is my training set.

X_train = X_all[300:]
y_train = y_all[300:]
X_test = X_all[:300]
y_test = y_all[:300]

# Fits a classifier to the training data.
def train_classifier(clf, X_train, y_train):
    clf.fit(X_train, y_train)

# Makes predictions using a fit classifier based on F1 score.
def predict_labels(clf, features, target):
    y_pred = clf.predict(features)
    return f1_score(target, y_pred, pos_label='H'), sum(target == y_pred) / float(len(y_pred))

# Train and predict using a classifer based on F1 score. 
def train_predict(clf, X_train, y_train, X_test, y_test):
     
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print(f1, acc)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))
    print(clf)
    
clf_A = LogisticRegression(random_state = 42)
train_predict(clf_A, X_train, y_train, X_test, y_test)
    

Training a LogisticRegression using a training set size of 1050. . .
0.6368715083798883 0.6904761904761905
F1 score and accuracy score for training set: 0.6369 , 0.6905.
F1 score and accuracy score for test set: 0.4587 , 0.6067.
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
