In [None]:
import math
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas' own diagnostics (e.g. chained-assignment warnings).
warnings.filterwarnings('ignore')

# Remove pandas' display truncation: frames are shown with all rows and all columns.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [None]:
# Download the two season files ('2122' and '2223') of the F1 division.
df1 = pd.read_csv('https://www.football-data.co.uk/mmz4281/2122/F1.csv')
df2 = pd.read_csv('https://www.football-data.co.uk/mmz4281/2223/F1.csv')

# Source column -> name used throughout the notebook (full-time goals and B365 odds).
renamed_columns = {
    'Date': 'Date',
    'HomeTeam': 'Home',
    'AwayTeam': 'Away',
    'FTHG': 'FT_Goals_H',
    'FTAG': 'FT_Goals_A',
    'B365H': 'FT_Odd_H',
    'B365D': 'FT_Odd_D',
    'B365A': 'FT_Odd_A',
}
df = pd.concat([df1, df2])[list(renamed_columns)].rename(columns=renamed_columns)

# Dates arrive as 'day/month/year'; rewrite them as 'year-month-day' strings.
day_month_year = df['Date'].str.split('/', expand=True)
df['Date'] = day_month_year[2] + '-' + day_month_year[1] + '-' + day_month_year[0]

# Number the matches 1..N across both seasons; the index is labelled 'Nº'.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name='Nº')

# df.to_excel('Base.xlsx')

In [None]:
total_count = len(df)

# Profit of a winning 100-unit bet at the quoted odds, and the loss of a losing one.
# Assumes the odds columns are decimal odds - TODO confirm.
winHPLValues = df['FT_Odd_H'] * 100 - 100
winDPLValues = df['FT_Odd_D'] * 100 - 100
winAPLValues = df['FT_Odd_A'] * 100 - 100
losePLValues = -100

# Full-time result from the goal difference: positive -> 'H', zero -> 'D',
# negative -> 'A'. Rows with missing goals stay NaN.
goal_difference = df['FT_Goals_H'] - df['FT_Goals_A']
df['Result_FT'] = np.sign(goal_difference).map({1: 'H', 0: 'D', -1: 'A'})

# Per-match profit/loss of backing each outcome: the win value when that
# outcome happened, the flat loss otherwise.
for outcome, win_values in (('H', winHPLValues), ('D', winDPLValues), ('A', winAPLValues)):
    df[outcome] = win_values.where(df['Result_FT'] == outcome, other=losePLValues)

# Number of most recent match dates to keep; 0 (or less) keeps every date.
no_of_days = 0

matchDates = df.Date.unique()
if no_of_days > 0:
    matchDates = matchDates[-no_of_days:]

# One row per match date holding the summed H / D / A profit/loss of that date.
# A single groupby replaces the per-date loop that re-filtered the whole frame
# three times for every date. sort=False keeps the dates in order of first
# appearance, i.e. the same order as df.Date.unique().
# Rows whose Date is missing are left out by groupby.
df2 = df.groupby('Date', sort=False)[['H', 'D', 'A']].sum().reset_index()

if no_of_days > 0:
    # Same restriction as applied to matchDates: only the last no_of_days dates.
    df2 = df2.tail(no_of_days).reset_index(drop=True)

# Keep the 101 most recent match dates and renumber the rows from 0.
df2 = df2.tail(101).reset_index(drop=True)

# 1-based running number of each date; also used as the weight / x-value by the
# rolling helpers defined further down.
df2['Id'] = df2.index + 1
df2 = df2[['Id', 'Date', 'H', 'D', 'A']].copy()

# Running total of the daily profit/loss for each outcome.
for outcome in ('H', 'D', 'A'):
    df2[outcome + 'acu'] = df2[outcome].cumsum()

# Blank out the first cumulative value of every outcome.
# This is written through df2.loc in one step: the chained form
# df2['Hacu'].loc[0] = ... assigns into a temporary Series and leaves df2
# unchanged when pandas Copy-on-Write is active.
# NOTE(review): the reason for masking the first row is not evident here -
# presumably it leaves 100 usable points out of 101; confirm.
if not df2.empty:  # .loc assignment on an empty frame would append a row
    df2.loc[0, ['Hacu', 'Dacu', 'Aacu']] = np.nan

def _weighted_mean(s, column):
    """Weighted mean of ``df2[column]`` over the rows in ``s``, weighted by ``df2['Id']``.

    Only the index of ``s`` is used: it selects the rows of the module-level
    ``df2`` frame to read values and weights from.

    NOTE(review): the weights are the absolute ``Id`` values of the rows, not
    1..n within the window - confirm this is the intended weighting.
    """
    values = df2.loc[s.index, column]
    weights = df2.loc[s.index, 'Id']
    return (values * weights).sum() / weights.sum()


def weighted_mean_H(s):
    """Id-weighted mean of ``df2['Hacu']`` over the rows in ``s``."""
    return _weighted_mean(s, 'Hacu')


def weighted_mean_H_C(s):
    """Id-weighted mean of ``df2['waHC']`` over the rows in ``s``."""
    return _weighted_mean(s, 'waHC')


def weighted_mean_D(s):
    """Id-weighted mean of ``df2['Dacu']`` over the rows in ``s``."""
    return _weighted_mean(s, 'Dacu')


def weighted_mean_D_C(s):
    """Id-weighted mean of ``df2['waDC']`` over the rows in ``s``."""
    return _weighted_mean(s, 'waDC')


def weighted_mean_A(s):
    """Id-weighted mean of ``df2['Aacu']`` over the rows in ``s``."""
    return _weighted_mean(s, 'Aacu')


def weighted_mean_A_C(s):
    """Id-weighted mean of ``df2['waAC']`` over the rows in ``s``."""
    return _weighted_mean(s, 'waAC')

# Smoothing of each cumulative curve: a 16-row and an 8-row weighted mean are
# combined as 2 * short - long, and that combination is smoothed again over
# 4 rows. The second smoother reads the 'wa?C' column, so it must exist first.
smoothers = {
    'H': (weighted_mean_H, weighted_mean_H_C),
    'D': (weighted_mean_D, weighted_mean_D_C),
    'A': (weighted_mean_A, weighted_mean_A_C),
}
for outcome, (mean_of_acu, mean_of_combo) in smoothers.items():
    cumulative = outcome + 'acu'
    long_mean = 'wa' + outcome + '16'
    short_mean = 'wa' + outcome + '8'
    combined = 'wa' + outcome + 'C'
    df2[long_mean] = df2[cumulative].rolling(16).apply(mean_of_acu, raw=False)
    df2[short_mean] = df2[cumulative].rolling(8).apply(mean_of_acu, raw=False)
    df2[combined] = 2 * df2[short_mean] - df2[long_mean]
    df2['wa' + outcome + '4'] = df2[combined].rolling(4).apply(mean_of_combo, raw=False)

# The 4-row smoothed series is exposed under the '?hull' name.
for outcome in smoothers:
    df2[outcome + 'hull'] = df2['wa' + outcome + '4']

# Ratio of the cumulative value to its smoothed counterpart.
for outcome in smoothers:
    df2[outcome + 'dist'] = df2[outcome + 'acu'] / df2[outcome + 'hull']

def r_function(s):
    """Relative change from the first to the second element of ``s``.

    The difference (second - first) is divided by the absolute value of the
    first element.
    """
    labels = s.index.values
    previous = s.loc[labels[0]]
    current = s.loc[labels[1]]
    change = current - previous
    return change / abs(previous)

# Row-over-row relative change of each smoothed ('?hull') series.
for outcome in ('H', 'D', 'A'):
    hull_pairs = df2[outcome + 'hull'].rolling(2)
    df2[outcome + 'r'] = hull_pairs.apply(r_function, raw=False)

def _hull_slope(s, column):
    """Slope of the least-squares line of ``df2[column]`` against ``df2['Id']``.

    Only the index of ``s`` is used: it selects the rows of the module-level
    ``df2`` frame that enter the degree-1 ``np.polyfit``.
    """
    x = df2.loc[s.index, 'Id']
    y = df2.loc[s.index, column]
    slope, _intercept = np.polyfit(x, y, 1)
    return slope


def inc_H(s):
    """Fitted slope of ``df2['Hhull']`` over the rows in ``s``."""
    return _hull_slope(s, 'Hhull')


def inc_D(s):
    """Fitted slope of ``df2['Dhull']`` over the rows in ``s``."""
    return _hull_slope(s, 'Dhull')


def inc_A(s):
    """Fitted slope of ``df2['Ahull']`` over the rows in ``s``."""
    return _hull_slope(s, 'Ahull')

# Fitted slope of each smoothed series over the last 5 rows.
slope_functions = {'H': inc_H, 'D': inc_D, 'A': inc_A}
for outcome, slope_of_hull in slope_functions.items():
    df2[outcome + 'inc'] = df2[outcome + 'hull'].rolling(5).apply(slope_of_hull, raw=False)

# Standard deviation of each cumulative series over the last 10 rows.
for outcome in slope_functions:
    df2[outcome + 'dp'] = df2[outcome + 'acu'].rolling(10).std()

# Ratio of the 10-row maximum to the 10-row minimum of each cumulative series.
for outcome in slope_functions:
    last_ten = df2[outcome + 'acu'].rolling(10)
    df2[outcome + 'amp'] = last_ten.max() / last_ten.min()

# Dates covered by the analysed frame (reported at the end of the notebook).
matchDates2 = df2.Date.unique()

# Empty frame that receives the normalised features.
df3 = pd.DataFrame()

def normaliz(dfS):
    """Min-max position of the most recent value inside its window.

    Parameters
    ----------
    dfS : pandas.Series
        One rolling window; its last element is the value being normalised.

    Returns
    -------
    number
        ``(last - min) / (max - min)`` of the window, or 0 when that is
        undefined (all values equal, or the result is NaN).
    """
    # Last element of the window, whatever the window length is (the previous
    # hard-coded position 4 only worked for 5-row windows).
    actual_value = dfS.iloc[-1]
    lowest = dfS.min()
    span = dfS.max() - lowest
    # A flat window has no range; checked explicitly because numpy floats turn
    # 0/0 into NaN plus a RuntimeWarning instead of raising ZeroDivisionError.
    if span == 0:
        return 0
    n = (actual_value - lowest) / span
    if math.isnan(n):
        return 0
    return n

# Leading rows of df2 that are left out of the normalised table.
# NOTE(review): 23 presumably matches the warm-up of the slowest rolling
# feature - confirm.
WARMUP_ROWS = 23
# Length of the rolling window each feature is min-max normalised over.
NORM_WINDOW = 5
# First row position whose normalisation window is complete (27 for 23 / 5).
FIRST_LABELLED = WARMUP_ROWS + NORM_WINDOW - 1

feature_columns = [
    outcome + feature
    for feature in ('hull', 'dist', 'r', 'inc', 'dp', 'amp')
    for outcome in ('H', 'D', 'A')
]

df3['Id'] = df2['Id'].iloc[WARMUP_ROWS:]

# Every feature is normalised the same way. The '?inc' columns previously
# skipped the warm-up slice that all other features used.
for column in feature_columns:
    trimmed = df2[column].iloc[WARMUP_ROWS:]
    df3[column] = trimmed.rolling(NORM_WINDOW).apply(normaliz, raw=False)

# Label each row with the outcome that has the highest summed profit/loss on
# the FOLLOWING date. Ties fall through to 'A'. The final row has no
# following date and keeps the empty label; the loop bound makes that explicit
# instead of relying on a swallowed LookupError.
df3['R'] = ''
for position in range(FIRST_LABELLED, len(df2) - 1):
    following = df2.iloc[position + 1]
    h, d, a = following.H, following.D, following.A
    if h > d and h > a:
        df3.loc[position, 'R'] = 'H'
    elif d > h and d > a:
        df3.loc[position, 'R'] = 'D'
    else:
        df3.loc[position, 'R'] = 'A'

# The most recent row is the query point for the neighbour search.
selected_date = df3.iloc[-1]

# Feature columns compared by the distance functions: for every feature kind,
# the home ('H'), draw ('D') and away ('A') variant, in that order.
distance_columns = [
    outcome + feature
    for feature in ('hull', 'dist', 'r', 'inc', 'dp', 'amp')
    for outcome in ('H', 'D', 'A')
]

def euclidean_distance(row):
    """Euclidean distance between ``row`` and the module-level ``selected_date``.

    Only the columns listed in ``distance_columns`` take part: the squared
    differences are summed and the square root of the total is returned.
    """
    squared_gaps = (
        (row[column] - selected_date[column]) ** 2
        for column in distance_columns
    )
    return math.sqrt(sum(squared_gaps))

def euclidean_distance_tr(row):
    """Sum of absolute differences between ``row`` and ``selected_date``.

    Despite the name this is not a Euclidean distance: over the columns in
    ``distance_columns`` it adds up ``abs(row - selected_date)`` with no
    squaring and no square root.
    """
    return sum(
        abs(row[column] - selected_date[column])
        for column in distance_columns
    )

# Distance of every row to the query row (sum of absolute feature differences).
df3['eucli'] = df3.apply(euclidean_distance_tr, axis=1)

# Label and distance only, closest rows first.
df4 = df3[['R', 'eucli']].sort_values(by='eucli')

# Three nearest rows, excluding the query row itself. The query row is removed
# by its index label rather than by "distance != 0", so a historical row that
# matches the query exactly (the best possible neighbour) is no longer dropped.
query_label = df3.index[-1]
df5 = df4.drop(index=query_label).nsmallest(3, 'eucli')

# Rank the neighbours 1..k under an index called 'N'.
df5.index = pd.RangeIndex(start=1, stop=len(df5) + 1, name='N')

In [None]:
# Report: headline, the nearest neighbours, then the analysed date range.
print('Previsão utilizando o KNN para a Liga selecionada.')
print('')
print(df5)
print('')

first_day = str(matchDates2[0])
last_day = str(matchDates2[-1])
day_count = '(' + str(len(matchDates2)) + ' dias)'
print('Período analisado:', first_day, ' até ', last_day, day_count)

Previsão utilizando o KNN para a Liga selecionada.

   R     eucli
N             
1  D  1.772459
2  H  2.969830
3  H  3.778049

Período analisado: 2022-03-20  até  2023-04-09 (101 dias)
