### Construcción de un Dataset de la Liga Méxicana de Fútbol
#### a partir de la información de Wikipedia
https://en.wikipedia.org/wiki/2021–22_Liga_MX_season

Instrucciones:
1. De la página de wikipedia busque la tabla de resultados
2. Copie y pegue en una Hoja de Cálculo de Google Drive
3. Descargue en formato .CSV 
4. Renombre el archivo con las siguientes características FMF_T(A|C)_(año4-año2).csv.  Ejemplo: FMF_TA_2021-22.csv
5. Copie el archivo en la carpeta /ligas/data (importante cree las carpetas antes de este paso)

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
DATA_DIR = os.path.join(os.getcwd(), 'data/')
CHART_DIR = os.path.join(os.getcwd(), 'charts/')

In [5]:
DATA_DIR

'/Users/rubenrodriguez/Documents/anaconda/premiereLeague/data/'

In [9]:
#data_file = './data/FMF_TA_2021.csv'
data_file = './data/lmf-ac-2021-22.csv'
df = pd.read_csv(data_file, index_col=0)
df.head()

Unnamed: 0_level_0,AMÉ,ATL,ASL,CAZ,GUA,JUÁ,LEÓ,MAZ,MON,NEC,PAC,PUE,QUE,SAN,TIJ,TOL,UNL,UNM
Home \ Away,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
América,—,0–2,2–3,0–0,0–0,3–0,2–0,2–0,0–0,2–1,1–3,2–0,1–1,2–1,2–0,3–0,1–0,2–0
Atlas,0–1,—,1–0,0–0,1–1,2–0,2–0,1–2,2–1,2–1,0–1,0–1,2–0,2–1,0–2,0–0,1–1,0–0
Atlético San Luis,0–1,2–6,—,0–0,2–2,0–1,2–0,1–0,1–1,0–2,0–2,2–1,1–1,1–3,4–1,0–1,0–3,2–0
Cruz Azul,2–1,1–0,0–1,—,0–1,1–0,0–1,0–2,1–1,1–2,1–1,1–3,2–0,1–2,2–0,4–0,1–1,2–1
Guadalajara,0–0,0–1,1–2,1–1,—,2–2,0–3,3–0,1–3,2–1,1–0,2–3,1–1,1–0,2–1,2–0,1–3,3–1


In [10]:
print(df.columns)

Index(['AMÉ', 'ATL', 'ASL', 'CAZ', 'GUA', 'JUÁ', 'LEÓ', 'MAZ', 'MON', 'NEC',
       'PAC', 'PUE', 'QUE', 'SAN', 'TIJ', 'TOL', 'UNL', 'UNM'],
      dtype='object')


In [11]:
df.index = df.columns
rows = []
for i in df.index:
    for c in df.columns:
        if i == c: continue
        score = df.loc[i, c]
        if score == '—': continue
        ssplit = score.split('–')
        #print(ssplit[1])
        rows.append([i, c, ssplit[0], ssplit[1]])
df = pd.DataFrame(rows, columns = ['home', 'away', 'home_score', 'away_score'])
df.head()

Unnamed: 0,home,away,home_score,away_score
0,AMÉ,ATL,0,2
1,AMÉ,ASL,2,3
2,AMÉ,CAZ,0,0
3,AMÉ,GUA,0,0
4,AMÉ,JUÁ,3,0


In [12]:
teams = df.home.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
teams.head()

Unnamed: 0,team,i
0,AMÉ,0
1,ATL,1
2,ASL,2
3,CAZ,3
4,GUA,4


In [13]:
df = pd.merge(df, teams, left_on='home', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_home'}).drop('team', 1)
df = pd.merge(df, teams, left_on='away', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_away'}).drop('team', 1)
df.head()

Unnamed: 0,home,away,home_score,away_score,i_home,i_away
0,AMÉ,ATL,0,2,0,1
1,AMÉ,ASL,2,3,0,2
2,AMÉ,CAZ,0,0,0,3
3,AMÉ,GUA,0,0,0,4
4,AMÉ,JUÁ,3,0,0,5


In [14]:
df.iloc[6]

home          AMÉ
away          MAZ
home_score      2
away_score      0
i_home          0
i_away          7
Name: 6, dtype: object

In [15]:
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.unique())
num_games = len(home_team)

In [16]:
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

In [17]:
df.home_score.values

array(['0', '2', '0', '0', '3', '2', '2', '0', '2', '1', '2', '1', '2',
       '2', '3', '1', '2', '0', '1', '0', '1', '2', '2', '1', '2', '2',
       '0', '0', '2', '2', '0', '0', '1', '0', '0', '2', '0', '2', '0',
       '2', '1', '1', '0', '0', '2', '1', '1', '4', '0', '0', '2', '2',
       '1', '0', '0', '1', '0', '0', '1', '1', '1', '1', '2', '1', '2',
       '4', '1', '2', '0', '0', '1', '1', '2', '0', '3', '1', '2', '1',
       '2', '1', '1', '2', '2', '1', '3', '1', '1', '1', '2', '1', '0',
       '0', '3', '2', '1', '0', '0', '0', '1', '1', '2', '0', '1', '1',
       '0', '0', '2', '0', '3', '0', '3', '2', '0', '1', '1', '2', '4',
       '0', '1', '2', '1', '2', '1', '0', '3', '1', '1', '0', '2', '2',
       '2', '1', '2', '1', '0', '2', '2', '0', '0', '2', '0', '3', '0',
       '2', '0', '3', '1', '0', '1', '2', '2', '2', '2', '0', '0', '4',
       '1', '0', '1', '0', '2', '0', '1', '0', '1', '0', '3', '0', '2',
       '3', '1', '0', '0', '1', '2', '1', '4', '3', '3', '1', '1

https://www.pinnacle.com/es/betting-articles/Soccer/how-to-calculate-poisson-distribution/MD62MLXUMKMXZ6A8

In [18]:
sumatoria_home = np.sum(df.home_score.astype(int))
sumatoria_home

389

In [19]:
sumatoria_away = np.sum(df.away_score.astype(int))
sumatoria_away

332

In [20]:
sumatoria_liga = sumatoria_home + sumatoria_away
sumatoria_liga

721

In [21]:
F_att = sumatoria_home / num_games
F_def = sumatoria_away / num_games
print(F_att)
print(F_def)

1.2712418300653594
1.0849673202614378


In [22]:
media_home = np.mean(df.home_score.astype(int))
media_home

1.2712418300653594

In [23]:
media_away = np.mean(df.away_score.astype(int))
media_away

1.0849673202614378

In [56]:
# Número de goles marcado como local por el equipo
ht = df[df["home"]=='LEÓ']
ht

Unnamed: 0,home,away,home_score,away_score,i_home,i_away
102,LEÓ,AMÉ,1,1,6,0
103,LEÓ,ATL,1,1,6,1
104,LEÓ,ASL,0,0,6,2
105,LEÓ,CAZ,0,1,6,3
106,LEÓ,GUA,2,1,6,4
107,LEÓ,JUÁ,0,1,6,5
108,LEÓ,MAZ,3,0,6,7
109,LEÓ,MON,0,0,6,8
110,LEÓ,NEC,3,0,6,9
111,LEÓ,PAC,2,1,6,10


In [57]:
media_ht = np.mean(ht.home_score.astype(int))
media_ht

1.2352941176470589

In [58]:
Fatt_ht = media_ht / media_home
Fatt_ht

0.9717223650385605

In [59]:
## Número de goles marcado como visitante por el equipo 
at = df[df["away"]=='ATL']
at

Unnamed: 0,home,away,home_score,away_score,i_home,i_away
0,AMÉ,ATL,0,2,0,1
35,ASL,ATL,2,6,2,1
52,CAZ,ATL,1,0,3,1
69,GUA,ATL,0,1,4,1
86,JUÁ,ATL,1,2,5,1
103,LEÓ,ATL,1,1,6,1
120,MAZ,ATL,1,0,7,1
137,MON,ATL,0,0,8,1
154,NEC,ATL,0,3,9,1
171,PAC,ATL,0,1,10,1


In [60]:
media_at = np.mean(ht.away_score.astype(int))
media_at

1.1176470588235294

In [61]:
Fdef_at = media_at / media_away
Fdef_at

1.030120481927711

In [62]:
# Número probable de goles que marque el León
g_ht = Fatt_ht * Fdef_at * media_home
g_ht

1.272501771793055

In [63]:
# Pronostico de goles del América
g_at = Fdef_at * Fatt_ht * media_away
g_at

1.0860426432783912

In [64]:
from scipy.stats import poisson

In [67]:
for i in range(8):
    print(f'LEO {i} goles {poisson.pmf(k= i, mu=g_ht)}')
    print(f'ATL {i} goles {poisson.pmf(k= i, mu=g_at)}')

LEO 0 goles 0.2801299232591842
ATL 0 goles 0.33754965847243845
LEO 1 goles 0.3564658236795644
ATL 1 goles 0.3665933233251253
LEO 2 goles 0.22680169610795822
ATL 2 goles 0.1990679909361145
LEO 3 goles 0.0962018533810156
ATL 3 goles 0.07206544235612554
LEO 4 goles 0.030604257219279507
ATL 4 goles 0.019566535876368266
LEO 5 goles 0.007788794307188716
ATL 5 goles 0.0042500184685944934
LEO 6 goles 0.0016518757593382148
ATL 6 goles 0.0007692835486023905
LEO 7 goles 0.00030028783293426805
ATL 7 goles 0.00011935353407924586


In [68]:
print(f'LEO {0} goles {poisson.pmf(k= 0, mu=g_ht)}')
print(f'ATL {0} goles {poisson.pmf(k= 0, mu=g_at)}')

LEO 0 goles 0.2801299232591842
ATL 0 goles 0.33754965847243845
