### Construcción de un Dataset de la Liga Méxicana de Fútbol
#### a partir de la información de Wikipedia
https://en.wikipedia.org/wiki/2021–22_Liga_MX_season

Instrucciones:
1. De la página de wikipedia busque la tabla de resultados
2. Copie y pegue en una Hoja de Cálculo de Google Drive
3. Descargue en formato .CSV 
4. Renombre el archivo con las siguientes características FMF_T(A|C)_(año4-año2).csv.  Ejemplo: FMF_TA_2021-22.csv
5. Copie el archivo en la carpeta /ligas/data (importante cree las carpetas antes de este paso)

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
DATA_DIR = os.path.join(os.getcwd(), 'data/')
CHART_DIR = os.path.join(os.getcwd(), 'charts/')

In [5]:
DATA_DIR

'/Users/rmrodriguez/Documents/anaconda/premiereLeague/data/'

In [6]:
data_file = './data/FMF_TA_2021.csv'
df = pd.read_csv(data_file, index_col=0)
df.head()

Unnamed: 0_level_0,AMÉ,ATL,ASL,CAZ,GUA,JUÁ,LEÓ,MAZ,MON,NEC,PAC,PUE,QUE,SAN,TIJ,TOL,UNL,UNM
Home \ Away,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
América,—,—,—,—,0–0,—,—,2–0,0–0,2–1,—,2–0,—,2–1,2–0,—,1–0,2–0
Atlas,0–1,—,—,0–0,—,2–0,2–0,—,2–1,—,—,0–1,2–0,—,0–2,0–0,—,—
Atlético San Luis,0–1,2–6,—,0–0,—,—,—,—,1–1,0–2,—,—,1–1,—,4–1,—,0–3,—
Cruz Azul,2–1,—,—,—,—,—,0–1,0–2,1–1,—,1–1,—,2–0,—,—,4–0,1–1,3–4
Guadalajara,—,0–1,1–2,1–1,—,2–2,0–3,—,—,2–1,1–0,—,—,—,—,2–0,—,—


In [7]:
print(df.columns)

Index(['AMÉ', 'ATL', 'ASL', 'CAZ', 'GUA', 'JUÁ', 'LEÓ', 'MAZ', 'MON', 'NEC',
       'PAC', 'PUE', 'QUE', 'SAN', 'TIJ', 'TOL', 'UNL', 'UNM'],
      dtype='object')


In [8]:
df.index = df.columns
rows = []
for i in df.index:
    for c in df.columns:
        if i == c: continue
        score = df.loc[i, c]
        if score == '—': continue
        ssplit = score.split('–')
        #print(ssplit[1])
        rows.append([i, c, ssplit[0], ssplit[1]])
df = pd.DataFrame(rows, columns = ['home', 'away', 'home_score', 'away_score'])
df.head()

Unnamed: 0,home,away,home_score,away_score
0,AMÉ,GUA,0,0
1,AMÉ,MAZ,2,0
2,AMÉ,MON,0,0
3,AMÉ,NEC,2,1
4,AMÉ,PUE,2,0


In [9]:
teams = df.home.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
teams.head()

Unnamed: 0,team,i
0,AMÉ,0
1,ATL,1
2,ASL,2
3,CAZ,3
4,GUA,4


In [10]:
df = pd.merge(df, teams, left_on='home', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_home'}).drop('team', 1)
df = pd.merge(df, teams, left_on='away', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_away'}).drop('team', 1)
df.head()

Unnamed: 0,home,away,home_score,away_score,i_home,i_away
0,AMÉ,GUA,0,0,0,4
1,AMÉ,MAZ,2,0,0,7
2,AMÉ,MON,0,0,0,8
3,AMÉ,NEC,2,1,0,9
4,AMÉ,PUE,2,0,0,11


In [11]:
df.iloc[6]

home          AMÉ
away          TIJ
home_score      2
away_score      0
i_home          0
i_away         14
Name: 6, dtype: object

In [12]:
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.unique())
num_games = len(home_team)

In [13]:
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

In [14]:
df.home_score.values

array(['0', '2', '0', '2', '2', '2', '2', '1', '2', '0', '0', '2', '2',
       '2', '0', '2', '0', '0', '0', '2', '0', '1', '0', '1', '4', '0',
       '2', '0', '0', '1', '1', '2', '4', '1', '3', '0', '1', '1', '2',
       '0', '2', '1', '2', '1', '1', '2', '3', '0', '0', '1', '1', '1',
       '0', '0', '3', '3', '1', '2', '1', '1', '2', '0', '3', '1', '2',
       '2', '0', '2', '0', '0', '0', '3', '1', '2', '2', '2', '0', '1',
       '1', '2', '0', '0', '3', '3', '1', '0', '0', '1', '4', '1', '1',
       '1', '1', '2', '1', '0', '0', '2', '1', '1', '2', '1', '1', '0',
       '1', '0', '1', '3', '0', '2', '1', '1', '0', '1', '0', '2', '1',
       '1', '1', '2', '0', '0', '0', '2', '3', '1', '2', '0', '1', '3',
       '1', '0', '2', '1', '1', '3', '2', '1', '2', '3', '2', '0', '3',
       '3', '1', '1', '0', '1', '4', '0', '1', '2', '0', '0', '3'],
      dtype=object)

https://www.pinnacle.com/es/betting-articles/Soccer/how-to-calculate-poisson-distribution/MD62MLXUMKMXZ6A8

In [15]:
sumatoria_home = np.sum(df.home_score.astype(int))
sumatoria_home

192

In [16]:
sumatoria_away = np.sum(df.away_score.astype(int))
sumatoria_away

153

In [17]:
sumatoria_liga = sumatoria_home + sumatoria_away
sumatoria_liga

345

In [18]:
F_att = sumatoria_home / num_games
F_def = sumatoria_away / num_games
print(F_att)
print(F_def)

1.238709677419355
0.9870967741935484


In [19]:
media_home = np.mean(df.home_score.astype(int))
media_home

1.238709677419355

In [20]:
media_away = np.mean(df.away_score.astype(int))
media_away

0.9870967741935484

In [21]:
# Número de goles marcado como local por el equipo 'LEÓ' pos 6
ht = df[df["home"]=='SAN']
ht

Unnamed: 0,home,away,home_score,away_score,i_home,i_away
111,SAN,ATL,1,1,13,1
112,SAN,ASL,0,0,13,2
113,SAN,CAZ,1,1,13,3
114,SAN,GUA,0,0,13,4
115,SAN,JUÁ,2,0,13,5
116,SAN,MAZ,1,0,13,7
117,SAN,MON,1,2,13,8
118,SAN,PUE,1,1,13,11
119,SAN,TOL,2,2,13,15


In [22]:
media_ht = np.mean(ht.home_score.astype(int))
media_ht

1.0

In [23]:
Fatt_ht = media_ht / media_home
Fatt_ht

0.8072916666666666

In [24]:
## Número de goles marcado como visitante por el equipo 'AMÉ' pos 0
at = df[df["away"]=='LEÓ']
at

Unnamed: 0,home,away,home_score,away_score,i_home,i_away
12,ATL,LEÓ,2,0,1,6
27,CAZ,LEÓ,0,1,3,6
39,GUA,LEÓ,0,3,4,6
69,MON,LEÓ,0,1,8,6
88,PAC,LEÓ,4,0,10,6
96,PUE,LEÓ,0,1,11,6
105,QUE,LEÓ,0,1,12,6
131,TOL,LEÓ,0,0,15,6
140,UNL,LEÓ,2,2,16,6


In [25]:
media_at = np.mean(ht.away_score.astype(int))
media_at

0.7777777777777778

In [26]:
Fdef_at = media_at / media_away
Fdef_at

0.7879448075526507

In [27]:
# Número probable de goles que marque el León
g_ht = Fatt_ht * Fdef_at * media_home
g_ht

0.7879448075526507

In [28]:
# Pronostico de goles del América
g_at = Fdef_at * Fatt_ht * media_away
g_at

0.6278935185185185

In [29]:
from scipy.stats import poisson

In [30]:
for i in range(5):
    print(f'SAN {i} goles {poisson.pmf(k= i, mu=g_ht)}')
    print(f'LEO {i} goles {poisson.pmf(k= i, mu=g_at)}')

SAN 0 goles 0.4547784928132986
LEO 0 goles 0.5337148782283092
SAN 1 goles 0.3583403519988591
LEO 1 goles 0.33511611277645575
SAN 2 goles 0.14117620984704507
LEO 2 goles 0.10520861758172868
SAN 3 goles 0.03707968716631418
LEO 3 goles 0.02201993635728696
SAN 4 goles 0.007304186742093477
LEO 4 goles 0.00345654382923269


In [31]:
print(f'SAN {0} goles {poisson.pmf(k= 0, mu=g_ht)}')
print(f'LEO {0} goles {poisson.pmf(k= 0, mu=g_at)}')

SAN 0 goles 0.4547784928132986
LEO 0 goles 0.5337148782283092
