# Lecture 22

### Lecture 22 실습 코드

참고자료: 

https://towardsdatascience.com/predicting-the-fifa-world-cup-2022-with-a-simple-model-using-python-6b34bdd4f2a5


In [None]:
import os
from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/gdrive')

# Switch to the lecture folder; the CSV files loaded later use relative paths.
%cd /content/gdrive/MyDrive/ITEC419-fa22/lec

Mounted at /content/gdrive
/content/gdrive/MyDrive/ITEC419-fa22/lec


In [None]:
from datascience import *
import numpy as np
from scipy.stats import poisson

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plots
# Apply the 'fivethirtyeight' matplotlib style sheet to subsequent plots.
plots.style.use('fivethirtyeight')

In [None]:
# Match results used to estimate team strength (columns used below:
# 'home', 'away', 'homeGoals', 'awayGoals').
fifa = Table.read_table('fifa_worldcup_matches.csv')
# 2022 fixture list; the simulation below indexes it by row position
# (rows 0-47 group stage, 48+ knockout rounds) and reads 'home', 'away', 'score'.
fifa2022 = Table.read_table('fifa_worldcup_2022.csv')

### **Calculate Team Strength**

In [None]:
# Preview the first three rows of the match table.
fifa.show(3)

In [None]:
# View every match from the home side: goals scored are the home goals and
# goals conceded are the away goals.
home = (Table()
        .with_column('team', fifa.column('home'))
        .with_column('goalsScored', fifa.column('homeGoals'))
        .with_column('goalsConceded', fifa.column('awayGoals')))

# Same matches from the away side, with the goal columns swapped.
away = (Table()
        .with_column('team', fifa.column('away'))
        .with_column('goalsScored', fifa.column('awayGoals'))
        .with_column('goalsConceded', fifa.column('homeGoals')))

# One row per (team, match); note that append() extends `home` in place.
allteam = home.append(away)
allteam.show(5)

### **Poisson Distribution**

In [None]:
# Display every per-match row recorded for Brazil.
allteam.where('team', 'Brazil')

In [None]:
# Per-match rows for Brazil and the summary statistics derived from them.
brazil_games = allteam.where('team', 'Brazil')
brazil_num_games = brazil_games.num_rows

# Average goals per match, used below as the Poisson rate parameters.
brazil_scored_mean = brazil_games.column('goalsScored').mean()
brazil_conceded_mean = brazil_games.column('goalsConceded').mean()

print(brazil_scored_mean, brazil_conceded_mean, brazil_num_games)


In [None]:
# Compare Brazil's observed goal frequencies (0..10 goals per match) with the
# Poisson probabilities obtained from the corresponding sample means.
brazil_goals = Table().with_columns(
    'goals', np.arange(11),
    # Observed share of matches in which Brazil scored exactly k goals.
    'scored', np.array([np.count_nonzero(brazil_games.column('goalsScored') == k)
                        for k in range(11)]) / brazil_num_games,
    # Poisson estimate of the same share.
    'estScored', poisson.pmf(np.arange(11), brazil_scored_mean),
    # Observed share of matches in which Brazil conceded exactly k goals.
    'conceded', np.array([np.count_nonzero(brazil_games.column('goalsConceded') == k)
                          for k in range(11)]) / brazil_num_games,
    # Poisson estimate of the same share.
    'estConceded', poisson.pmf(np.arange(11), brazil_conceded_mean),
)

brazil_goals.show()

goals,scored,estScored,conceded,estConceded
0,0.137615,0.122344,0.422018,0.38163
1,0.275229,0.257035,0.33945,0.367626
2,0.229358,0.270005,0.155963,0.177067
3,0.165138,0.189086,0.0550459,0.0568565
4,0.12844,0.0993134,0.00917431,0.0136925
5,0.0366972,0.0417299,0.00917431,0.00263801
6,0.0183486,0.0146118,0.0,0.000423533
7,0.00917431,0.00438546,0.00917431,5.82844e-05
8,0.0,0.00115169,0.0,7.01819e-06
9,0.0,0.000268844,0.0,7.51182e-07


In [None]:
# Bar chart of columns 1 and 2 ('scored' vs 'estScored') by number of goals.
brazil_goals.bar('goals', make_array(1, 2))
# Bar chart of columns 3 and 4 ('conceded' vs 'estConceded') by number of goals.
brazil_goals.bar('goals', make_array(3, 4))

In [None]:
# Average goals scored / conceded per team; the resulting columns are labelled
# 'goalsScored mean' and 'goalsConceded mean' and are read by
# calcJointProbabilities() below.
team_strength = allteam.group('team', np.mean)
team_strength

team,goalsScored mean,goalsConceded mean
Algeria,1.0,1.46154
Angola,0.333333,0.666667
Argentina,1.69136,1.14815
Australia,0.8125,1.9375
Austria,1.48276,1.62069
Belgium,1.41667,1.5
Bolivia,0.166667,3.33333
Bosnia and Herzegovina,1.33333,1.33333
Brazil,2.10092,0.963303
Bulgaria,0.846154,2.03846


**.**

## **Define `calcJointProbabilities()`**

In [None]:
def calcJointProbabilities(home, away, max_goals=10):
    """Build the joint score distribution of a match between two teams.

    Each team's goal count is modelled as an independent Poisson variable
    whose rate is (own mean goals scored) * (opponent's mean goals conceded),
    both looked up in the module-level ``team_strength`` table.

    Args:
        home: Name of the home team, as it appears in ``team_strength``.
        away: Name of the away team, as it appears in ``team_strength``.
        max_goals: Largest goal count considered for each side (inclusive).
            Scores above this are truncated, so the cumulative column ends
            slightly below 1.

    Returns:
        A Table with columns 'hGoals', 'aGoals', 'probability' and
        'accumulated' (running sum of 'probability'), ordered with the home
        goal count varying slowest. If either team is missing from
        ``team_strength`` the table has these columns but no rows.
    """
    labels = ['hGoals', 'aGoals', 'probability', 'accumulated']

    known_teams = team_strength.column('team')
    if home not in known_teams or away not in known_teams:
        # No strength estimate available; callers detect this via num_rows.
        return Table(labels)

    def _mean(team, label):
        # Scalar lookup: .item(0) unwraps the single-element column array so
        # the Poisson rates (and resulting probabilities) are plain floats.
        return team_strength.where('team', team).column(label).item(0)

    # goals_scored * goals_conceded
    lamb_home = _mean(home, 'goalsScored mean') * _mean(away, 'goalsConceded mean')
    lamb_away = _mean(away, 'goalsScored mean') * _mean(home, 'goalsConceded mean')

    goals = np.arange(max_goals + 1)
    # joint[x, y] = P(home scores x) * P(away scores y)
    joint = np.outer(poisson.pmf(goals, lamb_home), poisson.pmf(goals, lamb_away))
    # 'ij' indexing keeps the original loop order: home goals outer, away inner.
    h_goals, a_goals = np.meshgrid(goals, goals, indexing='ij')
    probability = joint.ravel()

    return Table().with_columns(
        labels[0], h_goals.ravel(),
        labels[1], a_goals.ravel(),
        labels[2], probability,
        # Running total used by getScores() for inverse-CDF sampling.
        labels[3], np.cumsum(probability),
    )

### **Testing function**

In [None]:
# Joint score distribution for England (home) vs Brazil (away).
pdf_UK_BR = calcJointProbabilities('England', 'Brazil')

In [None]:
# Inspect the first five score lines of the distribution.
pdf_UK_BR.show(5)

**.**

In [None]:
# Draw one uniform random number in [0, 1) as a length-1 array.
toss = np.random.random_sample((1,))


In [None]:
def getScores(pdf_match, n):
    """Simulate a match ``n`` times and report the overall outcome.

    Args:
        pdf_match: Score distribution table; its first two columns are the
            home and away goal counts and it has an 'accumulated' column
            holding the running probability total.
        n: Number of simulated games.

    Returns:
        3 if the home side wins more simulated games than it loses, -3 if the
        away side does, and 0 for a tie (or when a random draw lies beyond
        every 'accumulated' value).
    """
    draws = np.random.random_sample((n,))

    # Too little data to sample scores: choose an outcome from the sum of the
    # draws, split into three bands.
    if pdf_match.num_rows < 5:
        total = np.sum(draws)
        if total < n / 3:
            return 3
        if total < n * 2 / 3:
            return 0
        return -3

    balance = 0  # home wins minus away wins over the simulated games
    for draw in draws:
        # First score line whose cumulative probability reaches the draw.
        candidates = pdf_match.where('accumulated', are.above_or_equal_to(draw))
        if candidates.num_rows == 0:
            return 0
        score = candidates.row(0)
        home_goals, away_goals = score[0], score[1]
        if home_goals > away_goals:
            balance += 1
        elif home_goals < away_goals:
            balance -= 1

    if balance > 0:
        return 3
    if balance < 0:
        return -3
    return 0

In [None]:
# Outcome of a best-of-5 simulation: 3 (home), 0 (tie) or -3 (away).
getScores(pdf_UK_BR, 5)

In [None]:
# Distribution of outcomes over 1000 repetitions with one simulated game each.
numSim = 1
res = np.array([getScores(pdf_UK_BR, numSim) for _ in range(1000)], dtype=float)
resTable = Table().with_column('value', res)
resTable.hist(bins=np.arange(-3, 5, 1))

In [None]:
# Distribution of outcomes over 1000 repetitions with three simulated games each.
numSim = 3
res = np.array([getScores(pdf_UK_BR, numSim) for _ in range(1000)], dtype=float)
resTable = Table().with_column('value', res)
resTable.hist(bins=np.arange(-3, 5, 1))

In [None]:
# Distribution of outcomes over 1000 repetitions with five simulated games each.
numSim = 5
res = np.array([getScores(pdf_UK_BR, numSim) for _ in range(1000)], dtype=float)
resTable = Table().with_column('value', res)
resTable.hist(bins=np.arange(-3, 5, 1))

## **Predicting World Cup 2022**

### **Group Stage**

In [None]:
# create the group table
def create_group():
    """Build the initial group-stage standings.

    Reads the module-level ``fifa2022`` table in eight consecutive blocks of
    six rows and takes each block's distinct 'home' teams as the members of
    'Group A' .. 'Group H', every team starting on 0 points.

    Returns:
        A pandas DataFrame with columns 'group', 'team' and 'pts'.
    """
    standings = Table(['group', 'team', 'pts'])
    for group_no in range(8):
        label = 'Group ' + chr(ord('A') + group_no)
        first_row = group_no * 6
        group_matches = fifa2022.take[first_row:first_row + 6]
        # group('home') yields one row per distinct home team in the block.
        members = group_matches.group('home').column('home')
        block = Table().with_columns(
            'group', np.array([label] * 4),
            'team', members,
            'pts', np.array([0] * 4),
        )
        standings = standings.append(block)

    return standings.to_df()

In [None]:
# Fresh standings DataFrame with every team on 0 points.
df_group = create_group()
df_group

In [None]:
# simulation of Group Stage
def simul_group_stage(df_group, n):
    """Simulate the 48 group-stage matches and award points.

    Args:
        df_group: Standings DataFrame with 'team' and 'pts' columns; it is
            updated in place.
        n: Number of simulated games per match, forwarded to getScores().

    Returns:
        The updated standings converted to a datascience Table.
    """
    home_teams = fifa2022.column('home')
    away_teams = fifa2022.column('away')

    for match_no in range(48):
        home, away = home_teams[match_no], away_teams[match_no]
        outcome = getScores(calcJointProbabilities(home, away), n)

        # Row labels of the two teams in the standings.
        home_idx = df_group.loc[df_group.team == home].index[0]
        away_idx = df_group.loc[df_group.team == away].index[0]

        if outcome > 0:
            df_group.loc[home_idx, 'pts'] += 3
        elif outcome < 0:
            df_group.loc[away_idx, 'pts'] += 3
        else:
            # Draw: one point each.
            df_group.loc[home_idx, 'pts'] += 1
            df_group.loc[away_idx, 'pts'] += 1

    return Table().from_df(df_group)

In [None]:
# Run the group stage with one simulated game per match; df_group is also
# updated in place by this call.
groupTbl = simul_group_stage(df_group, 1)
groupTbl

### **Knock out**

In [None]:
# Rows 48-55 of the fixture list (the eight matches right after the 48
# group-stage rows), with a placeholder 'winner' column to be filled in.
knockout = fifa2022.take[48:56].with_column('winner', np.array(['?']*8))
df_knockout = knockout.to_df()
df_knockout

In [None]:
# Distinct group labels present in the standings table.
groupTbl.group('group').column('group')

In [None]:
def update_knockout(groupTbl, df_knockout):
    """Fill the first knockout round with the group winners and runners-up.

    For every group the teams are ranked by points. If second and third place
    are level on points, a single-game simulation between them is repeated
    until it is decisive and its winner takes the runners-up slot.

    Args:
        groupTbl: Standings Table with columns 'group', 'team', 'pts'.
        df_knockout: Fixture DataFrame containing placeholders such as
            'Winners Group A' / 'Runners-up Group A'; modified in place.

    Returns:
        The same ``df_knockout`` object with placeholders replaced.
    """
    for group in groupTbl.group('group').column('group'):
        ranked = groupTbl.where('group', group).sort('pts', descending=True)
        first, second, third = ranked.row(0), ranked.row(1), ranked.row(2)

        group_winner = first[1]
        runners_up = second[1]

        if second[2] == third[2]:
            # Tie on points for second place: play it off.
            challenger = third[1]
            pdf_match = calcJointProbabilities(runners_up, challenger)
            res = getScores(pdf_match, 1)
            while res == 0:
                res = getScores(pdf_match, 1)
            if res < 0:
                runners_up = challenger

        placeholders = {
            f'Winners {group}': group_winner,
            f'Runners-up {group}': runners_up,
        }
        df_knockout.replace(placeholders, inplace=True)

    return df_knockout

# Replace the group placeholders in df_knockout (in place) and display it.
update_knockout(groupTbl, df_knockout)

In [None]:
def get_winner(df_knockout):
    """Simulate every match of a knockout round and record the winners.

    Each match is simulated one game at a time until the result is not a
    draw; the winning side is written to the 'winner' column.

    Args:
        df_knockout: Round DataFrame with 'home', 'away' and 'winner'
            columns; modified in place.

    Returns:
        The same DataFrame with 'winner' filled in.
    """
    for index, row in df_knockout.iterrows():
        home, away = row['home'], row['away']
        pdf_match = calcJointProbabilities(home, away)

        # Knockout matches cannot end level: resimulate until decisive.
        res = 0
        while res == 0:
            res = getScores(pdf_match, 1)

        df_knockout.loc[index, 'winner'] = home if res > 0 else away

    return df_knockout

In [1]:
# Simulate the first knockout round and fill in its 'winner' column.
get_winner(df_knockout)

NameError: ignored

### **Quarter Final**

In [None]:
# Rows 56-59 of the fixture list: the four quarter-final matches, with a
# placeholder 'winner' column.
quarter = fifa2022.take[56:60].with_column('winner', np.array(['?']*4))
df_quarter = quarter.to_df()
df_quarter

In [None]:
def update_table(df_round_1, df_round_2):
    """Propagate the winners of one round into the next round's fixtures.

    For each match in ``df_round_1`` the placeholder 'Winners <score>' in
    ``df_round_2`` is replaced by that match's winner, where <score> is the
    value stored in the match's 'score' column.

    Args:
        df_round_1: Completed round with 'score' and 'winner' columns.
        df_round_2: Next round's fixture DataFrame; modified in place.

    Returns:
        The same ``df_round_2`` object with placeholders replaced.
    """
    for index in df_round_1.index:
        match_label = df_round_1.loc[index, 'score']
        advancing = df_round_1.loc[index, 'winner']
        df_round_2.replace({f'Winners {match_label}': advancing}, inplace=True)
    return df_round_2

In [None]:
# Put the first-round winners into the quarter-final fixtures.
update_table(df_knockout, df_quarter)

In [None]:
# Simulate the quarter-finals and fill in the 'winner' column.
get_winner(df_quarter)

### **Semifinal**

In [None]:
# Rows 60-61 of the fixture list: the two semi-final matches, with a
# placeholder 'winner' column.
semi = fifa2022.take[60:62].with_column('winner', np.array(['?']*2))
df_semi = semi.to_df()
df_semi

In [None]:
# Put the quarter-final winners into the semi-final fixtures.
update_table(df_quarter, df_semi)

In [None]:
# Simulate the semi-finals and fill in the 'winner' column.
get_winner(df_semi)

### **Final**

In [None]:
# Row 63 of the fixture list is used as the final.
# NOTE(review): row 62 is skipped — presumably the third-place match; confirm
# against the CSV.
final = fifa2022.take[63:64].with_column('winner', np.array(['?']))
df_final = final.to_df()
df_final

In [None]:
# Put the semi-final winners into the final fixture.
update_table(df_semi, df_final)

In [None]:
# Simulate the final and fill in the 'winner' column.
get_winner(df_final)

In [None]:
# The simulated champion: winner of the single row in df_final.
df_final.loc[0, 'winner']

### **Simulate World Cup 2022**

In [None]:
def sim_worldcup2022(n): # n is the number of simulate games for each match
    """Simulate the whole 2022 tournament once and return the champion.

    Args:
        n: Number of simulated games per group-stage match. Knockout matches
            are always decided by get_winner(), which simulates one game at
            a time.

    Returns:
        The value of the 'winner' column for the final.
    """
    standings = simul_group_stage(create_group(), n)

    # (first row, end row) of each knockout round within fifa2022:
    # first knockout round, quarter-finals, semi-finals, final.
    round_slices = [(48, 56), (56, 60), (60, 62), (63, 64)]

    previous_round = None
    for start, stop in round_slices:
        fixtures = fifa2022.take[start:stop].with_column(
            'winner', np.array(['?'] * (stop - start)))
        current_round = fixtures.to_df()

        if previous_round is None:
            # First knockout round is seeded from the group standings.
            update_knockout(standings, current_round)
        else:
            update_table(previous_round, current_round)
        get_winner(current_round)

        previous_round = current_round

    return previous_round.loc[0, 'winner']

In [None]:
# Run 100 independent tournament simulations (one simulated game per
# group-stage match) and collect the champion of each run.
result = make_array()
for i in range(100):
    # Progress indicator: one dot per completed tournament.
    print('.', end='')
    result = np.append(result, sim_worldcup2022(1))
resTbl = Table().with_column('winner', result)

In [None]:
# Horizontal bar chart of how often each team won, most frequent first.
resTbl.group('winner').sort('count', descending=True).barh('winner')