In [74]:
import pandas as pd
import numpy as np
import psycopg2
from scipy import stats

In [75]:
# Column labels for the rows returned by the query below, in SELECT order.
columns = ['id', 'home_goal', 'away_goal', 'result']

# One row per match; `Result` is 1 when the home side scored more goals than
# the away side and 0 otherwise (draws and away wins both map to 0).
# Plain string: nothing is interpolated, so an f-string is not needed.
query = """
SELECT match_api_id, home_team_goal, away_team_goal, 
CASE WHEN home_team_goal > away_team_goal THEN 1 
         ELSE 0 END as Result
FROM Match
"""

# Fetch everything in one go, then release the cursor and the connection even
# if the query fails, so re-running this cell does not leak open connections.
conn = psycopg2.connect('dbname=football_db')
try:
    with conn.cursor() as cur:
        cur.execute(query)
        data = cur.fetchall()
finally:
    conn.close()

In [76]:
# Load the fetched rows into a frame using the column labels defined earlier.
df = pd.DataFrame(data, columns=columns)

# Number of rows flagged as a win (result == 1) and number of rows overall.
total_wins = int(df['result'].eq(1).sum())
total_games = df.shape[0]

In [77]:
# Overall win rate: share of all rows whose `result` flag is 1.
mu = total_wins / total_games

In [78]:
sample_size = 2000       # how many resamples to draw (one win rate per resample)
number_of_games = 2000   # matches drawn, with replacement, in each resample

# Dedicated seeded generator so the sampled win rates are reproducible after a
# kernel restart instead of depending on NumPy's global random state.
sampling_rng = np.random.default_rng(42)

# Pull the 0/1 outcome column out once rather than on every iteration.
results = df['result'].to_numpy()

samples = np.zeros(sample_size)
for i in range(sample_size):
    # Draw `number_of_games` outcomes with replacement and record the fraction
    # of them that are wins (mean of a 0/1 array == sum / count).
    samples[i] = sampling_rng.choice(results, size=number_of_games).mean()

In [85]:
# Average of the resampled win rates.
samples.mean()

0.45875925

In [91]:
n_bootstrap = 500      # number of bootstrap resamples to draw
bootstrap_size = 500   # values drawn, with replacement, per resample
# NOTE(review): a textbook bootstrap resamples len(samples) values; 500 is kept
# here to preserve the existing analysis — confirm whether that is intentional.

# Dedicated seeded generator so the bootstrap statistics are reproducible.
bootstrap_rng = np.random.default_rng(43)

bootstrap_samples = []                            # every resample, kept for inspection
bootstrap_sample_means = np.zeros(n_bootstrap)    # mean of each resample
bootstrap_sample_95pcts = np.zeros(n_bootstrap)   # 95th percentile of each resample
for i in range(n_bootstrap):
    bootstrap_sample = bootstrap_rng.choice(samples, size=bootstrap_size)
    bootstrap_samples.append(bootstrap_sample)
    bootstrap_sample_means[i] = bootstrap_sample.mean()
    bootstrap_sample_95pcts[i] = np.percentile(a=bootstrap_sample, q=95)

In [92]:
# Peek at the first ten bootstrap means.
bootstrap_sample_means[:10]

array([0.45819 , 0.457822, 0.459051, 0.459243, 0.459073, 0.459129,
       0.459584, 0.458594, 0.458911, 0.458979])

In [93]:
# Peek at the first ten bootstrap 95th-percentile values.
bootstrap_sample_95pcts[:10]

array([0.476025, 0.4755  , 0.4765  , 0.4785  , 0.4785  , 0.48    ,
       0.478025, 0.478025, 0.479   , 0.475025])

In [94]:
# Bounds enclosing the central 95% of the bootstrap 95th-percentile values
# (2.5th and 97.5th percentiles), computed in a single call.
# NOTE(review): this interval is built from the per-resample 95th percentiles,
# not from the per-resample means — confirm that is the intended statistic.
cf_low, cf_high = np.percentile(bootstrap_sample_95pcts, [2.5, 97.5])

In [95]:
# Report the lower and upper percentile bounds computed above.
print(f'Confidence Interval: Between {cf_low} and {cf_high}')

Confidence Interval: Between 0.475025 and 0.48
