In [5]:
import os
import time
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt 
import json
import re
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pprint import pprint
from typing import Dict, Any, Tuple, Optional, Iterable
from urllib.parse import urljoin

from requests import Session
from bs4 import BeautifulSoup, Tag




In [9]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.wait import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains

In [2]:
# OddsPortal matches page targeted by the Selenium and BeautifulSoup cells below.
url = "https://www.oddsportal.com/matches/"
# Earlier idea, kept for reference: pd.read_html(link, match="table-matches")

In [None]:
# Open the matches page in a visible Firefox window and read the rendered text
# of the element carrying the "table-matches" class.
options = Options()
options.headless = False
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=options)
try:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    # Block for up to 10 seconds until the table rows are visible
    # (presumably they are rendered client-side -- TODO confirm).
    wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//*[contains(@class,"table-matches")]//tr')))
    # Select by class: '//table-matches' would look for a <table-matches> element instead.
    matches_text = driver.find_element(By.XPATH, '//*[contains(@class,"table-matches")]').text
finally:
    # Always close the browser, even when the wait times out or the lookup fails.
    driver.quit()
matches_text

In [4]:
# Plain HTTP probe of the matches page; the recorded output for this cell is a 404.
requests.get(url)

<Response [404]>

In [None]:
# BeautifulSoup parses markup, not URLs: download the page first, then parse its HTML.
# The browser-like User-Agent is the same one main() sets on its session.
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [6]:

# Site root; start_search and get_page resolve their relative paths against it with urljoin.
BASE = 'https://www.oddsportal.com'
# Matches a CSS class name of the form "t<digits>-..." and captures the digits,
# which decode_terrible_timestamp_class interprets as a Unix timestamp.
TERRIBLE_TIMESTAMP_PAT = re.compile(r'^t(\d+)-')


In [7]:
def decode_terrible_timestamp_class(tag: Tag) -> datetime:
    """Extract the timestamp hidden in one of a tag's CSS class names.

    The first class name matching TERRIBLE_TIMESTAMP_PAT wins; its captured
    digits are read as seconds since the Unix epoch.

    Returns:
        A timezone-aware datetime in UTC.

    Raises:
        ValueError: if none of the tag's classes matches the pattern.
    """
    attempts = (TERRIBLE_TIMESTAMP_PAT.search(name) for name in tag['class'])
    # Lazily stop at the first class name that produced a match.
    found = next((hit for hit in attempts if hit), None)
    if found is None:
        raise ValueError(f'{tag} does not seem to contain a valid timestamp')

    seconds = int(found.group(1))
    return datetime.fromtimestamp(seconds, tz=timezone.utc)

In [8]:
# Character substitution table used to decode the obfuscated `xodd` attribute values.
# Mirrors `this.d = function (str)` in
# https://www.oddsportal.com/res/x/global-210609145530.js
ODDS_TABLE = str.maketrans('axcteopzf', '1234567.|')

In [10]:
def decode_terrible_odds(decorated: Tag) -> Tuple[float, Optional[float]]:
    """Decode the obfuscated odds stored in a tag's ``xodd`` attribute.

    The attribute is translated through ODDS_TABLE and split on ``|``.
    A single value is the normal odds; two values are ``preferred|normal``.

    Args:
        decorated: tag (or any mapping) exposing an ``'xodd'`` entry.

    Returns:
        ``(normal, preferred)``; ``preferred`` is None when the attribute
        holds only one value.

    Raises:
        ValueError: if a decoded part is not a valid float, or if the
            attribute holds more than two ``|``-separated parts.
    """
    odds = decorated['xodd'].translate(ODDS_TABLE).split('|')

    if len(odds) == 1:
        # Convert the single entry; float() on the list itself raises TypeError.
        return float(odds[0]), None
    preferred, normal = odds
    return float(normal), float(preferred)


In [14]:

def start_search(session: Session, year: int) -> Dict[str, Any]:
    """Fetch a Premier League results page and return its tournament IDs.

    The page embeds a ``new PageTournament({...})`` call in an inline script;
    the JSON object passed to that call is extracted and parsed.

    Args:
        session: requests session used for the HTTP GET.
        year: first year of the season; the URL is built for ``year``-``year+1``.

    Returns:
        The parsed JSON object (main() reads its ``'sid'`` and ``'id'`` keys).

    Raises:
        requests.HTTPError: if the page request fails.
        ValueError: if no inline script contains the PageTournament call, or
            the call is not followed by a closing ``}``.
    """
    url = urljoin(BASE, f'/soccer/england/premier-league-{year}-{year+1}/results/')
    with session.get(url) as resp:
        resp.raise_for_status()
        doc = BeautifulSoup(resp.text, 'lxml')

    script_pat = re.compile(r'new PageTournament\(')
    for script in doc.select('body > script'):
        # .string is None for scripts without a single text child (e.g. src-only
        # tags); searching None would raise TypeError before reaching the right script.
        source = script.string or ''
        match = script_pat.search(source)
        if match:
            break
    else:
        raise ValueError('ID script not found')

    start = match.end()
    # The argument is treated as a flat JSON object ending at the first '}'.
    closing = source.find('}', start)
    if closing == -1:
        raise ValueError('ID script does not contain a complete JSON object')
    return json.loads(source[start: closing + 1])

In [13]:

def get_page(session: Session, sid: int, id: str, page: int) -> Iterable['GameData']:
    """Yield the games listed on one page of a tournament's results archive.

    The archive endpoint returns text wrapping a JSON object whose
    ``['d']['html']`` entry is an HTML table. Date header rows set the current
    date; every row with a ``td.table-time`` cell becomes one GameData.

    Args:
        session: requests session used for the HTTP GET.
        sid: value for the URL's ``sid`` segment (from start_search).
        id: value for the URL's ``id`` segment (from start_search).
        page: archive page number placed in the URL.

    Yields:
        One GameData per game row, in table order.

    Raises:
        requests.HTTPError: if the archive request fails.
        ValueError: if a game row appears before any usable date header.
    """
    # The return annotation is a string so this cell can be defined before
    # the GameData cell runs (GameData appears later in the file).
    bookie_hash = 'X0'
    use_premium = 1
    timezone_offset = 0

    archive_url = urljoin(
        BASE,
        f'/ajax-sport-country-tournament-archive'
        f'/{sid}'
        f'/{id}'
        f'/{bookie_hash}'
        f'/{use_premium}'
        f'/{timezone_offset}'
        f'/{page}'
        f'/'
    )

    with session.get(archive_url) as resp:
        resp.raise_for_status()
        # Keep only the outermost {...}; anything around it is discarded.
        start = resp.text.find('{')
        end = resp.text.rfind('}') + 1
        json_str = resp.text[start: end]

    html = json.loads(json_str)['d']['html']
    doc = BeautifulSoup(html, 'lxml')

    # The first header cell holds three links: sport, country, season.
    head = doc.find('th')
    sport, country, season = head.find_all('a')

    # Most recent date header seen; None until the first one is encountered.
    date_span = None
    for tr in doc.find_all('tr'):
        date_header = tr.select_one('th[colspan="3"]')
        if date_header:
            date_span = date_header.find('span')
            continue

        if tr.select_one('td.table-time'):
            if date_span is None:
                # Previously surfaced as UnboundLocalError / TypeError further down.
                raise ValueError(f'{tr} is not preceded by a date header')
            yield GameData.from_tags(
                sport, country, season, date_span,
                *tr.find_all('td'),
            )

In [12]:
@dataclass
class GameData:
    """One game row scraped from a tournament results table.

    Instances are built by from_tags from the BeautifulSoup tags of a row
    plus the header links and date span that apply to it.
    """
    # Header links: stripped link text and the link's href.
    sport: str
    sport_path: str
    country: str
    country_path: str
    season: str
    season_path: str

    # Kick-off as an aware UTC datetime, and the href of the match link.
    when: datetime
    path: str

    home_team: str
    away_team: str
    home_team_won: bool
    away_team_won: bool
    home_score: int
    away_score: int

    # Each *_preferred value is None when the cell carries a single odds value.
    home_odds: float
    home_odds_preferred: Optional[float]
    draw_odds: float
    draw_odds_preferred: Optional[float]
    away_odds: float
    away_odds_preferred: Optional[float]

    # Integer parsed from the text of the bookmakers cell.
    bookmakers: int

    @classmethod
    def from_tags(
        cls,
        sport: Tag,
        country: Tag,
        season: Tag,
        date_span: Tag,
        time: Tag,
        teams: Tag,
        score: Tag,
        home_odds: Tag,
        draw_odds: Tag,
        away_odds: Tag,
        bookmakers: Tag,
    ):
        """Build a GameData from the tags of one results-table row.

        Args:
            sport, country, season: header links (text and ``href`` are used).
            date_span: tag whose class encodes the date of the game.
            time: cell whose class encodes the time of day of the game.
            teams: cell containing the match link with both team names.
            score: cell whose text is ``<home>:<away>``.
            home_odds, draw_odds, away_odds: cells carrying an ``xodd`` attribute.
            bookmakers: cell whose text is an integer.

        Raises:
            ValueError: if the score, team names, timestamps or odds do not
                have the expected shape.
        """
        home_score, away_score = score.text.split(':')

        # Date comes from the date header, time of day from the row's own cell.
        when = datetime.combine(
            decode_terrible_timestamp_class(date_span).date(),
            decode_terrible_timestamp_class(time).time(),
            tzinfo=timezone.utc,
        )

        team_anchor = teams.find('a')
        if team_anchor.find('span'):
            # One team is wrapped in a <span>; that team is recorded as the
            # winner, the other is a bare string with the ' - ' separator attached.
            parts = list(team_anchor.children)
            home_team, away_team = (
                t.text if isinstance(t, Tag) else t.strip('- ')
                for t in parts
            )
            home_team_won, away_team_won = (isinstance(t, Tag) for t in parts)
        else:
            # No <span>: neither team is marked as winner. Strip the whitespace
            # around the separator so names match those from the branch above.
            # NOTE(review): a team name containing '-' makes this unpacking fail.
            home_team, away_team = (
                name.strip() for name in team_anchor.text.split('-')
            )
            home_team_won, away_team_won = False, False

        home_odds_norm, home_odds_pref = decode_terrible_odds(home_odds)
        draw_odds_norm, draw_odds_pref = decode_terrible_odds(draw_odds)
        away_odds_norm, away_odds_pref = decode_terrible_odds(away_odds)

        game = cls(
            sport=sport.text.strip(),
            sport_path=sport['href'],
            country=country.text.strip(),
            country_path=country['href'],
            season=season.text.strip(),
            season_path=season['href'],
            when=when,
            path=team_anchor['href'],
            home_team=home_team,
            away_team=away_team,
            home_team_won=home_team_won,
            away_team_won=away_team_won,
            home_score=int(home_score),
            away_score=int(away_score),
            home_odds=home_odds_norm,
            home_odds_preferred=home_odds_pref,
            draw_odds=draw_odds_norm,
            draw_odds_preferred=draw_odds_pref,
            away_odds=away_odds_norm,
            away_odds_preferred=away_odds_pref,
            bookmakers=int(bookmakers.text),
        )
        return game

In [11]:


def main():
    """Pretty-print every game on archive pages 1 and 2 of the 2020-2021 season."""
    first_page, last_page = 1, 2

    with Session() as http:
        # Replace the session's headers with a single browser-like User-Agent.
        http.headers = {'User-Agent': 'Mozilla/5.0'}

        tournament = start_search(http, year=2020)
        sid, tournament_id = tournament['sid'], tournament['id']

        for page_no in range(first_page, last_page + 1):
            for game in get_page(http, sid, tournament_id, page_no):
                pprint(asdict(game))


In [None]:

# Run the scraper when this cell is executed in a notebook kernel or the file
# is run as a script; the guard is false when the code is imported as a module.
if __name__ == '__main__':
    main()

In [None]:
# For every participant cell, print the match name followed by the raw
# (still encoded) `xodd` values of its first three odds cells.
odds = soup.find_all('td', class_="name table-participant")
for el in odds:
    links_in_first_column = el.find_all('a')
    # A cell may hold several links; their stripped texts are concatenated.
    match_name = ''.join(link.text.strip() for link in links_in_first_column)
    print(match_name)

    # Only sibling cells that carry an `xodd` attribute are considered.
    odds_columns = el.find_next_siblings('td', xodd=True)
    for column_index in range(3):
        print (odds_columns[column_index]['xodd'])