In [1]:
import io
import os
import typing
import time
import random
from typing import Union, Tuple, List, Literal
import pickle
from pathlib import Path
from pprint import pprint
import gc

from tqdm.auto import tqdm
import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

from crawler import wait_random_time, parse_detailed_result

In [2]:
main_path = Path('../score_analysis')
data_path = main_path / 'data'

In [3]:
data_paths = list(data_path.glob('*'))
data_paths[:5]

[PosixPath('../score_analysis/data/ger-nathalie_weinzierl'),
 PosixPath('../score_analysis/data/jpn-mao_asada'),
 PosixPath('../score_analysis/data/jpn-kanako_murakami'),
 PosixPath('../score_analysis/data/jpn-akiko_suzuki'),
 PosixPath('../score_analysis/data/usa-gracie_gold')]

In [4]:
len(data_paths)

40

In [5]:
df = pd.read_csv(list(data_paths[0].glob("*.csv"))[0], index_col=0)
df.head()

Unnamed: 0,season,event-title,event-url,location,date,short-program-link,short-program-score,short-program-ranking,freeskating-link,freeskating-score,freeskating-ranking,final-link,final-score,final-ranking
0,2007/08,Cup of Nice,https://skatingscores.com/0708/cupnic/,Nice,2007-10-18,https://skatingscores.com/0708/cupnic/jr/women...,42.74,2J,https://skatingscores.com/0708/cupnic/jr/women...,65.75,5J,https://skatingscores.com/0708/cupnic/jr/women...,108.49,4J
1,2007/08,GER Nationals,https://skatingscores.com/0708/natger/,Dresden,2008-01-03,https://skatingscores.com/0708/natger/jr/women...,45.46,1J,https://skatingscores.com/0708/natger/jr/women...,68.36,4J,https://skatingscores.com/0708/natger/jr/women...,113.82,2J
2,2008/09,GER Nationals,https://skatingscores.com/0809/natger/,Oberstdorf,2008-12-18,https://skatingscores.com/0809/natger/sr/women...,37.9,9,https://skatingscores.com/0809/natger/sr/women...,58.69,15,https://skatingscores.com/0809/natger/sr/women...,96.59,13
3,2009/10,JGP BLR,https://skatingscores.com/0910/jgpblr/,Minsk,2009-09-23,https://skatingscores.com/0910/jgpblr/jr/women...,29.84,22J,https://skatingscores.com/0910/jgpblr/jr/women...,38.6,25J,https://skatingscores.com/0910/jgpblr/jr/women...,68.44,24J
4,2009/10,JGP TUR,https://skatingscores.com/0910/jgptur/,Istanbul,2009-10-14,https://skatingscores.com/0910/jgptur/jr/women...,29.88,29J,https://skatingscores.com/0910/jgptur/jr/women...,47.45,30J,https://skatingscores.com/0910/jgptur/jr/women...,77.33,28J


In [6]:
def get_and_parse(link: str):
    response = requests.get(link)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return parse_detailed_result(soup)

def get_SP_FS_data(short_program_link: Union[str, float], freeskating_link: Union[str, float]):
    if short_program_link == '' or type(short_program_link) != str:
        short_program_data = None
    else:
        short_program_data = get_and_parse(short_program_link)
    if freeskating_link == '' or type(freeskating_link) != str:
        freeskating_data = None
    else:
        freeskating_data = get_and_parse(freeskating_link)
    return short_program_data, freeskating_data

In [7]:
def get_specific_score(player_name: str, df: pd.DataFrame, save: bool=True, target_path: Path=Path('.')) -> Tuple[pd.DataFrame]:
    result_data = dict()
    for i, row in tqdm(df.iterrows(), total=len(df), desc=player_name):
        short_program_link = row['short-program-link']
        freeskateing_link = row['freeskating-link']
        season = row['season']
        season = season[:season.find('/')]
        event_title = row['event-title']
        if (target_path / f'{player_name}-{season}-{event_title}.pkl').exists():
            with open(target_path / f'{player_name}-{season}-{event_title}.pkl', 'rb') as f:
                data = pickle.load(f)
        else:
            try:
                data = get_SP_FS_data(short_program_link, freeskateing_link)
            except Exception as e:
                print(short_program_link)
                print(freeskateing_link)
                raise e
        result_data['event_title'] = data
        wait_random_time(mean=1, min=0.5, max=2)
        if save:
            with open(target_path / f'{player_name}-{season}-{event_title}.pkl', 'wb') as f:
                pickle.dump(data, f)

    return result_data

In [8]:
for player_data_path in tqdm(data_paths, desc='Players'):
    if (player_data_path / 'DONE').exists():
        continue
    top_data_path = list(player_data_path.glob('*.csv'))[0]
    df = pd.read_csv(top_data_path, index_col=0)
    path_name = player_data_path.name
    player_name = player_data_path.name[path_name.find('-') + 1 : ]
    try:
        get_specific_score(player_name, df, save=True, target_path=player_data_path)
    except Exception as e:
        print(e)
        print(top_data_path) 
        raise e
    gc.collect()
    with open(player_data_path / 'DONE', 'w') as f:
        f.write(' ')

Players:   0%|          | 0/40 [00:00<?, ?it/s]

elene_gedevanishvili:   0%|          | 0/51 [00:00<?, ?it/s]

liam_firus:   0%|          | 0/35 [00:00<?, ?it/s]

daisuke_takahashi:   0%|          | 0/62 [00:00<?, ?it/s]

jorik_hendrickx:   0%|          | 0/55 [00:00<?, ?it/s]

anne_line_gjersem:   0%|          | 0/58 [00:00<?, ?it/s]

adelina_sotnikova:   0%|          | 0/35 [00:00<?, ?it/s]

denis_ten:   0%|          | 0/67 [00:00<?, ?it/s]

peter_liebers:   0%|          | 0/70 [00:00<?, ?it/s]

jenna_mccorkell:   0%|          | 0/57 [00:00<?, ?it/s]

ashley_wagner:   0%|          | 0/66 [00:00<?, ?it/s]

michael_christian_martinez:   0%|          | 0/44 [00:00<?, ?it/s]

misha_ge:   0%|          | 0/41 [00:00<?, ?it/s]

isadora_williams:   0%|          | 0/42 [00:00<?, ?it/s]

brooklee_han:   0%|          | 0/56 [00:00<?, ?it/s]

kaetlyn_osmond:   0%|          | 0/37 [00:00<?, ?it/s]

brian_joubert:   0%|          | 0/62 [00:00<?, ?it/s]

alexei_bychenko:   0%|          | 0/68 [00:00<?, ?it/s]

tatsuki_machida:   0%|          | 0/49 [00:00<?, ?it/s]