In [276]:
import requests
from bs4 import BeautifulSoup
import unicodedata
import pandas as pd
import time
import random
import os
import shutil

In [263]:
SLEEP_UPPER_BOUND = 5

## Find individual match links per world cup

In [215]:
def find_scorecard_links(year_fp):
    time.sleep(random.randint(0,SLEEP_UPPER_BOUND))
    
    page = requests.get(year_fp)
    assert(page.status_code == 200)
    
    soup = BeautifulSoup(page.content, "html.parser")
    scorecard_tags = soup.find_all(lambda tag: tag.get("class") == ["match-info-link-FIXTURES"])
    
    scorecard_links = []
    site_prefix = "https://www.espncricinfo.com"
    for link in scorecard_tags:
        scorecard_links.append(site_prefix + link.get("href"))

    return scorecard_links

## Find results per match

In [345]:
def find_full_match_results(scorecard_fp):
    time.sleep(random.randint(0,SLEEP_UPPER_BOUND))
    
    page = requests.get(scorecard_fp)
    assert(page.status_code == 200)
    
    soup = BeautifulSoup(page.content, "html.parser")
    
    match_info = soup.find("div", {"class":"match-info match-info-MATCH"})
    status = match_info.find("div", {"class":"status"})
    
    match_result_dict = {}
    match_result_dict["result"] = status.text
    match_result_dict["batsman"] = find_result_df(soup, "table batsman", 8)
    match_result_dict["bowler"] = find_result_df(soup, "table bowler", 11)
    
    return match_result_dict

In [341]:
def find_result_df(soup, class_name, expected_num_cols):
    time.sleep(random.randint(0,SLEEP_UPPER_BOUND))
    
    tables = soup.find_all("table", {"class":class_name})
    
    full_results = []
    for table in tables:
        table_rows = table.find_all("tr")

        result = []
        for tr in table_rows:
            tags = tr.find_all(["th", "td"])

            row = []
            for i in tags:
                text = i.text
                text = unicodedata.normalize("NFKD", text)
                text = text.encode("ascii", errors="ignore").decode()
                text = text.replace("(c)", "")
                text = text.strip()
                row.append(text)

            if len(row) == expected_num_cols:
                result.append(row)

        result_df = pd.DataFrame(result[1:], columns=result[0])
        full_results.append(result_df)

    return full_results

### Test individual match results

In [346]:
find_full_match_results("https://www.espncricinfo.com/series/icc-cricket-world-cup-2019-1144415/south-africa-vs-west-indies-15th-match-1144497/full-scorecard")

{'result': 'no result',
 'batsman': [           BATSMEN                       R   B   M 4s 6s     SR
  0  Quinton de Kock             not out  17  21  37  1  0  80.95
  1      Hashim Amla  c Gayle b Cottrell   6   7  13  1  0  85.71
  2    Aiden Markram   c Hope b Cottrell   5  10  16  1  0  50.00
  3   Faf du Plessis             not out   0   7   9  0  0   0.00],
 'bowler': [            BOWLING    O  M   R  W  ECON  0s 4s 6s WD NB
  0  Sheldon Cottrell    4  1  18  2  4.50  17  2  0  0  0
  1       Kemar Roach    3  0  10  0  3.33  13  1  0  1  0
  2     Oshane Thomas  0.3  0   1  0  2.00   2  0  0  0  0]}

In [347]:
find_full_match_results("https://www.espncricinfo.com/series/icc-cricket-world-cup-2019-1144415/india-vs-new-zealand-18th-match-1144500/full-scorecard")

{'result': 'abandoned', 'batsman': [], 'bowler': []}

In [352]:
find_full_match_results("https://www.espncricinfo.com/series/icc-cricket-world-cup-2019-1144415/australia-vs-sri-lanka-20th-match-1144502/full-scorecard")

{'result': 'result',
 'batsman': [          BATSMEN                           R    B    M  4s 6s      SR
  0    David Warner             b de Silva   26   48   70   2  0   54.17
  1     Aaron Finch  c Karunaratne b Udana  153  132  176  15  5  115.91
  2   Usman Khawaja     c Udana b de Silva   10   20   21   1  0   50.00
  3    Steven Smith              b Malinga   73   59   90   7  1  123.73
  4   Glenn Maxwell                not out   46   25   40   5  1  184.00
  5     Shaun Marsh  c Siriwardana b Udana    3    9   19   0  0   33.33
  6      Alex Carey        run out (Udana)    4    3    8   0  0  133.33
  7     Pat Cummins        run out (Udana)    0    1    4   0  0    0.00
  8  Mitchell Starc                not out    5    4    7   0  0  125.00,
                  BATSMEN                           R    B    M 4s 6s      SR
  0    Dimuth Karunaratne  c Maxwell b Richardson  97  108  132  9  0   89.81
  1          Kusal Perera                 b Starc  52   36   64  5  1  144.44
  2

# Loop over all world cups

In [348]:
world_cup_links = [
    "https://www.espncricinfo.com/series/icc-cricket-world-cup-2019-1144415/match-results",
]

In [349]:
year_labels = [
    "2019",
]

In [350]:
world_cup_results = {}
for year_fp, year_label in zip(world_cup_links, year_labels):
    scorecard_links = find_scorecard_links(year_fp)
    
    year_results = []
    for scorecard_fp in scorecard_links:
        year_results.append(find_full_match_results(scorecard_fp))

    world_cup_results[year_label] = year_results

### Save world cup results data

In [351]:
world_cup_results

{'2019': [{'result': 'result',
   'batsman': [               BATSMEN                              R   B    M 4s 6s      SR
    0       Martin Guptill               lbw b Woakes  19  18   30  2  1  105.56
    1       Henry Nicholls                 b Plunkett  55  77  123  4  0   71.43
    2      Kane Williamson       c Buttler b Plunkett  30  53   77  2  0   56.60
    3          Ross Taylor                 lbw b Wood  15  31   47  0  0   48.39
    4           Tom Latham  c sub (JM Vince) b Woakes  47  56  104  2  1   83.93
    5        James Neesham          c Root b Plunkett  19  25   28  3  0   76.00
    6  Colin de Grandhomme  c sub (JM Vince) b Woakes  16  28   38  0  0   57.14
    7     Mitchell Santner                    not out   5   9   19  0  0   55.56
    8           Matt Henry                   b Archer   4   2    7  1  0  200.00
    9          Trent Boult                    not out   1   2    3  0  0   50.00,
               BATSMEN                                  R   B    M

In [359]:
OVERWRITE_DATA = True

if OVERWRITE_DATA:
    shutil.rmtree("data/world_cup")
    
if not os.path.isdir("data/world_cup"):
    os.mkdir("data/world_cup")
    
    for year, matches in world_cup_results.items():
        for match_number, match in enumerate(matches):
            
            # skip if there is no result for the match (e.g. match abandoned)
            if match["result"] != "result":
                continue
                
            save_dir = f"data/world_cup/{year}/{match_number}/"
            
            # make directory if it non-existent
                if not os.path.isdir(save_dir):
                    os.makedirs(save_dir + "batsman/")
                    os.makedirs(save_dir + "bowler/")
            
            
                
            for match_key, match_value in match.items():
                print(match_key, match_value)
                # break 
                if match_key == "result":
                    if match_value != "result":
                        break
                

                

                

                # save both innings dataframes to directory
                print(save_dir)
                match_value[0].to_csv(save_dir + "innings_A.csv", index=False)
                match_value[1].to_csv(save_dir + "innings_B.csv", index=False)
                break
            break
        break

result result
data/world_cup/2019/0/result/


AttributeError: 'str' object has no attribute 'to_csv'

In [320]:
world_cup_results[2019][33]

{'batsman': [           BATSMEN                       R   B   M 4s 6s     SR
  0  Quinton de Kock             not out  17  21  37  1  0  80.95
  1      Hashim Amla  c Gayle b Cottrell   6   7  13  1  0  85.71
  2    Aiden Markram   c Hope b Cottrell   5  10  16  1  0  50.00
  3   Faf du Plessis             not out   0   7   9  0  0   0.00],
 'bowler': [            BOWLING    O  M   R  W  ECON  0s 4s 6s WD NB
  0  Sheldon Cottrell    4  1  18  2  4.50  17  2  0  0  0
  1       Kemar Roach    3  0  10  0  3.33  13  1  0  1  0
  2     Oshane Thomas  0.3  0   1  0  2.00   2  0  0  0  0]}