## Importing Libraries

In [38]:
import requests
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import time
import concurrent.futures
import modules.psql as psql

## Postgres Configuration

In [39]:
# Run the companion Postgres configuration notebook via the %run magic.
# NOTE(review): `engine`, which the final insert cell passes to
# psql.insert_without_duplicate, is not defined anywhere in this notebook --
# presumably config_psql.ipynb creates it; confirm.
%run config_psql.ipynb

## Setting Configurations

In [40]:
# Lift pandas' display truncation for both axes so wide/long frames render fully.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Initializing Parameters

In [41]:
# First calendar year to crawl.
# TODO: parameterize this so future incremental runs can start from a later year.
min_year = 2024
# Last calendar year to crawl: the current year at execution time.
max_year = datetime.date.today().year

# Shared accumulator: season, series and match rows are all appended to this frame.
df_url = pd.DataFrame(
    columns=[
        'url_type',
        'season',
        'series',
        'match_id',
        'url',
    ]
)

# One hand-picked [season, series, url] entry for a quick manual run of the
# match lookup (see the commented-out loop in the thread-pool cell).
test_series = [
    [
        "2023%2F24",
        "ICC Cricket World Cup",
        "https://www.espncricinfo.com/series/icc-cricket-world-cup-2023-24-1367856/match-schedule-fixtures",
    ],
]

## Finding all Seasons

In [42]:
# Season index page; the placeholder takes either a calendar year ("2024") or a
# URL-encoded cross-year season ("2024%2F25", where "%2F" is an encoded "/").
season_url_template = "https://www.espncricinfo.com/ci/engine/series/index.html?season={};view=season"

# Seasons already recorded in df_url, so re-running this cell does not append
# the same season rows a second time.
known_seasons = set(df_url.loc[df_url['url_type'] == 'season', 'season'])

for year in range(min_year, max_year + 1):
    season_names = [str(year)]
    if year < max_year:
        # Cross-year season starting in `year`. Only years before max_year get
        # one, so the current year's cross-year season is not added.
        # (year + 1) % 100 yields the two-digit following year for any century;
        # the previous `year - 1999` was only right for 1999-2098.
        season_names.append("{}%2F{:02d}".format(year, (year + 1) % 100))

    for season_name in season_names:
        if season_name in known_seasons:
            continue
        known_seasons.add(season_name)
        # 'series' and 'match_id' are omitted on purpose; they stay NaN for season rows.
        df_url.loc[len(df_url)] = {
            'url_type': 'season',
            'season': season_name,
            'url': season_url_template.format(season_name),
        }
df_url.head()

Unnamed: 0,url_type,season,series,match_id,url
0,season,2024,,,https://www.espncricinfo.com/ci/engine/series/...


## Finding all series in a season

In [43]:
# Crawl only the 'season' rows. Iterating every row's season (as before) would
# re-request a season page once per series/match row when this cell is re-run.
# The boolean-mask selection is a copy, so appending to df_url below does not
# affect the iteration.
season_pages = df_url.loc[df_url['url_type'] == 'season', ['season', 'url']]

for season, season_engine_url in season_pages.itertuples(index=False, name=None):
    # Without a timeout a stalled connection would hang the cell indefinitely.
    response = requests.get(season_engine_url, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    for a in soup.find_all('a', href=True):
        href = a['href']
        # A series is identified by its absolute fixtures link on the season page.
        if 'https://www.espncricinfo.com/series/' in href and '/match-schedule-fixtures' in href:
            # 'match_id' is omitted on purpose; it stays NaN for series rows.
            df_url.loc[len(df_url)] = {
                'url_type': 'series',
                'season': season,
                'series': a.text,
                'url': href,
            }
df_url.head()

Unnamed: 0,url_type,season,series,match_id,url
0,season,2024,,,https://www.espncricinfo.com/ci/engine/series/...
1,series,2024,ICC World Test Championship,,https://www.espncricinfo.com/series/icc-world-...
2,series,2024,Botham-Richards Trophy,,https://www.espncricinfo.com/series/botham-ric...
3,series,2024,Sri Lanka in England Test Series,,https://www.espncricinfo.com/series/sri-lanka-...
4,series,2024,Australia in England ODI Series,,https://www.espncricinfo.com/series/australia-...


## Finding all matches in a series using Multi-Threading

### Function to extract all match URLs from a series page

In [44]:
def insert_match_url(season, series, response):
    """Extract the match links from a fetched series page.

    Parameters
    ----------
    season, series
        Copied unchanged into every returned row.
    response
        Object whose ``.content`` holds the page HTML (e.g. a requests response).

    Returns
    -------
    list of dict
        One ``url_type == 'match'`` row per distinct match URL, in page order,
        with keys ``url_type``, ``season``, ``series``, ``match_id`` and ``url``.
        Empty when the page has no matching links.
    """
    site_root = 'https://www.espncricinfo.com'
    match_markers = ('/live-cricket-score', '/full-scorecard')

    soup = BeautifulSoup(response.content, 'lxml')
    new_rows = []
    seen_urls = set()
    # if this doesn't work use class_ = ds-relative
    for div in soup.find_all('div', class_='ds-p-0'):
        for a in div.find_all('a', href=True):
            href = a['href']
            if not any(marker in href for marker in match_markers):
                continue
            # Relative links get the site root; an already-absolute link is kept as is.
            match_url = href if href.startswith('http') else site_root + href
            # Both find_all calls search recursively, so nested 'ds-p-0' divs
            # yield the same anchor more than once; keep the first occurrence.
            if match_url in seen_urls:
                continue
            seen_urls.add(match_url)
            new_rows.append({
                'url_type': 'match',
                'season': season,
                'series': series,
                # The match id is the trailing number of the second-to-last path segment.
                'match_id': href.split('/')[-2].split('-')[-1],
                'url': match_url,
            })
    return new_rows

### Function to resolve a series URL into its match URLs

In [45]:
def process_url(season, series, url, timeout=30):
    """Fetch a series' match listing and return its match rows.

    The last path segment of ``url`` is swapped for each candidate listing
    endpoint in turn; the first one that does not answer 404 is parsed with
    ``insert_match_url``.

    Parameters
    ----------
    season, series
        Passed through to ``insert_match_url``.
    url
        Series URL whose final path segment names a listing page.
    timeout
        Seconds to wait on each HTTP request before ``requests`` raises.

    Returns
    -------
    list of dict
        Rows from ``insert_match_url``, or an empty list when every candidate
        endpoint answers 404 (a message is printed in that case).
    """
    # Drop only the final path segment. str.replace on that segment would
    # rewrite every occurrence of it in the URL and breaks on a trailing slash.
    base_url = url.rstrip('/').rsplit('/', 1)[0]
    endpoints = (
        'match-results',            # completed series
        'match-schedule-fixtures',  # ongoing, future series
        'match-schedule-results',   # alternate link
    )
    for endpoint in endpoints:
        # Later endpoints are requested only if the earlier ones were 404.
        response = requests.get('{}/{}'.format(base_url, endpoint), timeout=timeout)
        if response.status_code != 404:
            return insert_match_url(season, series, response)

    print("No suitable match endpoint found, Intervention required!!!")
    print(season, series, url, sep='\n', end='\n\n')
    # Previously this path fell through to an unbound `new_rows` and raised
    # UnboundLocalError; callers iterate the result, so return an empty list.
    return []

### Running the series lookups in a thread pool

In [50]:
num_threads = 100

# One (season, series, url) job per series row, selected once and reused below.
series_jobs = df_url.loc[df_url['url_type'] == 'series', ['season', 'series', 'url']].values
num_series = len(series_jobs)

print("wait for approx {} minutes".format((0.8*num_series//60)+1))

match_rows = []
failed_jobs = []  # (season, series, url) of every job whose worker raised
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # For a quick manual run, iterate over test_series instead of series_jobs.
    futures = {
        executor.submit(process_url, season, series, url): (season, series, url)
        for season, series, url in series_jobs
    }

    for future in concurrent.futures.as_completed(futures):
        try:
            match_rows.extend(future.result())
        except Exception as exc:
            # Report and remember the failing series instead of aborting the
            # loop, which would discard the results of every other series.
            failed_jobs.append(futures[future])
            print("Failed to process series:", *futures[future], repr(exc), sep='\n', end='\n\n')

# Append all match rows in one concat; growing the frame one .loc row at a
# time copies it on every insert.
if match_rows:
    df_url = pd.concat(
        [df_url, pd.DataFrame(match_rows, columns=df_url.columns)],
        ignore_index=True,
    )
            

wait for approx 1.0 minutes


In [23]:
# Write every collected URL row to dwh.espn_url, using all of the frame's
# columns as the conflict columns.
# NOTE(review): `engine` is not defined in this notebook -- presumably it comes
# from the config_psql.ipynb run above; confirm.
query = psql.insert_without_duplicate(
    engine,
    dataFrame=df_url,
    table="espn_url",
    schema="dwh",
    conflict_col=[column for column in df_url.columns],
)