In [54]:
import requests
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import time
import concurrent.futures

In [55]:
%run config_psql.ipynb

In [56]:
# Season window to crawl. Raise min_year on later runs to collect only the
# newer (incremental) links; max_year always follows the current calendar year.
min_year = 2024
max_year = datetime.date.today().year

# Empty link catalogue; every later cell appends rows to it.
df_url = pd.DataFrame(
    columns=[
        'site',
        'url_type',
        'season',
        'series',
        'match_id',
        'match_attr',
        'url',
    ]
)

In [57]:
# Add one 'season' row per calendar-year season and, for every year below
# max_year, one more for the cross-year season starting in that year
# (e.g. 2024/25, written URL-encoded as "2024%2F25").
SEASON_INDEX_URL = "https://www.espncricinfo.com/ci/engine/series/index.html?season={};view=season"

for year in range(min_year, max_year + 1):
    season_names = [str(year)]
    if year < max_year:
        # Two-digit suffix of the following year. The modulo keeps it at two
        # digits for any year, which a fixed "year - 1999" offset does not.
        season_names.append("{}%2F{:02d}".format(year, (year + 1) % 100))

    for season_name in season_names:
        # Columns missing from the dict (series, match_id, match_attr) stay empty.
        df_url.loc[len(df_url)] = {
            'site': 'ESPNCricInfo',
            'url_type': 'season',
            'season': season_name,
            'url': SEASON_INDEX_URL.format(season_name),
        }
df_url.head()

Unnamed: 0,site,url_type,season,series,match_id,match_attr,url
0,ESPNCricInfo,season,2024,,,,https://www.espncricinfo.com/ci/engine/series/...


In [58]:
# Visit each season index page and record every series that links to a
# "match-schedule-fixtures" page as a 'series' row.
# Only rows with url_type == 'season' are visited (using the URL already stored
# on them), so re-running this cell after series/match rows were appended does
# not fetch the same season page once per appended row.
season_rows = df_url.loc[df_url['url_type'] == 'season', ['season', 'url']]

# .values is a snapshot, so appending to df_url inside the loop is safe.
for season, season_engine_url in season_rows.values:
    # The timeout stops one stalled connection from hanging the whole notebook.
    response = requests.get(season_engine_url, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    for a in soup.find_all('a', href=True):
        href = a['href']
        if 'https://www.espncricinfo.com/series/' in href and '/match-schedule-fixtures' in href:
            df_url.loc[len(df_url)] = {
                'site': 'ESPNCricInfo',
                'url_type': 'series',
                'season': season,
                'series': a.text,
                'url': href,
            }
df_url.head()

Unnamed: 0,site,url_type,season,series,match_id,match_attr,url
0,ESPNCricInfo,season,2024,,,,https://www.espncricinfo.com/ci/engine/series/...
1,ESPNCricInfo,series,2024,ICC World Test Championship,,,https://www.espncricinfo.com/series/icc-world-...
2,ESPNCricInfo,series,2024,Botham-Richards Trophy,,,https://www.espncricinfo.com/series/botham-ric...
3,ESPNCricInfo,series,2024,Sri Lanka in England Test Series,,,https://www.espncricinfo.com/series/sri-lanka-...
4,ESPNCricInfo,series,2024,Australia in England ODI Series,,,https://www.espncricinfo.com/series/australia-...


## Normal Approach

In [20]:
def insert_match_url(season, series, response):
    """Collect match-page links from a fetched series page.

    Scans every ``<div class="ds-p-0">`` in the response body for anchors whose
    href contains '/live-cricket-score' or '/full-scorecard' and turns each
    distinct link into a row dict shaped for ``df_url``.

    Args:
        season: Season label copied into every returned row.
        series: Series name copied into every returned row.
        response: Object exposing the page HTML as ``response.content``
            (the callers pass the result of ``requests.get``).

    Returns:
        list[dict]: One dict per distinct match URL, in document order, with
        the keys 'site', 'url_type', 'season', 'series', 'match_id' and 'url'.
        Empty when the page has no matching link.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    new_rows = []
    seen_urls = set()
    soup = BeautifulSoup(response.content, 'lxml')
    for div in soup.find_all('div', class_='ds-p-0'):
        for a in div.find_all('a', href=True):
            href = a['href']
            if '/live-cricket-score' not in href and '/full-scorecard' not in href:
                continue
            # Same result as plain concatenation for root-relative hrefs, but an
            # href that is already absolute is left intact instead of being mangled.
            match_url = urljoin('https://www.espncricinfo.com', href)
            # find_all() also returns nested divs, so an anchor that sits inside
            # two matching divs would otherwise be reported twice.
            if match_url in seen_urls:
                continue
            seen_urls.add(match_url)
            new_rows.append({
                'site': 'ESPNCricInfo',
                'url_type': 'match',
                'season': season,
                'series': series,
                # The match id is the trailing number of the second-to-last path segment.
                'match_id': href.split('/')[-2].split('-')[-1],
                'url': match_url,
            })
    return new_rows

In [8]:
# Sequential crawl: for each series, use the first schedule endpoint that is
# not a 404 and append the match links found on it to df_url.
# NOTE: the [:50] slice restricts the run to the first 50 series (testing limit).
series_rows = df_url[df_url['url_type']=='series'][['season', 'series', 'url']].values[:50]

for season, series, url in series_rows:
    st = time.time()
    initial_size = len(df_url)
    # Swap only the final path segment; str.replace() would also rewrite any
    # earlier occurrence of the same text inside the URL.
    base_url = url.rsplit('/', 1)[0]

    # Endpoints in priority order. They are requested lazily, so the fallbacks
    # are fetched only when the preferred page is missing.
    for endpoint in (
        'match-results',            # completed series
        'match-schedule-fixtures',  # ongoing, future series
        'match-schedule-results',   # alternate link
    ):
        response = requests.get(base_url + '/' + endpoint)
        if response.status_code != 404:
            for row in insert_match_url(season, series, response):
                df_url.loc[len(df_url)] = row
            break
    else:
        # Every endpoint returned 404: stop the crawl so the series can be inspected.
        print("No suitable match endpoint found, Intervention required!!!")
        print(season, series, url, sep = '\n', end = '\n\n')
        break

    et = time.time()
    print("Series completed, {0} matches found. time elapsed : {1} seconds".format((len(df_url) - initial_size), (et-st)))

Series completed, 70 matches found. time elapsed : 3.5479557514190674 seconds
Series completed, 2 matches found. time elapsed : 3.0928022861480713 seconds
Series completed, 1 matches found. time elapsed : 3.172288417816162 seconds
Series completed, 1 matches found. time elapsed : 3.2746667861938477 seconds
Series completed, 35 matches found. time elapsed : 3.3939898014068604 seconds
Series completed, 5 matches found. time elapsed : 3.9827942848205566 seconds
Series completed, 2 matches found. time elapsed : 3.032554864883423 seconds
Series completed, 2 matches found. time elapsed : 3.1147968769073486 seconds
Series completed, 155 matches found. time elapsed : 4.474671125411987 seconds
Series completed, 29 matches found. time elapsed : 3.0703208446502686 seconds
Series completed, 5 matches found. time elapsed : 3.341822862625122 seconds
Series completed, 3 matches found. time elapsed : 3.298501491546631 seconds
Series completed, 3 matches found. time elapsed : 2.930828332901001 seconds


## Multi-Threading

In [52]:
import concurrent.futures
import requests
import pandas as pd
import time

# insert_match_url is defined again here so this cell can run without the
# "Normal Approach" cells; this later definition is the one in effect afterwards.
def insert_match_url(season, series, response):
    """Collect match-page links from a fetched series page.

    Scans every ``<div class="ds-p-0">`` in the response body for anchors whose
    href contains '/live-cricket-score' or '/full-scorecard' and turns each
    distinct link into a row dict shaped for ``df_url``.

    Args:
        season: Season label copied into every returned row.
        series: Series name copied into every returned row.
        response: Object exposing the page HTML as ``response.content``
            (the callers pass the result of ``requests.get``).

    Returns:
        list[dict]: One dict per distinct match URL, in document order, with
        the keys 'site', 'url_type', 'season', 'series', 'match_id' and 'url'.
        Empty when the page has no matching link.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    new_rows = []
    seen_urls = set()
    soup = BeautifulSoup(response.content, 'lxml')
    for div in soup.find_all('div', class_='ds-p-0'):
        for a in div.find_all('a', href=True):
            href = a['href']
            if '/live-cricket-score' not in href and '/full-scorecard' not in href:
                continue
            # Same result as plain concatenation for root-relative hrefs, but an
            # href that is already absolute is left intact instead of being mangled.
            match_url = urljoin('https://www.espncricinfo.com', href)
            # find_all() also returns nested divs, so an anchor that sits inside
            # two matching divs would otherwise be reported twice.
            if match_url in seen_urls:
                continue
            seen_urls.add(match_url)
            new_rows.append({
                'site': 'ESPNCricInfo',
                'url_type': 'match',
                'season': season,
                'series': series,
                # The match id is the trailing number of the second-to-last path segment.
                'match_id': href.split('/')[-2].split('-')[-1],
                'url': match_url,
            })
    return new_rows

# df_url (built by the cells above) must already exist when this cell runs:
# the series to process are read from it and the match rows are appended to it.

def process_url(season, series, url):
    """Fetch the match links of one series (submitted to the thread pool below).

    Tries the series' schedule endpoints in priority order and parses the first
    one that does not answer 404 with ``insert_match_url``. Fallback endpoints
    are requested only when the preferred one is missing.

    Args:
        season: Season label passed through to the returned rows.
        series: Series name passed through to the returned rows.
        url: Series URL; its last path segment is swapped for each endpoint.

    Returns:
        list[dict]: The rows produced by ``insert_match_url``. An empty list
        when all three endpoints return 404; the previous code left
        ``new_rows`` unassigned on that path and raised UnboundLocalError.
    """
    # Swap only the final path segment; str.replace() would also rewrite any
    # earlier occurrence of the same text inside the URL.
    base_url = url.rsplit('/', 1)[0]

    for endpoint in (
        'match-results',            # completed series
        'match-schedule-fixtures',  # ongoing, future series
        'match-schedule-results',   # alternate link
    ):
        response = requests.get(base_url + '/' + endpoint)
        if response.status_code != 404:
            return insert_match_url(season, series, response)

    print("No suitable match endpoint found, Intervention required!!!")
    print(season, series, url, sep='\n', end='\n\n')
    return []
    
# Size of the worker pool
num_threads = 25

# Series to process; the [:50] slice keeps only the first 50 of them.
series_jobs = df_url[df_url['url_type'] == 'series'][['season', 'series', 'url']].values[:50]

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Queue one process_url call per (season, series, url) triple.
    futures = [executor.submit(process_url, *job) for job in series_jobs]

    # Rows are appended here, in the submitting thread, as each task finishes;
    # result() re-raises any exception raised inside the worker.
    for finished in concurrent.futures.as_completed(futures):
        for row in finished.result():
            df_url.loc[len(df_url)] = row

In [53]:
df_url.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1552 entries, 0 to 1551
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   site        1552 non-null   object 
 1   url_type    1552 non-null   object 
 2   season      1552 non-null   object 
 3   series      1549 non-null   object 
 4   match_id    761 non-null    object 
 5   match_attr  0 non-null      float64
 6   url         1552 non-null   object 
dtypes: float64(1), object(6)
memory usage: 97.0+ KB


In [46]:
df_url[df_url['url_type'] == 'match']

Unnamed: 0,site,url_type,season,series,match_id,match_attr,url


In [42]:
futures[0].result()

[{'site': 'ESPNCricInfo',
  'url_type': 'match',
  'season': '2023',
  'series': 'ICC World Test Championship',
  'match_id': '1239543',
  'url': 'https://www.espncricinfo.com/series/india-tour-of-england-2021-2022-1239527/england-vs-india-1st-test-1239543/full-scorecard'},
 {'site': 'ESPNCricInfo',
  'url_type': 'match',
  'season': '2023',
  'series': 'ICC World Test Championship',
  'match_id': '1239544',
  'url': 'https://www.espncricinfo.com/series/india-tour-of-england-2021-2022-1239527/england-vs-india-2nd-test-1239544/full-scorecard'},
 {'site': 'ESPNCricInfo',
  'url_type': 'match',
  'season': '2023',
  'series': 'ICC World Test Championship',
  'match_id': '1263169',
  'url': 'https://www.espncricinfo.com/series/pakistan-tour-of-west-indies-2021-1263146/west-indies-vs-pakistan-1st-test-1263169/full-scorecard'},
 {'site': 'ESPNCricInfo',
  'url_type': 'match',
  'season': '2023',
  'series': 'ICC World Test Championship',
  'match_id': '1263170',
  'url': 'https://www.espncri

In [12]:
# NOTE(review): df_url_single is not assigned in any cell visible in this
# notebook, so this fails on a fresh kernel. Presumably it is a saved copy of
# df_url from the sequential run — confirm and define it explicitly.
df_url_single.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1552 entries, 0 to 1551
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   site        1552 non-null   object 
 1   url_type    1552 non-null   object 
 2   season      1552 non-null   object 
 3   series      1549 non-null   object 
 4   match_id    761 non-null    object 
 5   match_attr  0 non-null      float64
 6   url         1552 non-null   object 
dtypes: float64(1), object(6)
memory usage: 97.0+ KB


In [13]:
df_url.info()

<class 'pandas.core.frame.DataFrame'>
Index: 981 entries, 0 to 980
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   site        981 non-null    object 
 1   url_type    981 non-null    object 
 2   season      981 non-null    object 
 3   series      978 non-null    object 
 4   match_id    190 non-null    object 
 5   match_attr  0 non-null      float64
 6   url         981 non-null    object 
dtypes: float64(1), object(6)
memory usage: 93.6+ KB


In [14]:
# Writes df_url_single to an Excel workbook at a relative path.
# NOTE(review): df_url_single is not assigned in any cell visible in this
# notebook — define it before this export so Restart & Run All works.
df_url_single.to_excel('df_url_single_processing.xlsx')

In [None]:
# Select the (season, series, url) columns of every 'series' row in df_url ...
series_list = df_url[df_url['url_type']=='series'][['season', 'series', 'url']]
# ... then immediately replace that selection with a single hard-coded URL.
# The assignment above therefore has no effect while this override is present.
series_list = ['https://www.espncricinfo.com/series/icc-cricket-world-cup-2023-24-1367856/match-schedule-fixtures'] #for testing

In [None]:
# NOTE(review): unfinished draft — this cell does not parse as written:
#   * the 'season' entry of the dict literal has no value,
#   * the list comprehension ends in a dangling `and`,
#   * the `elif:` has no condition and no body.
# It also reads `series_set`, which no visible cell defines.
# insert_match_url applies the same 'ds-p-0' / href filters as this draft.
#match_set = {}
for batch in series_list:
    #for series in series_set[batch]:
    for series in series_set[batch][2:3]:
        link1 = series[0].replace(series[0].split('/')[-1], 'match-results') # completed series
        link2 = series[0].replace(series[0].split('/')[-1], 'match-schedule-fixtures') # ongoing, future series
        link3 = series[0].replace(series[0].split('/')[-1], 'match-schedule-results') # alternate link
        response1 = requests.get(link1)
        response2 = requests.get(link2)
        if response1.status_code != 404:
            series_completed = True
            soup = BeautifulSoup(response1.content, 'lxml')
            divs = soup.find_all('div', class_='ds-p-0') #if this doesnt work use class "ds-relative"
            for div in divs:
                for a in div.find_all('a', href = True):
                    if (('/live-cricket-score' in a['href']) or ('/full-scorecard' in a['href'])):
                        match_set = {
                            'site': 'ESPNCricInfo',
                            'url_type': 'match',
                            'season': 
                        }
                match_set[series[0] + '_matches'] = [a['href'] for a in div.find_all('a', href = True) if ('/full-scorecard' in a['href'] and ]
        elif:
        #     response = requests.get(link2)
        #     series_completed = False
        #     soup = BeautifulSoup(response.content, 'lxml')
            
        #working_url = link1 if series_completed else link2        

In [None]:
series_set['2024_series_set']

In [None]:
[a.text for a in soup.find_all('a', href = True) if '/full-scorecard' in a['href']]

In [None]:
soup.find_all('a', href = True)

In [None]:
#mydivs = soup.find_all("div", {"class": "ds-relative"})
#mydivs = soup.find_all("div", {"class": "ds-p-0"})
specific_div = soup.find_all('div', class_='ds-p-0')

In [None]:
for div in specific_div:
    #print([a['href'] for a in div.find_all('a', href = True) if (('/live-cricket-score' in a['href']) or ('/full-scorecard' in a['href']))])
    for a in div.find_all('a', href = True):
        if (('/live-cricket-score' in a['href']) or ('/full-scorecard' in a['href'])):
            print(a['href'])

In [None]:
link1.split("/")[-2].split("-")[-1]

In [None]:
for div in specific_div:
    print(div)

In [None]:
link1

In [None]:
"https://www.espncricinfo.com/series/sri-lanka-in-england-2024-1385672/england-vs-sri-lanka-1st-test-1385694/live-cricket-score"

In [None]:
soup

In [None]:
list(df_url[df_url['url_type'] == 'series'][['url','series']].iloc[2])

In [None]:
# Reliving the horrifying world cup memories once again

#series_list = list(df_url[df_url['url_type'] == 'series']['url'])
series_list = ['https://www.espncricinfo.com/series/icc-cricket-world-cup-2023-24-1367856/match-schedule-fixtures']

In [None]:
# If the series is completed the 'match-results' endpoint exists and its
# full-scorecard links are collected; otherwise this cell falls back to
# fetching 'match-schedule-results' (no links are collected on that path).
match_set = {}
for series_url in series_list:
    # Build the endpoints from the series being iterated. The earlier code
    # always used series_list[0], so every iteration re-processed the first series.
    base_url = series_url.rsplit('/', 1)[0]
    link1 = base_url + '/match-results'
    link2 = base_url + '/match-schedule-results'
    response = requests.get(link1)
    if response.status_code != 404:
        series_completed = True
        soup = BeautifulSoup(response.content, 'lxml')
        # Path of the series without scheme/host and without the endpoint segment;
        # only links under that path belong to this series.
        series_path = '/'.join(link1.split('/')[3:-1])
        # Keyed by the series URL: the former key, str(), is the empty string,
        # so each series would have overwritten the previous one.
        match_set[series_url] = [
            a['href']
            for a in soup.find_all('a', href=True)
            if series_path in a['href'] and '/full-scorecard' in a['href']
        ]
    else:
        response = requests.get(link2)
        series_completed = False
        soup = BeautifulSoup(response.content, 'lxml')

    working_url = link1 if series_completed else link2

In [None]:
soup.find_all('a', href = True)

In [None]:
desired_url = '/'.join(series_list[0].split('/')[3:-1])
desired_url

In [None]:
series_list[0].split('/')

In [None]:
x = [a['href'] for a in soup.find_all('a', href = True) if '/'.join(link1.split('/')[3:-1]) in a['href'] and '/full-scorecard' in a['href']]
x

In [None]:
# Walk every series of every season held in series_set.
# NOTE(review): series_set is not defined in any cell visible in this notebook.
for season in series_set.keys():
    for series in series_set[season]:
        #print(series)
        pass  # a comment alone is not a loop body; without a statement the cell fails to parse

In [None]:
for series in series_set['2024_series_set']:
    print(series[0])