## Importing Libraries

In [107]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import concurrent.futures

## Postgres Configuration

In [108]:
%run config_psql.ipynb

## Settings Configuration

In [109]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Getting match list and urls from database

In [123]:
query1 = """
WITH sub1 AS (	
-- list of every single match played by every single player whose attributes are not available
SELECT DISTINCT
	p.id,
	p.identifier,
	p.key_cricinfo,
	p.unique_name,
	mp.match_id,
	url.url
FROM dwh.people p
LEFT JOIN dwh.player_info pi ON p.identifier = pi.identifier
JOIN dwh.match_player mp ON p.id = mp.player_id_num
JOIN dwh.espn_url url ON mp.match_id = url.match_id
WHERE pi.identifier IS NULL AND url.url_type = 'match'
)
, sub2 AS (
SELECT 
	sub1.match_id,
	COUNT(1) AS player_count
FROM sub1
GROUP BY sub1.match_id
ORDER BY COUNT(1) DESC
)
, sub3 AS (
SELECT DISTINCT
	sub1.match_id,
	sub1.url
FROM sub1
JOIN sub2 ON sub1.match_id = sub2.match_id
)

SELECT * FROM sub3;
"""

# query2 = """
# SELECT DISTINCT
# 	p.key_cricinfo
# FROM dwh.people p
# LEFT JOIN dwh.player_info pi ON p.identifier = pi.identifier
# JOIN dwh.match_player mp ON p.id = mp.player_id_num
# """

query2 = """
SELECT DISTINCT
	p.key_cricinfo
FROM dwh.player_info p
"""

In [124]:
with engine.connect() as conn:
    df1 = pd.read_sql_query(query1, con = engine)
    df2 = pd.read_sql_query(query2, con = engine)

In [125]:
url_list = list(map(lambda url: url.replace(url.split("/")[-1], "match-playing-xi"), df1['url']))
col_list = ['key_cricinfo', 'name', 'Full Name', 'Born', 'Batting Style', 'Bowling Style', 'Fielding Position', 'Playing Role', 'Other', 'url']
df_info = pd.DataFrame(columns=col_list)

In [126]:
def find_playing_xi(url):
    player_url = []
    response = requests.get(url)
    if response.status_code == 404:
        print("Url not found")
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    for row in soup.find('table').find('tbody').find_all('tr'):
        cols = len(row.find_all('td'))
        a_cols = len(row.find_all('a'))+1
        if cols != 1:
            for num in range(1,a_cols):
                # player_title = row.find_all('a')[num-1]['title']
                player_url.append(row.find_all('a')[num-1]['href'])
    
    return player_url 

In [127]:
num_threads = 50
all_urls = []

with concurrent.futures.ThreadPoolExecutor(max_workers = num_threads) as executor:
    futures = []
    for url in url_list:
        futures.append(executor.submit(find_playing_xi, url))

    for future in (concurrent.futures.as_completed(futures)):
        result = future.result()
        for url in result:
            all_urls.append(url)
all_urls = list(set(all_urls))

In [128]:
x = list(map(lambda url: url.split("/")[-1].split('-')[-1], all_urls))
y = list(set(df2['key_cricinfo'].values))

all_urls2 = []
for url in all_urls:
    if url.split("/")[-1].split('-')[-1] not in list(set(x) & set(y)):
        all_urls2.append(url)
len(all_urls2)

35

In [129]:
def find_player_info(url):
    info_dict = {}
    response = requests.get('https://www.espncricinfo.com/' + url)
    if response.status_code == 404:
        print("Url not found")
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    info_dict['key_cricinfo']= url.split('/')[-1].split('-')[-1]
    info_dict['name'] = ' '.join(url.split('/')[-1].split('-')[:-1])
    info_dict['url'] = url

    if len(soup.find_all("div", {"class": "ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8"})) == 0:
        print(url)
    
    for div in soup.find_all("div", {"class": "ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8"})[0].find_all('div'):
        key_elem = div.find('p', class_='ds-text-tight-m')
        value_elem = div.find('span', class_='ds-text-title-s')
        if key_elem and value_elem:
            key = key_elem.text.strip()
            value = value_elem.text.strip()
            info_dict[key] = value
            info_dict = {col: info_dict[col] if col in info_dict else np.nan for col in col_list}
        # df_result.loc[len(df_result)] = info_dict
    
    return info_dict 

In [130]:
num_threads = 30

with concurrent.futures.ThreadPoolExecutor(max_workers = num_threads) as executor:
    futures = []
    for url in all_urls2:
        futures.append(executor.submit(find_player_info, url))

    for future in (concurrent.futures.as_completed(futures)):
        result = future.result()
        df_info.loc[len(df_info)] = result

In [131]:
df_info[['dob', 'birth_place']] = df_info['Born'].str.extract(r'([A-Za-z]+ \d{2}, \d{4}),?(.*)')
df_info['dob'] = pd.to_datetime(df_info['dob']).dt.strftime('%Y-%m-%d')
df_info['dob'] = pd.to_datetime(df_info['dob'])
df_info['birth_place'] = df_info['birth_place'].replace('', np.nan)
df_info.drop(columns=['Born'], inplace=True)
df_info.dropna(subset=['Batting Style', 'Bowling Style', 'Other'], how='all', inplace=True)
df_info['Bowling Style'] = df_info['Bowling Style'].str.split(',').str[0].str.strip()
df_info['bowling_hand'] = np.where(df_info['Bowling Style'].str.contains('Left', case=False), 'Left', np.where(df_info['Bowling Style'].str.contains('Right', case=False), 'Right', None))

df_info = df_info.rename(columns={'Full Name': 'full_name', 'Batting Style': 'batting_style', 'Bowling Style': 'bowling_style', 'Fielding Position': 'fielding_pos', 'Playing Role': 'playing_role', 'Other': 'other'})

In [135]:
# Load player information into Stage table
with engine.connect() as conn:
    conn.execute("TRUNCATE TABLE stg.player_info")

count_rows = df_info.to_sql('player_info', schema = 'stg', con = engine, if_exists='append', method = 'multi', index = False)

In [136]:
with engine.connect() as conn:
    conn.execution_options(isolation_level = "AUTOCOMMIT")
    with conn.begin():
        conn.execute("CALL dwh.LoadPlayerInfo()") 

In [132]:
df_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 0 to 33
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   key_cricinfo   20 non-null     object        
 1   name           20 non-null     object        
 2   full_name      20 non-null     object        
 3   batting_style  20 non-null     object        
 4   bowling_style  19 non-null     object        
 5   fielding_pos   1 non-null      object        
 6   playing_role   6 non-null      object        
 7   other          0 non-null      float64       
 8   url            20 non-null     object        
 9   dob            20 non-null     datetime64[ns]
 10  birth_place    17 non-null     object        
 11  bowling_hand   20 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(10)
memory usage: 2.0+ KB


In [247]:
df_info

Unnamed: 0,key_cricinfo,name,full_name,batting_style,bowling_style,fielding_pos,playing_role,other,url,dob,birth_place,bowling_hand
0,1420171,fathimath anaal,Fathimath Anaal Mohammed,Right hand Bat,Right arm Medium fast,,,,/cricketers/fathimath-anaal-1420171,2005-08-13,,Right
1,450860,adam milne,Adam Fraser Milne,Right hand Bat,Right arm Fast,,Bowler,,/cricketers/adam-milne-450860,1992-04-13,Palmerston North,Right
2,1112537,jordan cox,Jordan Matthew Cox,Right hand Bat,,Wicketkeeper,Wicketkeeper Batter,,/cricketers/jordan-cox-1112537,2000-10-21,"Margate, Kent",Left
4,1171471,theint soe,Theint Theint Soe,Right hand Bat,Right arm Offbreak,,,,/cricketers/theint-soe-1171471,2000-12-08,,Right
5,1171483,htet aung,Htet Htet Aung,Right hand Bat,Right arm Medium,,,,/cricketers/htet-aung-1171483,1998-10-31,,Right
6,348044,lakshit gupta,Lakshit Gupta,Right hand Bat,Right arm Offbreak,,,,/cricketers/lakshit-gupta-348044,1998-10-06,,Right
7,543550,hannah rowe,Hannah Maree Rowe,Right hand Bat,Right arm Medium,,Bowler,,/cricketers/hannah-rowe-543550,1996-10-03,Palmerston North,Right
8,1287033,gurnoor brar,Gurnoor Brar,Left hand Bat,Right arm Fast,,Bowler,,/cricketers/gurnoor-brar-1287033,2000-05-25,"Muktsar, Punjab",Right
9,568276,imam ul haq,Imam-ul-Haq,Left hand Bat,,,Top order Batter,,/cricketers/imam-ul-haq-568276,1995-12-22,"Lahore, Punjab",Left
10,221140,david wiese,David Wiese,Right hand Bat,Right arm Medium fast,,Allrounder,,/cricketers/david-wiese-221140,1985-05-18,Roodepoort,Right


In [13]:
x = []
len(x)

0