## Importing Libraries

In [264]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import concurrent.futures

## Postgres Configuration

In [265]:
# Execute the config_psql.ipynb notebook inside this session.
# NOTE(review): later cells use `engine` without defining it — presumably it
# is created by that notebook; confirm.
%run config_psql.ipynb

## Settings Configuration

In [266]:
# Lift pandas' display limits so frames are shown without truncating
# columns or rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

## Getting the match list and URLs from the database

In [267]:
# Players to process: ids present in both dwh.people and dwh.match_player
# (s1) that have no row in dwh.player_info (LEFT JOIN ... WHERE pi.id IS NULL).
# Each player is returned with its key_cricinfo and one match id / match URL
# taken from dwh.espn_url rows with url_type = 'match' (s2).
# NOTE(review): MAX(mp.match_id) and MAX(eu.url) are aggregated independently,
# so the returned url is not guaranteed to belong to the returned match_id.
query1 = """
SELECT s1.id, p.key_cricinfo, s2.match_id, s2.url
FROM
(
	SELECT distinct id FROM dwh.people --14882
	INTERSECT
	SELECT distinct player_id_num FROM dwh.match_player -- 2286
) s1
INNER JOIN 
(
	SELECT MAX(mp.match_id) AS match_id, MAX(eu.url) AS url, player_id_num 
	FROM dwh.match_player mp 
	INNER JOIN dwh.espn_url eu ON mp.match_id = eu.match_id AND eu.url_type = 'match'
	GROUP BY player_id_num
) s2
ON s1.id = s2.player_id_num
LEFT JOIN dwh.player_info pi ON s1.id = pi.id
INNER JOIN dwh.people p ON s1.id = p.id
WHERE pi.id IS NULL
"""

In [268]:
# Run the query on the connection opened here. Previously the engine itself
# was passed to pandas, so the connection checked out by the ``with`` block
# was never used.
with engine.connect() as conn:
    df1 = pd.read_sql_query(query1, con=conn)

In [269]:
# Point every distinct match URL at its "match-playing-xi" page by swapping
# only the final path segment. rsplit() touches just the tail of the URL,
# whereas str.replace() would rewrite every occurrence of that segment's text
# and would garble the URL when it ends with "/" (empty last segment).
url_list = [
    url.rstrip("/").rsplit("/", 1)[0] + "/match-playing-xi"
    for url in df1['url'].unique()
]

# Columns of the scraped player-info frame; scraped profile labels that are
# not listed here are discarded by find_player_info.
col_list = ['key_cricinfo', 'name', 'Full Name', 'Born', 'Batting Style', 'Bowling Style', 'Fielding Position', 'Playing Role', 'Other', 'url']
df_info = pd.DataFrame(columns=col_list)

In [270]:
def find_playing_xi(url, timeout=30):
    """Collect the anchor links listed in a match "playing XI" page.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled request cannot block a worker thread forever.

    Returns:
        A list with the ``href`` of every anchor found in the rows of the
        first table on the page, ignoring rows that consist of a single
        cell. The list is empty when the page has no such table. ``None`` is
        returned when the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 404:
        print("Url not found")
        return None
    if not response.ok:
        # Any other error page has no playing-XI table worth parsing.
        print("Request failed with status", response.status_code, "for", url)
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    table = soup.find('table')
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        return []

    player_url = []
    for row in tbody.find_all('tr'):
        # Rows with exactly one cell are skipped, as in the original logic.
        if len(row.find_all('td')) == 1:
            continue
        # href=True keeps only anchors that actually carry a link.
        player_url.extend(anchor['href'] for anchor in row.find_all('a', href=True))

    return player_url

In [271]:
num_threads = 50
all_urls = []

# Fetch every playing-XI page concurrently; the work is network-bound.
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(find_playing_xi, url) for url in url_list]

    for future in concurrent.futures.as_completed(futures):
        # find_playing_xi returns None for a page it could not fetch;
        # treat that as "no links" instead of failing while iterating None.
        all_urls.extend(future.result() or [])

# De-duplicate and sort, so the order (and therefore any positional slice
# taken from it later) is the same on every run.
all_urls = sorted(set(all_urls))

In [272]:
# Last dash-separated token of the final path segment of each scraped link;
# it is compared against key_cricinfo below.
x = [url.split("/")[-1].split('-')[-1] for url in all_urls]
y = list(set(df1['key_cricinfo'].values))

# Build the lookup set once; rebuilding the intersection as a list on every
# iteration made this filter needlessly quadratic.
# NOTE(review): x holds strings, so this only matches if key_cricinfo is
# loaded as strings too — confirm the column dtype.
wanted_keys = set(x) & set(y)

# Keep only the links whose key belongs to a player still missing from
# dwh.player_info, preserving the order of all_urls.
all_urls2 = [url for url, key in zip(all_urls, x) if key in wanted_keys]

In [273]:
def find_player_info(url, timeout=30):
    """Scrape the profile fields of one player page.

    Args:
        url: Player link as found on a playing-XI page; it is resolved
            against ``https://www.espncricinfo.com/``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with exactly the keys of ``col_list``. ``key_cricinfo`` and
        ``name`` are derived from the last path segment of ``url``, ``url``
        is stored as given, and the remaining entries come from the label /
        value pairs of the first profile grid on the page (``np.nan`` when a
        label is absent). ``None`` is returned when the server answers with
        an error status.
    """
    from urllib.parse import urljoin

    # urljoin avoids a doubled slash when ``url`` already starts with "/".
    response = requests.get(urljoin('https://www.espncricinfo.com/', url), timeout=timeout)
    if response.status_code == 404:
        print("Url not found")
        return None
    if not response.ok:
        print("Request failed with status", response.status_code, "for", url)
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    # The last path segment has the form "<name-words>-<key>".
    *name_parts, cricinfo_key = url.split('/')[-1].split('-')
    info_dict = {
        'key_cricinfo': cricinfo_key,
        'name': ' '.join(name_parts),
        'url': url,
    }

    grids = soup.find_all("div", {"class": "ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8"})
    if not grids:
        # Report the page without a profile grid and fall through with the
        # identity fields only, instead of failing on grids[0].
        print(url)
    else:
        for div in grids[0].find_all('div'):
            key_elem = div.find('p', class_='ds-text-tight-m')
            value_elem = div.find('span', class_='ds-text-title-s')
            if key_elem is None or value_elem is None:
                continue
            info_dict[key_elem.text.strip()] = value_elem.text.strip()

    # Normalise once, after scraping: keep only the expected columns and fill
    # the missing ones, so every returned record has the same shape.
    return {col: info_dict.get(col, np.nan) for col in col_list}

In [274]:
num_threads = 30

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # NOTE(review): only positions 150-249 of all_urls2 are processed here —
    # presumably a manual batch; confirm before running on the full list.
    futures = [executor.submit(find_player_info, url) for url in all_urls2[150:250]]

    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        # find_player_info returns None for a page it could not fetch;
        # skip it rather than appending a blank row.
        if result is None:
            continue
        df_info.loc[len(df_info)] = result

In [275]:
# Split "Born" ("<Month> <day>, <year>[, <place>]") into a date of birth and
# a birth place. astype(object) keeps the .str accessor usable even when the
# column holds nothing but NaN.
born_parts = df_info['Born'].astype(object).str.extract(r'([A-Za-z]+ \d{1,2}, \d{4}),?(.*)')
df_info['dob'] = pd.to_datetime(born_parts[0])
# The place is captured with the blank that follows the comma; strip it and
# treat an empty remainder as missing.
df_info['birth_place'] = born_parts[1].str.strip().replace('', np.nan)

# Keep only the first comma-separated bowling style and derive the bowling
# hand from it.
bowling_style = df_info['Bowling Style'].astype(object).str.split(',').str[0].str.strip()
df_info['Bowling Style'] = bowling_style
# na=False is essential: without it a missing style yields NaN, which
# np.where treats as true and would label the player 'Left'.
is_left = bowling_style.str.contains('Left', case=False, na=False)
is_right = bowling_style.str.contains('Right', case=False, na=False)
# bowling_hand is always created so the frame has the same columns on every
# run, including when no player has a bowling style.
df_info['bowling_hand'] = np.where(is_left, 'Left', np.where(is_right, 'Right', None))

df_info = df_info.drop(columns=['Born'])
# Drop records for which none of these profile fields was scraped.
df_info = df_info.dropna(subset=['Batting Style', 'Bowling Style', 'Other'], how='all')

df_info = df_info.rename(columns={'Full Name': 'full_name', 'Batting Style': 'batting_style', 'Bowling Style': 'bowling_style', 'Fielding Position': 'fielding_pos', 'Playing Role': 'playing_role', 'Other': 'other'})

In [276]:
# Load player information into the stage table.
# The truncate and the insert share one transaction (committed when the block
# exits, rolled back on error), so a failed load no longer leaves
# stg.player_info empty.
with engine.begin() as conn:
    conn.execute("TRUNCATE TABLE stg.player_info")
    count_rows = df_info.to_sql('player_info', schema='stg', con=conn, if_exists='append', method='multi', index=False)

In [277]:
# Call the dwh.LoadPlayerInfo() stored procedure on a connection whose
# isolation level is set to AUTOCOMMIT.
# NOTE(review): presumably AUTOCOMMIT is used because the procedure manages
# its own transactions, and the procedure moves stg.player_info into the dwh
# schema — confirm both against the procedure definition.
with engine.connect() as conn:
    conn.execution_options(isolation_level = "AUTOCOMMIT")
    with conn.begin():
        conn.execute("CALL dwh.LoadPlayerInfo()") 