## Importing Libraries

In [94]:
import requests
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import time
import concurrent.futures
import modules.psql as psql

## Postgres Configuration

In [95]:
# Execute the Postgres configuration notebook inside this kernel.
# NOTE(review): `engine`, used by later cells, is not defined anywhere in this
# notebook -- presumably config_psql.ipynb creates it; confirm.
%run config_psql.ipynb

## Setting Configurations

In [96]:
# Lift pandas' display limits so DataFrames are shown with every column and
# every row instead of being truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Getting match list and urls from database

In [97]:
# SQL batch that selects the matches still lacking captain / wicket-keeper data.
#
# 1. (Re)creates a temporary table named `cte` and fills it with the match_ids
#    from dwh.match for which no dwh.match_player row has is_captain or
#    is_wicketkeeper set.
# 2. Returns at most 1000 (match_id, url) rows, keeping one dwh.espn_url row
#    with url_type = 'match' per match_id (rk <= 1).
#
# NOTE(review): ROW_NUMBER() has no ORDER BY, so when a match has several
# 'match' urls the one that is kept is not defined; likewise LIMIT 1000 has no
# ORDER BY, so the selected subset may differ between runs.
# NOTE(review): the WHERE filter on eu.url_type discards rows without a
# matching url, so the LEFT JOIN behaves like an inner join here.
query = """
DROP TABLE IF EXISTS cte;

CREATE TEMPORARY TABLE cte (
	match_id TEXT
);

WITH sub1 AS
(
	SELECT DISTINCT 
		match_id 
	FROM dwh.match_player AS mp
	WHERE NOT EXISTS (
		SELECT 1 
		FROM dwh.match_player 
		WHERE match_id = mp.match_id 
		AND (is_captain OR is_wicketkeeper))
)
,sub2 AS (
	SELECT 
		m.match_id
	FROM dwh.match m
	JOIN sub1 ON m.match_id = sub1.match_id
)
INSERT INTO cte
SELECT * FROM sub2;

SELECT sub3.match_id, sub3.url
FROM
(
    SELECT 
        cte.match_id, eu.url, ROW_NUMBER() OVER (PARTITION BY cte.match_id) AS rk
    FROM cte
    LEFT JOIN dwh.espn_url eu ON cte.match_id = eu.match_id
    WHERE eu.url_type = 'match'
) sub3
WHERE sub3.rk <= 1
LIMIT 1000;
"""

In [98]:
# Load the (match_id, url) rows into `df`.
# The query is executed on the connection opened here; previously `engine` was
# passed instead, which left `conn` unused while pandas checked out a second
# connection of its own.
with engine.connect() as conn:
    df = pd.read_sql_query(query, con = conn)

## Initializing Parameters

In [99]:
def _playing_xi_url(url):
    """Return *url* with its final path segment swapped for "match-playing-xi".

    Only the last "/"-separated segment is replaced. Using ``str.replace`` on
    the segment text would also rewrite any earlier occurrence of that text in
    the url, and would insert the replacement between every character when the
    url ends with "/" (empty last segment).
    """
    return "/".join(url.split("/")[:-1] + ["match-playing-xi"])


# Playing-XI page url for every match returned by the query.
url_list = [_playing_xi_url(url) for url in df['url']]
# Result frame that receives one row per team of each scraped match.
df_result = pd.DataFrame(columns=['match_id','team','wicket_keeper','captain'])

In [102]:
def find_wk_c(url):
    """Scrape a playing-XI page for each team's captain and wicket-keeper.

    Parameters
    ----------
    url : str
        Address of the playing-XI page. The match id is taken from the text
        after the last "-" of the second-to-last path segment.

    Returns
    -------
    dict or None
        ``None`` when the page answers 404 or does not contain a table with a
        header of at least three cells and a body. Otherwise a dict with the
        keys ``'match_id'``, ``'1_name'`` and ``'2_name'`` (texts of the second
        and third header cells) plus, when found, ``'<n>_wicket_keeper'`` and
        ``'<n>_captain'``, where ``<n>`` is the 1-based position of the
        player's link within its table row and the value is the text after
        the last "-" of the final path segment of the link's ``href``.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (e.g. on timeout).
    """
    response = requests.get(url, timeout = 5)
    if response.status_code == 404:
        print("Url not found")
        return None

    soup = BeautifulSoup(response.content, 'lxml')
    table = soup.find('table')
    if table is None:
        return None

    # Treat a table without the expected header/body like a missing table
    # instead of failing with AttributeError / IndexError.
    thead = table.find('thead')
    tbody = table.find('tbody')
    headers = thead.find_all('th') if thead is not None else []
    if tbody is None or len(headers) < 3:
        return None

    team = {
        'match_id' : url.split('/')[-2].split('-')[-1],
        '1_name' : headers[1].text,
        '2_name' : headers[2].text
    }
    for row in tbody.find_all('tr'):
        # NOTE(review): a single-cell row presumably marks the end of the
        # playing-XI section -- confirm against the page layout.
        if len(row.find_all('td')) == 1:
            break
        for num, anchor in enumerate(row.find_all('a'), start=1):
            # Links without a title attribute carry no role marker; skip the
            # checks for them rather than raising KeyError.
            player_title = anchor.get('title', '')
            # chr(8224) is the dagger sign, accepted here as a keeper marker.
            is_wicket_keeper = (
                (chr(8224) in player_title)
                or ('(WK)' in player_title)
                or ('(wk)' in player_title)
            )
            is_captain = (
                ('(C)' in player_title)
                or ('(c)' in player_title)
                or ('(captain)' in player_title)
            )
            if not (is_wicket_keeper or is_captain):
                continue
            player_id = anchor['href'].split('/')[-1].split('-')[-1]
            if is_wicket_keeper:
                team[str(num) + '_wicket_keeper'] = player_id
            if is_captain:
                team[str(num) + '_captain'] = player_id
    return team

In [101]:
# Scrape every playing-XI page concurrently and append two rows (one per team)
# to df_result for each match that could be parsed.
num_threads = 50
num_series = len(df)
count = 0      # matches successfully written to df_result
skipped = 0    # matches that failed to download or had no usable table

with concurrent.futures.ThreadPoolExecutor(max_workers = num_threads) as executor:
    # Map each future to its url so failures can be reported meaningfully.
    futures = {executor.submit(find_wk_c, url): url for url in url_list}

    for future in concurrent.futures.as_completed(futures):
        try:
            result = future.result()
        except Exception as exc:
            # Batch boundary: one failed request/parse must not discard the
            # results of all other matches, so report it and carry on.
            print("{0} failed: {1!r}".format(futures[future], exc))
            skipped += 1
            continue

        # find_wk_c returns None for 404 pages and pages without a table.
        if result is None:
            skipped += 1
            continue

        # One record per team; the keys are prefixed with the team position.
        for side in ('1', '2'):
            df_result.loc[len(df_result)] = {
                'match_id': result.get('match_id'),
                'team': result.get(side + '_name'),
                'wicket_keeper': result.get(side + '_wicket_keeper'),
                'captain': result.get(side + '_captain')
            }
        count += 1
        if (count%50)==0:
            print("{0} matches completed!".format(count))

if skipped:
    print("{0} matches skipped!".format(skipped))

50 matches completed!
100 matches completed!
150 matches completed!
200 matches completed!
250 matches completed!
300 matches completed!
350 matches completed!
400 matches completed!
450 matches completed!
500 matches completed!
550 matches completed!
600 matches completed!
650 matches completed!
700 matches completed!
750 matches completed!
800 matches completed!
850 matches completed!
900 matches completed!


In [104]:
# Empty the stg.match_wk_c staging table, then append the scraped
# captain / wicket-keeper rows held in df_result to it.
with engine.connect() as conn:
    conn.execute("TRUNCATE TABLE stg.match_wk_c")

count_rows = df_result.to_sql(
    name='match_wk_c',
    con=engine,
    schema='stg',
    if_exists='append',
    index=False,
    method='multi',
)

In [105]:
# Call the dwh.LoadWk_C() stored procedure on a connection whose
# isolation_level execution option is set to AUTOCOMMIT.
# NOTE(review): presumably the procedure moves the rows staged in
# stg.match_wk_c into the dwh schema -- confirm against its definition.
with engine.connect() as conn:
    conn.execution_options(isolation_level = "AUTOCOMMIT")
    with conn.begin():
        conn.execute("CALL dwh.LoadWk_C()") 