In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
import os
import pandas as pd
import json
import time
import re
from datetime import datetime

### init browser

In [2]:
# return a filename (with path) such that it is accessible. 
def get_fullname(filename):
    # as long as it is accessible.
    if os.path.isfile(filename):
        return filename

    # search current working directory.
    cur_working_dir = os.getcwd()
    for i in range(50):
        for (path, dir, files) in os.walk(cur_working_dir):
            if filename in files:
                return os.path.join(path, filename)

        # check parent directory
        parent = os.path.dirname(cur_working_dir)
        if cur_working_dir == parent:
            break;
        cur_working_dir = parent

    # did not found, simply return.
    print(f"file {filename} not found !!!")
    return filename


# Initiate headless driver for deployment
browser = Browser("chrome", executable_path=get_fullname("chromedriver.exe"), headless=True)
#browser = Browser('chrome', executable_path=get_fullname("chromedriver.exe"))
time.sleep(0.1)

In [3]:
# base url
hltv_url = 'https://www.hltv.org'

In [4]:
key_perf_table = 'perf_table'
key_AWP_kills = 'AWP_kills'
## top level
key_timestamp = 'ts'
key_map_stats = 'map_stats'
key_teams = 'teams'
key_player_st = 'match_player_st'

## visist '/stats/matches'

In [5]:
# href="/stats/teams/4471/bemyfRAG"
# href="/stats/players/15090/PwnAlone"
def get_team_id(a):
    sl = a['href'].split('/')
    return sl[3]
get_player_id = get_team_id

def get_team_name(a):
    sl = a['href'].split('/')
    return sl[4]
get_player_name = get_team_name

#### only timestamp, map (if single map) and href are used later.

In [6]:
def match_list_get(browser, start, stop, next_url = '/stats/matches'):
    # start is the newer match data, larger value
    if start < stop:
        t = stop
        stop = start
        start = t
        
    # Visit the site to Scrape
    ts = 0
    all_done = False
    match_urls = []

    for cnt in range(10000):
        # Visit the site to Scrape
        browser.visit(hltv_url + next_url)
        browser.is_element_present_by_tag('div.contentCol', wait_time=3)
        content = BeautifulSoup(browser.html, 'html.parser').find('div', attrs={"class": "contentCol"})

        # all matches in this page
        trs = content.tbody.find_all('tr')

        for tr in trs:
            match_info= {} 

            # data-unix
            td = tr.find('td', attrs={'class': 'date-col'})
            t = int(td.div['data-unix'])/1000
            if (t > start):
                continue
            if (t < stop):
                all_done = True
                break

            match_info['href'] = td.a['href']
            match_info['ts'] = t
            # map
            match_info['map'] = tr.find('div', attrs={'class': 'dynamic-map-name-full'}).text
            match_urls.append(match_info)        

        # keep current page
        # if in middle of the page, may overlapping, prevent missing matches
        if all_done:
            break
        # ========================================================================
        # next page
        # ========================================================================
        # next page
        a = content.find('a', attrs={"class": "pagination-next"}, href=True)
        if not a:
            print(f'Done {len(match_urls)} matches scraping.')
            break
        next_url = a['href']

        # only during development
        if cnt == 1000:
            print(f'Truncate the match lists !!!!!')
            print(f'Change next_url from: {next_url}')        
            next_url = '/stats/matches?offset=79700'
            print(f'                  to: {next_url}')  
    
    # return all urls and next_page_url
    return match_urls, next_url

#### player stat.

In [7]:
def percentage_get_(str):
    # 'nn.nn%'
    nums = re.findall('(\d*\.\d*\%)', str)
    if (nums):
        return float(nums[0].strip('%'))/100
    # 'nn.nn'
    nums = re.findall('(\d*\.\d*)', str)
    if (nums):
        return float(nums[0])
    # 'nn'
    nums = re.findall('(\d+)', str)    
    if (nums):
        return float(nums[0])
    return 0

# kratio
def kdratio_get(str):
    return round(percentage_get_(str), 4)

# assists
def assists_get(str):
    nums = re.findall("\d+", str)
    if len(nums) == 0:
        return '0', '0'
    elif len(nums) == 1:
        return nums[0], '0'
    return nums[0], nums[1]
# kills
def kills_get(str):
    return assists_get(str)

# mostly for 'adr'
def num_get(str):
    nums = re.findall("\d+", str)
    if len(nums):
        return nums[0]
    return '0'

In [8]:
# parse player stat.
def player_stat(player_tr):
#    items = ['player', 'kills', 'assists', 'deaths', 'kdratio', 'kddiff', 'adr', 'fkdiff', 'adr']
    player_st = {}
    # player
    st_player = player_tr.find('td', attrs={"class": "st-player"})
    player_id = get_player_id(st_player.a)
    player_st['player_id'] = player_id
    player_st['player'] = get_player_name(st_player.a) # st_player.text
    # parse 'kills'
    str = player_tr.find('td', attrs={"class": "st-kills"}).text
    player_st['kills'], player_st['hs'] = kills_get(str)
    # parse 'assists'
    str = player_tr.find('td', attrs={"class": "st-assists"}).text
    player_st['assists'], player_st['flash_assists'] = assists_get(str)
        
    # parse 'kdratio', remove '%' to float.
    player_st['kdratio'] = kdratio_get(str)
    # 'adr'
    item = 'adr'
    str = player_tr.find('td', attrs={"class": "st-" + item}).text
    player_st[item] = num_get(str)
    # parse all others.
    items = ['deaths', 'kddiff', 'fkdiff', 'rating']
    for item in items:
        player_st[item] = player_tr.find('td', attrs={"class": "st-" + item}).text

    return {player_id : player_st}

def team_stat_scrape(st_table):
    # 
    teamname = st_table.find("th", attrs={"class": "st-teamname"}).text

    # team member st.
    player_stat_table = {}
    for tr in st_table.find("tbody").find_all("tr"):
        player_stat_table.update(player_stat(tr))

    # build team/teammate map
    teammates = {}
    for id_ in player_stat_table:
        # insert 'teamname'
        player_stat_table[id_]['teamname'] = teamname
        player = {id_: player_stat_table[id_]['player']}
        teammates.update(player)
    
    # {teamname: {{id: name}, .., {id: name}}
    team = {teamname : teammates}
    return team, player_stat_table

def stats_table_scrape(stats_table):
    ### team
    teams = []
    player_stat_table = {}
    # print(stats_table)
    for stat_table in stats_table:
        team, player_st = team_stat_scrape(stat_table)
        teams.append(team)
        player_stat_table.update(player_st)

    #display(pd.read_json(json.dumps(teams[0]), orient='index'))
    #display(pd.read_json(json.dumps(teams[1]), orient='index'))
    #display(pd.read_json(json.dumps(player_stat_table), orient='index'))
    return teams, player_stat_table

def map_match_scrape(contentCol):
    mapstat = {}
    # mapstatid
    stats_table = contentCol.find_all('table', attrs={'class': 'stats-table'})
    teams, player_st = stats_table_scrape(stats_table)
    mapstat['team'] = teams
    mapstat['stat'] = player_st
    return mapstat

### entrance of match scrape

In [9]:
  
def match_scrape(brower, match_info):
    start_time = time.time()
    match_result = {}
    # hold all output scrape results.
    match_result.update(match_info)
    # print(match_result)

    # match performace main page
    wait_start = time.time()
    browser.visit(hltv_url + match_info['href'])
    browser.is_element_present_by_css("div.contentCol", wait_time=3)
    time.sleep(0.1)
    wait_time = time.time() - wait_start
    contentCol = BeautifulSoup(browser.html, 'html.parser')       \
                 .find('div', attrs={'class': 'contentCol'})

    # 'match_id' from 'match-page-link'
    match_page_link = contentCol.find('a', attrs={'class': 'match-page-link'})
    match_result['match_id'] = re.findall('/(\d+)/', match_page_link['href'])[0]
        
    # get all map's 'href'
    stats_match_maps = contentCol.find('div', attrs={'class': 'stats-match-maps'})
    as_ = []
    if stats_match_maps:
        as_ = stats_match_maps.find_all('a', href=True)

    perf_mapstat = []   
    if len(as_) == 0:
        print(f'single map')
    
        mapstat = {}
        mapstat['map'] = match_info['map']
        left = contentCol.find('div', attrs={'class': 'team-left'})
        right = contentCol.find('div', attrs={'class': 'team-right'})
        mapstat['total_rounds'] = int(left.div.text) + int(right.div.text)
        if (int(left.div.text) >= int(right.div.text)):
            winner = left
        else:
            winner = right
        mapstat['winner'] = get_team_name(winner.a)
        mapstat['winner_id'] = get_team_id(winner.a)
        mapstat['win_rounds'] = winner.div.text

        # mapstatid
        mapstat['mapstatid'] = re.findall("/mapstatsid/(\d+)/", match_info['href'])[0]
        # scrape all others
        mapstat.update(map_match_scrape(contentCol))
        perf_mapstat.append(mapstat)
    else:
        print(f'multi-map match')
        # get all map's 'href', 'score' and 'map'
        match_map_results = []
        for a in as_:
            map_res = {}
            map_res['href'] = a['href']
            scores = a.find('div', attrs={'class': 'stats-match-map-result-score'}).text
            nums = re.findall('\d+', scores)
            map_res['total_rounds'] = int(nums[0]) + int(nums[1])
            if (int(nums[0]) >= int(nums[1])):
                map_res['win_rounds'] = nums[0]
            else:
                map_res['win_rounds'] = nums[1]
            map_res['map'] = a.find('div', attrs={'class': 'dynamic-map-name-full'}).text

            winner = a.find('div', attrs={'class': 'stats-match-map-winner-logo-con'})
            # timing issue here !!!!
            #print(f'winner {winner}')
            map_res['winner_id'] = re.findall("\d+", winner.img['src'])[0]
            map_res['winner'] = winner.img['title']
            match_map_results.append(map_res)

        # scrape current webpage, save one visit
        for map_res in match_map_results:
            if map_res['href'] != match_info['href']:
                continue
            print(f'current page {map_res["href"]}')    
            mapstat = {}
            mapstat.update(map_res)
            # mapstatid
            mapstat['mapstatid'] = re.findall("/mapstatsid/(\d+)/", map_res['href'])[0]
            # scrape all others
            mapstat.update(map_match_scrape(contentCol))
            perf_mapstat.append(mapstat)

        # scrape all other webpages.
        for map_res in match_map_results:
            if map_res['href'] == match_info['href']:
                continue
            if 'mapstatsid' not in map_res['href']:
                continue

            # match performace main page
            wait_start = time.time()
            browser.visit(hltv_url + map_res['href'])
            browser.is_element_present_by_css("div.contentCol", wait_time=3)
            wait_time += time.time() - wait_start            
            contentCol = BeautifulSoup(browser.html, 'html.parser')       \
                             .find('div', attrs={'class': 'contentCol'})                

            print(f"'visit {map_res['href']}")            
            mapstat = {}
            mapstat.update(map_res)
            # mapstatid
            mapstat['mapstatid'] = re.findall("/mapstatsid/(\d+)/", map_res['href'])[0]
            # scrape all others
            mapstat.update(map_match_scrape(contentCol))
            perf_mapstat.append(mapstat)

    match_result['mapstat'] = perf_mapstat
    total = time.time() - start_time
    print(f'wait={wait_time}, total={total}')
    return match_result

## convert scrape result to pd

In [10]:
def df_padding(df, column_name, pad):
    df[column_name] = [pad] * df.shape[0]
    return df

def match_stat_df_get(match_res):
    # define the order.
    column_names = ['player_id','player','match_id','date',
                'kills', 'hs', 'assists', 'flash_assists', 'kdratio', 'deaths', 'kddiff', 'adr', 'fkdiff']
    player_match_df = pd.DataFrame(columns=column_names)
    
    for mapstat in match_res['mapstat']:
        player_map_df = pd.read_json(json.dumps(mapstat['stat']), orient='index')
        # padding
        player_map_df = df_padding(player_map_df, 'match_id', match_res['match_id'])
        player_map_df = df_padding(player_map_df, 'winner', mapstat['winner'])
        player_map_df = df_padding(player_map_df, 'winner_id', mapstat['winner_id'])
        
        player_map_df = df_padding(player_map_df, 'total_rounds', mapstat['total_rounds'])
        player_map_df = df_padding(player_map_df, 'mapstatid', mapstat['mapstatid'])
        
        player_map_df = df_padding(player_map_df, 'win_rounds', mapstat['win_rounds'])
        player_map_df = df_padding(player_map_df, 'map', mapstat['map'])
        
        t_str = datetime.utcfromtimestamp(match_res[key_timestamp]).strftime('%Y-%m-%d')
        player_map_df = df_padding(player_map_df, 'date', t_str)

        player_match_df = pd.concat([player_match_df, player_map_df])
    return player_match_df

## main routine

In [11]:
output_df_column_names = [
     'player_id',
     'player',
     'match_id',
     'date',
     'kills',
     'hs',
     'assists',
     'flash_assists',
     'kdratio',
     'deaths',
     'kddiff',
     'adr',
     'fkdiff',
     'rating',
     'teamname',
     'winner',
     'winner_id',
     'total_rounds',
     'mapstatid',
     'win_rounds',
     'map']

In [12]:
output_filename = ''
def match_list_scrape(browser, match_urls):
    if len(match_urls) == 0:
        return

    start_time = time.time()
    # 1st url's time pending to the output filename
    ts = match_urls[0]['ts']
    t1 = datetime.utcfromtimestamp(ts).strftime('%Y%m%d')
    # last url's time pending to the output filename
    ts = match_urls[-1]['ts']
    t2 = datetime.utcfromtimestamp(ts).strftime('%Y%m%d')
    output_filename = 'player_mapstat_' + t1 + '_' + t2 + '.csv'

    # create an empty file
    df = pd.DataFrame(columns=output_df_column_names)
    df.to_csv(output_filename, index=False)
    # append the file.
    count = 0
    for match_result in match_urls:
        res = match_scrape(browser, match_result)
        df = match_stat_df_get(res)
        df[output_df_column_names].to_csv(output_filename, mode='a', header=False)
        count += 1
        if ((count % 10) == 0):
            elapsed_time = time.time() - start_time
            print(f'scraped {count} matches in {elapsed_time} seconds.')

    elapsed_time = time.time() - start_time
    print(f'Done. scraped {count} matches in {elapsed_time} seconds.')

# input and run

In [13]:
start_date = '2020/6/10'
stop_date = '2020/6/9'
start_ts = datetime.timestamp(datetime.strptime(start_date, "%Y/%m/%d"))
stop_ts = datetime.timestamp(datetime.strptime(stop_date, "%Y/%m/%d"))

In [14]:
match_urls, next_url = match_list_get(browser, start_ts, stop_ts)
len(match_urls)

48

In [15]:
match_list_scrape(browser, match_urls)

single map
wait=1.0504469871520996, total=1.8529839515686035
single map
wait=0.5881462097167969, total=0.7510697841644287
single map
wait=1.7092702388763428, total=1.8671667575836182
single map
wait=1.3698632717132568, total=1.5737450122833252
single map
wait=0.8023068904876709, total=0.9462218284606934
single map
wait=0.7606003284454346, total=0.9335005283355713
multi-map match
current page /stats/matches/mapstatsid/103867/sgpro-vs-hellraisers
'visit /stats/matches/mapstatsid/103863/hellraisers-vs-sgpro
'visit /stats/matches/mapstatsid/103866/hellraisers-vs-sgpro
wait=1.8891184329986572, total=2.6826579570770264
multi-map match
current page /stats/matches/mapstatsid/103866/hellraisers-vs-sgpro
'visit /stats/matches/mapstatsid/103863/hellraisers-vs-sgpro
'visit /stats/matches/mapstatsid/103867/sgpro-vs-hellraisers
wait=1.8330752849578857, total=2.9764230251312256
multi-map match
current page /stats/matches/mapstatsid/103863/hellraisers-vs-sgpro
'visit /stats/matches/mapstatsid/103866/h

In [18]:
output_filename

''

## debug