In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
import os
import pandas as pd
import json
import time
import re
from datetime import datetime

### init browser

In [2]:
# return a filename (with path) such that it is accessible.
def get_fullname(filename):
    """Locate ``filename`` and return a path under which it can be opened.

    If ``filename`` already names an existing file it is returned unchanged.
    Otherwise the tree below the current working directory is walked, then
    the tree below each successive parent directory (at most 50 levels,
    stopping early at the filesystem root); the first hit is returned as a
    joined path. If nothing is found a message is printed and ``filename``
    is returned unchanged.
    """
    if os.path.isfile(filename):
        return filename

    search_root = os.getcwd()
    levels_left = 50
    while levels_left > 0:
        levels_left -= 1

        # look everywhere below the current search root
        for folder, _subdirs, names in os.walk(search_root):
            if filename in names:
                return os.path.join(folder, filename)

        # climb one level; dirname() of the root is the root itself
        up_one = os.path.dirname(search_root)
        if up_one == search_root:
            break
        search_root = up_one

    # nothing matched: report it and hand the name back untouched
    print(f"file {filename} not found !!!")
    return filename


# Initiate headless driver for deployment
# The driver binary is looked up with get_fullname(); when no file is found that helper
# returns the bare name "chromedriver.exe" unchanged.
browser = Browser("chrome", executable_path=get_fullname("chromedriver.exe"), headless=True)
# Variant of the call above without headless=True:
#browser = Browser('chrome', executable_path=get_fullname("chromedriver.exe"))
# Brief pause after creating the browser.
time.sleep(0.1)

In [3]:
# base url
# Prefix prepended to every relative href before the browser visits it.
hltv_url = 'https://www.hltv.org'
# Module-level placeholder; match_list_scrape() assigns its own local output_filename.
output_filename = ''

In [4]:
# Dictionary key names for scrape results.
# NOTE(review): in the visible notebook only key_timestamp is referenced
# (by match_stat_df_get); the other names look unused here -- confirm before removing.
key_perf_table = 'perf_table'
key_AWP_kills = 'AWP_kills'
## top level
key_timestamp = 'ts'
key_map_stats = 'map_stats'
key_teams = 'teams'
key_player_st = 'match_player_st'

## visit '/stats/matches'

In [5]:
# href="/stats/teams/4471/bemyfRAG"
# href="/stats/players/15090/PwnAlone"
def get_team_id(a):
    """Return the id segment of an anchor's ``href``.

    For links shaped like ``/stats/teams/4471/bemyfRAG`` or
    ``/stats/players/15090/PwnAlone`` this is the fourth ``/``-separated
    piece (``'4471'`` / ``'15090'``), returned as a string.
    """
    return a['href'].split('/')[3]


# player links share the same layout, so the same extractor is reused
get_player_id = get_team_id

def get_team_name(a):
    """Return the name segment of an anchor's ``href``.

    For links shaped like ``/stats/teams/4471/bemyfRAG`` or
    ``/stats/players/15090/PwnAlone`` this is the fifth ``/``-separated
    piece (``'bemyfRAG'`` / ``'PwnAlone'``).
    """
    return a['href'].split('/')[4]


# player links share the same layout, so the same extractor is reused
get_player_name = get_team_name

#### only timestamp, map (if single map) and href are used later.

In [6]:
def match_list_get(browser, start, stop, next_url = '/stats/matches'):
    """Collect the match links whose timestamp ``t`` satisfies ``stop < t <= start``.

    Listing pages are visited one after another, beginning at ``next_url`` and
    following the ``pagination-next`` link. Rows newer than ``start`` are
    skipped; the scan ends at the first row at or before ``stop``, or when a
    page has no next link.

    Args:
        browser: splinter browser used to load the pages.
        start, stop: window bounds as Unix timestamps in seconds; they may be
            passed in either order (the larger one is treated as ``start``).
        next_url: relative URL of the first listing page to visit.

    Returns:
        ``(match_urls, next_url)`` where ``match_urls`` is a list of dicts with
        the keys ``'href'``, ``'ts'`` and ``'map'``, and ``next_url`` is the
        relative URL of the page on which the scan stopped. When the scan
        stopped in the middle of a page, that same page is returned so a
        following window can re-read it without missing matches.
    """
    # start is the newer bound (larger value); accept either order
    if start < stop:
        start, stop = stop, start

    announced_first = False
    all_done = False
    match_urls = []

    for cnt in range(10000):
        # Visit the site to Scrape
        browser.visit(hltv_url + next_url)
        # 'div.contentCol' is a CSS selector rather than a tag name, so wait
        # with the CSS lookup (the same call match_scrape uses).
        browser.is_element_present_by_css('div.contentCol', wait_time=3)
        content = BeautifulSoup(browser.html, 'html.parser').find('div', attrs={"class": "contentCol"})

        # all matches in this page
        for tr in content.tbody.find_all('tr'):
            # data-unix holds milliseconds; convert to seconds
            td = tr.find('td', attrs={'class': 'date-col'})
            t = int(td.div['data-unix'])/1000
            if t > start:
                continue
            if t <= stop:
                all_done = True
                stamp = datetime.utcfromtimestamp(t).strftime('%Y/%m/%d %H:%M:%S')
                print(f'Excluded 1st match timestamp: {stamp}')
                break

            if not announced_first:
                announced_first = True
                stamp = datetime.utcfromtimestamp(t).strftime('%Y/%m/%d %H:%M:%S')
                print(f'Included 1st match timestamp: {stamp}')

            match_urls.append({
                'href': td.a['href'],
                'ts': t,
                'map': tr.find('div', attrs={'class': 'dynamic-map-name-full'}).text,
            })

        # keep current page
        # if in middle of the page, may overlapping, prevent missing matches
        if all_done:
            break

        # next page
        a = content.find('a', attrs={"class": "pagination-next"}, href=True)
        if not a:
            print(f'Done {len(match_urls)} matches scraping.')
            break
        next_url = a['href']

        # only during development
        # NOTE(review): after 1001 pages this jumps to a hard-coded offset and
        # silently skips everything in between -- confirm it is still wanted.
        if cnt == 1000:
            print(f'Truncate the match lists !!!!!')
            print(f'Change next_url from: {next_url}')
            next_url = '/stats/matches?offset=79700'
            print(f'                  to: {next_url}')

    # return all urls and next_page_url
    return match_urls, next_url

#### player stat.

In [7]:
def percentage_get_(str):
    """Extract the first number from ``str`` as a float.

    Tried in this order:
      1. a decimal followed by ``%`` (``'nn.nn%'``) -> value divided by 100
      2. a decimal (``'nn.nn'``)                   -> value as is
      3. an integer (``'nn'``)                     -> value as is

    Returns ``0`` when ``str`` contains no number at all. Every pattern
    requires at least one digit, so a stray ``'.'`` or ``'.%'`` is ignored
    instead of being handed to ``float()``.

    The parameter keeps its original name ``str`` (which shadows the builtin
    inside this function) so the signature is unchanged.
    """
    # 'nn.nn%'
    nums = re.findall(r'(?:\d+\.\d*|\.\d+)%', str)
    if nums:
        return float(nums[0].strip('%'))/100
    # 'nn.nn'
    nums = re.findall(r'\d+\.\d*|\.\d+', str)
    if nums:
        return float(nums[0])
    # 'nn'
    nums = re.findall(r'\d+', str)
    if nums:
        return float(nums[0])
    return 0

# kratio
def kdratio_get(str):
    """Parse ``str`` with percentage_get_() and round the result to 4 decimals."""
    parsed = percentage_get_(str)
    return round(parsed, 4)

# assists
def assists_get(str):
    """Return the first two integers found in ``str`` as a pair of strings.

    A missing number is reported as ``'0'``, so the result is always a
    2-tuple: ``'3 (1)'`` -> ``('3', '1')``, ``'7'`` -> ``('7', '0')``,
    ``''`` -> ``('0', '0')``.
    """
    # raw string: '\d' in a plain literal is an invalid escape sequence
    nums = re.findall(r"\d+", str)
    first = nums[0] if nums else '0'
    second = nums[1] if len(nums) > 1 else '0'
    return first, second
# kills
def kills_get(str):
    """Split a kills cell into two number strings; same parsing as assists_get()."""
    first, second = assists_get(str)
    return first, second

# mostly for 'adr'
def num_get(str):
    """Return the first run of digits in ``str`` as a string, or ``'0'`` if none.

    Only the leading integer part is kept: ``'85.3'`` -> ``'85'``.
    """
    # raw string: '\d' in a plain literal is an invalid escape sequence
    nums = re.findall(r"\d+", str)
    return nums[0] if nums else '0'

In [8]:
# parse player stat.
def player_stat(player_tr):
    """Parse one player's row of a stats table.

    Args:
        player_tr: the ``<tr>`` element (BeautifulSoup tag) of one player.

    Returns:
        ``{player_id: stats}`` where ``stats`` holds, in this order, the keys
        ``player_id``, ``player``, ``kills``, ``hs``, ``assists``,
        ``flash_assists``, ``kdratio``, ``adr``, ``deaths``, ``kddiff``,
        ``fkdiff`` and ``rating``.
    """
    def cell_text(name):
        # text of the <td class="st-<name>"> cell in this row
        return player_tr.find('td', attrs={"class": "st-" + name}).text

    # player id and name both come from the link inside the 'st-player' cell
    link = player_tr.find('td', attrs={"class": "st-player"}).a
    player_id = get_player_id(link)
    player_st = {
        'player_id': player_id,
        'player': get_player_name(link),
    }

    # 'kills' and 'assists' cells each carry two numbers
    player_st['kills'], player_st['hs'] = kills_get(cell_text('kills'))
    player_st['assists'], player_st['flash_assists'] = assists_get(cell_text('assists'))

    # parse 'kdratio', remove '%' to float.
    # Read the value from its own cell; previously the assists text was
    # reused here, so 'kdratio' merely repeated the assists count.
    # NOTE(review): assumes the column is classed 'st-kdratio' like the other
    # 'st-<item>' cells; a row without that cell yields 0.
    kdratio_td = player_tr.find('td', attrs={"class": "st-kdratio"})
    player_st['kdratio'] = kdratio_get(kdratio_td.text if kdratio_td else '')

    # 'adr': leading integer part only
    player_st['adr'] = num_get(cell_text('adr'))

    # everything else is stored as the raw cell text
    for item in ('deaths', 'kddiff', 'fkdiff', 'rating'):
        player_st[item] = cell_text(item)

    return {player_id : player_st}

def team_stat_scrape(st_table):
    """Scrape one team's stats table.

    Returns a pair ``(team, player_stat_table)``:
      * ``team`` is ``{teamname: {player_id: player_name, ...}}``
      * ``player_stat_table`` is ``{player_id: stats}`` with ``'teamname'``
        added to every player's stats dict.
    """
    teamname = st_table.find("th", attrs={"class": "st-teamname"}).text

    # one entry per table row, keyed by player id
    player_stat_table = {}
    for row in st_table.find("tbody").find_all("tr"):
        player_stat_table.update(player_stat(row))

    # tag each player with the team and build the id -> name roster
    teammates = {}
    for pid, stats in player_stat_table.items():
        stats['teamname'] = teamname
        teammates[pid] = stats['player']

    return {teamname : teammates}, player_stat_table

def stats_table_scrape(stats_table):
    """Scrape every team table in ``stats_table``.

    Returns ``(teams, player_stat_table)``: the list of per-team rosters in
    table order, and one merged ``{player_id: stats}`` dict for all teams.
    """
    teams = []
    player_stat_table = {}
    for team, team_players in map(team_stat_scrape, stats_table):
        teams.append(team)
        player_stat_table.update(team_players)
    return teams, player_stat_table

def map_match_scrape(contentCol):
    """Scrape all ``table.stats-table`` elements below ``contentCol``.

    Returns ``{'team': teams, 'stat': player_stats}`` as produced by
    stats_table_scrape().
    """
    tables = contentCol.find_all('table', attrs={'class': 'stats-table'})
    teams, player_st = stats_table_scrape(tables)
    return {'team': teams, 'stat': player_st}

### entry point of the match scrape

In [9]:

def match_scrape(brower, match_info, timedelay=0.1):
    """Scrape one map-stats page and return everything collected for it.

    Args:
        brower: splinter browser used to load the page. The misspelt name is
            kept so existing callers keep working; the function now uses this
            argument instead of the module-level ``browser``.
        match_info: dict with at least ``'href'`` (relative URL of the page)
            and ``'map'``; all of its items are copied into the result.
        timedelay: unused; kept for signature compatibility.

    Returns:
        A dict holding the items of ``match_info`` plus ``'match_id'`` and
        ``'mapstat'``; or an empty dict when the page did not load within
        3 seconds or no match id could be extracted.
    """
    href = match_info['href']

    # match performance main page
    brower.visit(hltv_url + href)
    if not brower.is_element_present_by_css("div.contentCol", wait_time=3):
        print(f'href={href}, 3 sec timeout, skip it...')
        return {}

    contentCol = BeautifulSoup(brower.html, 'html.parser')       \
                 .find('div', attrs={'class': 'contentCol'})

    # 'match_id' from 'match-page-link'
    # Pre-bind the link so the error report below can always print it.
    match_page_link = None
    try:
        match_page_link = contentCol.find('a', attrs={'class': 'match-page-link'})
        match_id = re.findall(r'/(\d+)/', match_page_link['href'])[0]
    except (AttributeError, TypeError, KeyError, IndexError):
        # no contentCol / no link / link without href / href without an id
        print(f'exception: href={href}')
        print(f'   match_page_link={match_page_link}')
        return {}

    # hold all output scrape results.
    match_result = dict(match_info)
    match_result['match_id'] = match_id

    # mapstat
    left = contentCol.find('div', attrs={'class': 'team-left'})
    right = contentCol.find('div', attrs={'class': 'team-right'})
    left_rounds = int(left.div.text)
    right_rounds = int(right.div.text)
    # on equal scores the left team is reported as the winner
    winner = left if left_rounds >= right_rounds else right

    mapstat = {
        'map': match_info['map'],
        'total_rounds': left_rounds + right_rounds,
        'winner': get_team_name(winner.a),
        'winner_id': get_team_id(winner.a),
        'win_rounds': winner.div.text,
        # mapstatid
        'mapstatid': re.findall(r"/mapstatsid/(\d+)/", href)[0],
    }
    # scrape all others
    mapstat.update(map_match_scrape(contentCol))

    match_result['mapstat'] = mapstat
    return match_result

## convert scrape results to pandas DataFrames

In [10]:
def df_padding(df, column_name, pad):
    """Set column ``column_name`` of ``df`` to ``pad`` on every row.

    ``df`` is modified in place and also returned.
    """
    row_count = len(df)
    df[column_name] = [pad for _ in range(row_count)]
    return df

def match_stat_df_get(match_res):
    """Turn one match_scrape() result into a per-player DataFrame.

    Args:
        match_res: dict with the keys ``'mapstat'``, ``'match_id'`` and the
            timestamp key (``key_timestamp``).

    Returns:
        A DataFrame with one row per player of ``mapstat['stat']``, extended
        by the match-wide columns ``match_id``, ``winner``, ``winner_id``,
        ``total_rounds``, ``mapstatid``, ``win_rounds``, ``map`` and ``date``.
        When ``mapstat`` is empty, an empty frame with the base columns.
    """
    # Local import: read_json() is given a file-like object because passing a
    # literal JSON string to it is deprecated in recent pandas releases.
    from io import StringIO

    # define the order.
    column_names = ['player_id','player','match_id','date',
                'kills', 'hs', 'assists', 'flash_assists', 'kdratio', 'deaths', 'kddiff', 'adr', 'fkdiff']
    player_match_df = pd.DataFrame(columns=column_names)

    mapstat = match_res['mapstat']
    if not mapstat:
        return player_match_df

    player_map_df = pd.read_json(StringIO(json.dumps(mapstat['stat'])), orient='index')

    # match-wide values repeated on every player row
    t_str = datetime.utcfromtimestamp(match_res[key_timestamp]).strftime('%Y-%m-%d')
    constant_columns = [
        ('match_id', match_res['match_id']),
        ('winner', mapstat['winner']),
        ('winner_id', mapstat['winner_id']),
        ('total_rounds', mapstat['total_rounds']),
        ('mapstatid', mapstat['mapstatid']),
        ('win_rounds', mapstat['win_rounds']),
        ('map', mapstat['map']),
        ('date', t_str),
    ]
    for name, value in constant_columns:
        player_map_df = df_padding(player_map_df, name, value)

    return pd.concat([player_match_df, player_map_df])

## main routine

In [11]:
# Columns of the CSV written by match_list_scrape(), in output order: the list
# is used both for the header row and to select the columns of every appended chunk.
output_df_column_names = [
     'player_id',
     'player',
     'match_id',
     'date',
     'kills',
     'hs',
     'assists',
     'flash_assists',
     'kdratio',
     'deaths',
     'kddiff',
     'adr',
     'fkdiff',
     'rating',
     'teamname',
     'winner',
     'winner_id',
     'total_rounds',
     'mapstatid',
     'win_rounds',
     'map']

In [12]:
def match_list_scrape(browser, match_urls):
    """Scrape every match in ``match_urls`` and write the player rows to a CSV.

    The file is named ``player_mapstat_<first>_<last>.csv`` after the UTC
    dates (``YYYYMMDD``) of the first and last entry of ``match_urls``. A
    header row is written first, then one chunk of rows is appended per
    successfully scraped match. Progress is printed every 20 matches.

    Args:
        browser: splinter browser passed on to match_scrape().
        match_urls: list of dicts as returned by match_list_get(); nothing is
            done when it is empty.

    Returns:
        None.
    """
    if not match_urls:
        return

    start_time = time.time()
    # the first and last url's dates go into the output filename
    first_day = datetime.utcfromtimestamp(match_urls[0]['ts']).strftime('%Y%m%d')
    last_day = datetime.utcfromtimestamp(match_urls[-1]['ts']).strftime('%Y%m%d')
    output_filename = 'player_mapstat_' + first_day + '_' + last_day + '.csv'

    # create the file with only the header row
    pd.DataFrame(columns=output_df_column_names).to_csv(output_filename, index=False)

    # append the file.
    count = 0
    for match_info in match_urls:
        res = match_scrape(browser, match_info)
        if len(res) == 0:
            continue
        df = match_stat_df_get(res)
        # index=False to match the header row: writing the index here would
        # give every data row one more field than the header has.
        df[output_df_column_names].to_csv(output_filename, mode='a', header=False, index=False)
        count += 1
        if ((count % 20) == 0):
            elapsed_time = time.time() - start_time
            print(f'scraped {count} matches in {elapsed_time} seconds.')

    elapsed_time = time.time() - start_time
    print(f'Done. scraped {count} matches in {elapsed_time} seconds.')

# config and run

In [13]:
# Window boundaries as 'YYYY/M/D' strings, newest first. The main loop takes each
# pair of consecutive entries as one (start, stop) window and scrapes it separately.
dates = [
#    '2017/1/1',
#    '2016/12/1',
#    '2016/11/1',
#    '2016/10/1',
    '2016/9/15',
    '2016/9/1',
    '2016/8/1',
    '2016/7/1',
    '2016/6/1',
    '2016/5/1',
    '2016/4/1',
    '2016/3/1',
    '2016/2/1',
    '2016/1/1',
]
# Relative URL of the listing page where the first window starts scanning.
#next_url = '/stats/matches'
next_url = '/stats/matches?offset=60500'

In [14]:
# Main driver: walk the configured date windows, newest first, and for each one
# collect the match list and scrape it into its own CSV file.
# current date and time
now = datetime.now()
now_str = now.strftime("%m/%d/%Y, %H:%M:%S")
print(f"now {now_str}")

start_date = None
for stop_date in dates:
    # the first entry only seeds start_date; windows are formed from the second entry on
    if not start_date:
        start_date = stop_date
        continue
    # NOTE(review): strptime() yields naive datetimes, so timestamp() interprets them in
    # the machine's local time zone, while the scraping functions format timestamps with
    # utcfromtimestamp() -- confirm which zone the window bounds are meant to be in.
    start_ts = datetime.timestamp(datetime.strptime(start_date, "%Y/%m/%d"))
    stop_ts = datetime.timestamp(datetime.strptime(stop_date, "%Y/%m/%d"))

    # next_url is carried over so the next window resumes on the page where this one stopped
    match_urls, next_url = match_list_get(browser, start_ts, stop_ts, next_url)
    print(f'total {len(match_urls)}, next_url={next_url}\n\n')


    match_list_scrape(browser, match_urls)

#    print(f'{start_date}, {stop_date}')
    # this window's stop bound becomes the next window's start bound
    start_date = stop_date

    # current date and time
    now = datetime.now()
    now_str = now.strftime("%m/%d/%Y, %H:%M:%S")
    print(f"now {now_str}\n\n")

now 06/22/2020, 09:46:52
Included 1st match timestamp: 2016/09/13 02:30:00
Excluded 1st match timestamp: 2016/09/01 03:15:00
total 481, next_url=/stats/matches?offset=60950


scraped 20 matches in 20.356404304504395 seconds.
exception: href=/stats/matches/mapstatsid/35104/selfless-vs-prospects
   match_page_link=<a class="match-page-link button">More info on match page</a>
exception: href=/stats/matches/mapstatsid/35103/prospects-vs-selfless
   match_page_link=<a class="match-page-link button">More info on match page</a>
scraped 40 matches in 45.65940546989441 seconds.
scraped 60 matches in 68.93723464012146 seconds.
scraped 80 matches in 93.44985866546631 seconds.
scraped 100 matches in 116.42384004592896 seconds.
scraped 120 matches in 139.12050032615662 seconds.
scraped 140 matches in 163.63787150382996 seconds.
scraped 160 matches in 188.99546122550964 seconds.
scraped 180 matches in 214.67884016036987 seconds.
scraped 200 matches in 239.27339100837708 seconds.
scraped 220 matches 

scraped 500 matches in 797.0614252090454 seconds.
scraped 520 matches in 824.446540594101 seconds.
scraped 540 matches in 854.448207616806 seconds.
scraped 560 matches in 885.7122383117676 seconds.
scraped 580 matches in 916.3217477798462 seconds.
scraped 600 matches in 947.6625578403473 seconds.
scraped 620 matches in 978.1076319217682 seconds.
scraped 640 matches in 1008.324684381485 seconds.
scraped 660 matches in 1037.7654695510864 seconds.
scraped 680 matches in 1068.0549824237823 seconds.
scraped 700 matches in 1097.5023412704468 seconds.
Done. scraped 702 matches in 1099.9854834079742 seconds.
now 06/22/2020, 11:05:41


Included 1st match timestamp: 2016/06/01 00:00:00
Excluded 1st match timestamp: 2016/05/01 02:00:00
total 862, next_url=/stats/matches?offset=64400


scraped 20 matches in 31.6463725566864 seconds.
scraped 40 matches in 69.34147477149963 seconds.
scraped 60 matches in 105.19058585166931 seconds.
scraped 80 matches in 144.99957966804504 seconds.
scraped 100 matche

Done. scraped 1050 matches in 1868.2878139019012 seconds.
now 06/22/2020, 12:28:47


Included 1st match timestamp: 2016/03/01 03:45:00
Excluded 1st match timestamp: 2016/02/01 05:30:00
total 891, next_url=/stats/matches?offset=67250


scraped 20 matches in 34.23291897773743 seconds.
scraped 40 matches in 73.62267112731934 seconds.
scraped 60 matches in 114.1174647808075 seconds.
scraped 80 matches in 158.40781593322754 seconds.
scraped 100 matches in 196.05120944976807 seconds.
scraped 120 matches in 228.43134927749634 seconds.
scraped 140 matches in 262.9115786552429 seconds.
scraped 160 matches in 295.6639199256897 seconds.
scraped 180 matches in 330.1697075366974 seconds.
scraped 200 matches in 366.244211435318 seconds.
scraped 220 matches in 400.0146098136902 seconds.
scraped 240 matches in 433.5201983451843 seconds.
scraped 260 matches in 468.64870595932007 seconds.
scraped 280 matches in 503.75652551651 seconds.
scraped 300 matches in 538.5182464122772 seconds.
scraped 320 matche