In [1]:
# Standard library
import io
import json
import os
import re
import time
from datetime import datetime

# Third-party: pandas, BeautifulSoup (HTML parsing) and Splinter (browser automation)
import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

### init browser

In [2]:
# return a filename (with path) such that it is accessible.
def get_fullname(filename):
    """Locate ``filename`` and return a path under which it can be opened.

    The name is returned unchanged when it already refers to an existing
    file.  Otherwise the current working directory is searched recursively,
    then each ancestor directory in turn (at most 50 levels, stopping at the
    filesystem root).  The first match reported by ``os.walk`` wins.

    Args:
        filename: A path, or a bare file name to search for.

    Returns:
        The path of the first match, or ``filename`` unchanged (after
        printing a warning) when nothing was found.
    """
    # Already reachable as given: nothing to search for.
    if os.path.isfile(filename):
        return filename

    search_root = os.getcwd()
    searched_child = None  # subtree that was fully scanned one level down
    for _ in range(50):
        for path, dirnames, files in os.walk(search_root):
            if searched_child is not None and path == search_root:
                # Do not descend again into the subtree scanned on the
                # previous level; it is already known not to hold the file.
                dirnames[:] = [
                    d for d in dirnames
                    if os.path.join(path, d) != searched_child
                ]
            if filename in files:
                return os.path.join(path, filename)

        # Nothing found here: widen the search to the parent directory.
        parent = os.path.dirname(search_root)
        if parent == search_root:
            # dirname() no longer changes the path: filesystem root reached.
            break
        searched_child, search_root = search_root, parent

    # Not found: warn and hand the name back unchanged.
    print(f"file {filename} not found !!!")
    return filename


# Initiate headless driver for deployment.
# The chromedriver binary is located with get_fullname(), which searches the
# working directory and then its ancestors for "chromedriver.exe".
browser = Browser("chrome", executable_path=get_fullname("chromedriver.exe"), headless=True)
# Same call without headless=True:
#browser = Browser('chrome', executable_path=get_fullname("chromedriver.exe"))
# Short pause after the browser has been created.
time.sleep(0.1)

In [3]:
# Base URL of the site; the relative hrefs used by the scraping functions are
# appended to it before each browser.visit().
hltv_url = 'https://www.hltv.org'
# NOTE(review): match_list_scrape() assigns its own *local* output_filename,
# so this module-level value is not updated by the code visible here.
output_filename = ''

In [4]:
# Dictionary key names.
# NOTE(review): of these, only key_timestamp is referenced by the code visible
# in this notebook (in match_stat_df_get); the others may be leftovers - confirm.
key_perf_table = 'perf_table'
key_AWP_kills = 'AWP_kills'
## top level
key_timestamp = 'ts'  # same key that match_list_get() stores the match time under
key_map_stats = 'map_stats'
key_teams = 'teams'
key_player_st = 'match_player_st'

## visit '/stats/matches'

In [5]:
# href="/stats/teams/4471/bemyfRAG"
# href="/stats/players/15090/PwnAlone"
def get_team_id(a):
    """Return the id segment of a stats link.

    ``a`` is anything subscriptable by ``'href'`` (e.g. an anchor tag) whose
    href looks like ``/stats/teams/4471/bemyfRAG``; the piece at index 3 of
    the ``/``-split href (``'4471'``) is returned as a string.
    """
    return a['href'].split('/')[3]


# Player links have the same layout, so the same extractor is reused.
get_player_id = get_team_id

def get_team_name(a):
    """Return the name segment of a stats link.

    ``a`` is anything subscriptable by ``'href'`` (e.g. an anchor tag) whose
    href looks like ``/stats/teams/4471/bemyfRAG``; the piece at index 4 of
    the ``/``-split href (``'bemyfRAG'``) is returned.
    """
    return a['href'].split('/')[4]


# Player links have the same layout, so the same extractor is reused.
get_player_name = get_team_name

#### only timestamp, map (if single map) and href are used later.

In [6]:
def match_list_get(browser, start, stop, next_url = '/stats/matches'):
    """Collect the listed matches whose timestamp lies in ``(stop, start]``.

    Beginning at ``next_url`` the match listing is visited page by page.
    Rows newer than ``start`` are skipped, rows down to (but excluding)
    ``stop`` are collected, and the scan ends at the first row that is not
    newer than ``stop``.  This relies on the listing being ordered newest
    first.

    Args:
        browser: Splinter browser used to load the listing pages.
        start: Newer bound in Unix seconds (inclusive).
        stop: Older bound in Unix seconds (exclusive).  If ``start < stop``
            the two values are swapped.
        next_url: Site-relative URL of the first listing page to visit.

    Returns:
        ``(match_urls, next_url)``.  ``match_urls`` is a list of dicts with
        the keys ``'href'``, ``'ts'`` (Unix seconds) and ``'map'``.
        ``next_url`` is the listing page to continue from: when the scan
        ended at ``stop`` or on the last page it is the page visited last, so
        a following call re-reads that page and cannot miss its remaining
        rows.
    """
    # start must be the newer (larger) bound; swap when given the other way.
    if start < stop:
        start, stop = stop, start

    first_reported = False
    all_done = False
    match_urls = []

    for cnt in range(10000):
        # Visit the site to Scrape
        browser.visit(hltv_url + next_url)
        # 'div.contentCol' is a CSS selector, not a tag name, so wait for it
        # with the CSS lookup (the same call match_scrape uses).
        browser.is_element_present_by_css('div.contentCol', wait_time=3)
        content = BeautifulSoup(browser.html, 'html.parser').find('div', attrs={"class": "contentCol"})

        # all matches in this page
        for tr in content.tbody.find_all('tr'):
            td = tr.find('td', attrs={'class': 'date-col'})
            # data-unix is in milliseconds; work in seconds from here on.
            t = int(td.div['data-unix'])/1000
            if t > start:
                # newer than the requested window
                continue
            if t <= stop:
                # first row at or below the lower bound: the window is done
                all_done = True
                t_str = datetime.utcfromtimestamp(t).strftime('%Y/%m/%d %H:%M:%S')
                print(f'Excluded 1st match timestamp: {t_str}')
                break

            if not first_reported:
                first_reported = True
                t_str = datetime.utcfromtimestamp(t).strftime('%Y/%m/%d %H:%M:%S')
                print(f'Included 1st match timestamp: {t_str}')

            match_info = {}
            match_info['href'] = td.a['href']
            match_info['ts'] = t
            match_info['map'] = tr.find('div', attrs={'class': 'dynamic-map-name-full'}).text
            match_urls.append(match_info)

        # Keep next_url on the current page when stopping in the middle of
        # it, so the following window overlaps instead of missing matches.
        if all_done:
            break

        # next page
        a = content.find('a', attrs={"class": "pagination-next"}, href=True)
        if not a:
            print(f'Done {len(match_urls)} matches scraping.')
            break
        next_url = a['href']

        # only during development
        # NOTE(review): after 1001 pages this jumps to a hard-coded offset;
        # confirm whether this shortcut is still wanted.
        if cnt == 1000:
            print(f'Truncate the match lists !!!!!')
            print(f'Change next_url from: {next_url}')
            next_url = '/stats/matches?offset=79700'
            print(f'                  to: {next_url}')

    # return all urls and next_page_url
    return match_urls, next_url

#### player stat.

In [7]:
def percentage_get_(str):
    """Extract the first usable number from a piece of text.

    The text is checked for three shapes, in this order of priority:

    1. a decimal followed by ``%`` (``'45.5%'``) -> value divided by 100,
    2. a decimal (``'1.25'``)                    -> value as float,
    3. an integer (``'17'``)                     -> value as float.

    Note that an integer followed by ``%`` has no decimal point, so it falls
    through to case 3 and is returned undivided.

    Args:
        str: Text to search.  (The parameter shadows the builtin; the name
            is kept so existing callers are unaffected.)

    Returns:
        The number as ``float``, or ``0`` when the text holds no digits.
    """
    text = str
    # A decimal must contain at least one digit: a lone '.' (which the
    # pattern '\d*\.\d*' would accept) cannot be converted by float().
    decimal = r'\d+\.\d*|\d*\.\d+'

    # 'nn.nn%'
    found = re.search(r'(' + decimal + r')%', text)
    if found:
        return float(found.group(1)) / 100
    # 'nn.nn'
    found = re.search(decimal, text)
    if found:
        return float(found.group(0))
    # 'nn'
    found = re.search(r'\d+', text)
    if found:
        return float(found.group(0))
    return 0

# kratio
def kdratio_get(str):
    """Return the number parsed by percentage_get_(), rounded to 4 decimals."""
    ratio = percentage_get_(str)
    return round(ratio, 4)

# assists
def assists_get(str):
    """Return the first two integers found in a text, as strings.

    Args:
        str: Cell text such as ``'5 (2)'``.  (The parameter shadows the
            builtin; the name is kept so existing callers are unaffected.)

    Returns:
        A ``(first, second)`` tuple of digit strings; a missing number is
        reported as ``'0'``.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    nums = re.findall(r"\d+", str)
    first = nums[0] if nums else '0'
    second = nums[1] if len(nums) > 1 else '0'
    return first, second
# kills
def kills_get(str):
    """Split a kills cell into its first two numbers; same parsing as assists_get()."""
    first, second = assists_get(str)
    return first, second

# mostly for 'adr'
def num_get(str):
    """Return the first run of digits in a text as a string, or ``'0'``.

    Args:
        str: Text to search.  (The parameter shadows the builtin; the name
            is kept so existing callers are unaffected.)
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    found = re.search(r"\d+", str)
    return found.group(0) if found else '0'

In [8]:
# parse player stat.
def player_stat(player_tr):
    """Parse one player's row of a team stats table.

    Args:
        player_tr: The ``<tr>`` element of the player (BeautifulSoup tag).

    Returns:
        ``{player_id: stats}`` where ``stats`` has the keys ``player_id``,
        ``player``, ``kills``, ``hs``, ``assists``, ``flash_assists``,
        ``kdratio``, ``adr``, ``deaths``, ``kddiff``, ``fkdiff`` and
        ``rating``.
    """
    def cell_text(name):
        # Text of the row's <td class="st-<name>"> cell.
        return player_tr.find('td', attrs={"class": "st-" + name}).text

    player_st = {}
    # player: id and name both come from the link in the 'st-player' cell
    st_player = player_tr.find('td', attrs={"class": "st-player"})
    player_id = get_player_id(st_player.a)
    player_st['player_id'] = player_id
    player_st['player'] = get_player_name(st_player.a) # st_player.text

    # 'kills' / 'assists': first and second number found in the cell
    player_st['kills'], player_st['hs'] = kills_get(cell_text('kills'))
    player_st['assists'], player_st['flash_assists'] = assists_get(cell_text('assists'))

    # 'kdratio': read from its own cell.  Previously the assists text was
    # reused here, so the stored value was the assists count.
    kdratio_td = player_tr.find('td', attrs={"class": "st-kdratio"})
    # A missing cell is treated as empty text, which parses to 0.
    player_st['kdratio'] = kdratio_get(kdratio_td.text if kdratio_td is not None else '')

    # 'adr': first run of digits only
    player_st['adr'] = num_get(cell_text('adr'))

    # all others are stored as the raw cell text
    for item in ('deaths', 'kddiff', 'fkdiff', 'rating'):
        player_st[item] = cell_text(item)

    return {player_id : player_st}

def team_stat_scrape(st_table):
    """Parse one team's stats table.

    Args:
        st_table: The team's stats ``<table>`` element (BeautifulSoup tag).

    Returns:
        ``(team, player_stat_table)``: ``team`` is
        ``{teamname: {player_id: player_name, ...}}`` and
        ``player_stat_table`` maps each player id to its stats dict, which
        additionally receives a ``'teamname'`` entry.
    """
    teamname = st_table.find("th", attrs={"class": "st-teamname"}).text

    # One entry per body row, keyed by player id.
    player_stat_table = {}
    for row in st_table.find("tbody").find_all("tr"):
        player_stat_table.update(player_stat(row))

    # Tag every player with the team and build the id -> name roster.
    teammates = {}
    for player_id, stats in player_stat_table.items():
        stats['teamname'] = teamname
        teammates[player_id] = stats['player']

    return {teamname: teammates}, player_stat_table

def stats_table_scrape(stats_table):
    """Parse every team stats table of a page.

    Args:
        stats_table: Iterable of stats ``<table>`` elements.

    Returns:
        ``(teams, player_stat_table)``: the list of per-team dicts returned
        by team_stat_scrape() and the merged player-id -> stats mapping.
    """
    scraped = [team_stat_scrape(table) for table in stats_table]

    teams = [team for team, _ in scraped]
    player_stat_table = {}
    for _, player_st in scraped:
        player_stat_table.update(player_st)

    return teams, player_stat_table

def map_match_scrape(contentCol):
    """Scrape the team/player stats out of a page's content column.

    Returns:
        ``{'team': teams, 'stat': player_stats}`` as produced by
        stats_table_scrape() for all ``table.stats-table`` elements.
    """
    tables = contentCol.find_all('table', attrs={'class': 'stats-table'})
    teams, player_st = stats_table_scrape(tables)
    return {'team': teams, 'stat': player_st}

### entry point for scraping a single match

In [9]:

def match_scrape(brower, match_info, timedelay=0.1):
    """Scrape the stats page of one listed match.

    Args:
        brower: Splinter browser used to load the page.  (The misspelt name
            is kept so existing callers are unaffected; it is now actually
            used instead of the notebook-global ``browser``.)
        match_info: Dict with at least ``'href'`` (site-relative URL that
            contains ``/mapstatsid/<id>/``) and ``'map'``, as produced by
            match_list_get().
        timedelay: Unused; kept for backward compatibility.

    Returns:
        ``{}`` when the page content did not show up within 3 seconds.
        Otherwise the entries of ``match_info`` plus ``'match_id'`` and
        ``'mapstat'`` (map name, round totals, winner, mapstatid and the
        team/player stats from map_match_scrape()).
    """
    match_result = {}

    # match performance main page
    brower.visit(hltv_url + match_info['href'])
    if not brower.is_element_present_by_css("div.contentCol", wait_time=3):
        href = match_info['href']
        print(f'href={href}, 3 sec timeout, skip it...')
        return match_result

    # hold all output scrape results.
    match_result.update(match_info)

    contentCol = BeautifulSoup(brower.html, 'html.parser')       \
                 .find('div', attrs={'class': 'contentCol'})

    # 'match_id' from 'match-page-link'
    match_page_link = contentCol.find('a', attrs={'class': 'match-page-link'})
    match_result['match_id'] = re.findall(r'/(\d+)/', match_page_link['href'])[0]

    # Round counts of both sides; on equal counts the left team is reported
    # as the winner.
    left = contentCol.find('div', attrs={'class': 'team-left'})
    right = contentCol.find('div', attrs={'class': 'team-right'})
    left_rounds = int(left.div.text)
    right_rounds = int(right.div.text)
    winner = left if left_rounds >= right_rounds else right

    mapstat = {
        'map': match_info['map'],
        'total_rounds': left_rounds + right_rounds,
        'winner': get_team_name(winner.a),
        'winner_id': get_team_id(winner.a),
        # stored as the raw cell text, not converted to int
        'win_rounds': winner.div.text,
        'mapstatid': re.findall(r"/mapstatsid/(\d+)/", match_info['href'])[0],
    }
    # scrape all others
    mapstat.update(map_match_scrape(contentCol))

    match_result['mapstat'] = mapstat
    return match_result

## convert scrape results to pandas DataFrames

In [10]:
def df_padding(df, column_name, pad):
    """Set column ``column_name`` of ``df`` to ``pad`` in every row.

    The frame is modified in place and also returned.
    """
    n_rows = len(df.index)
    df[column_name] = [pad for _ in range(n_rows)]
    return df

def match_stat_df_get(match_res):
    """Turn one match_scrape() result into a per-player DataFrame.

    Args:
        match_res: Dict with the keys ``'mapstat'``, ``'match_id'`` and the
            timestamp key (``key_timestamp``), as returned by match_scrape().

    Returns:
        A DataFrame with one row per player: the player's stats plus the
        match-level values (match id, winner, rounds, map, date) repeated in
        every row.  The columns listed in ``column_names`` come first.  When
        ``match_res['mapstat']`` is empty, an empty frame with only those
        columns is returned.
    """
    # define the order.
    column_names = ['player_id','player','match_id','date',
                'kills', 'hs', 'assists', 'flash_assists', 'kdratio', 'deaths', 'kddiff', 'adr', 'fkdiff']
    player_match_df = pd.DataFrame(columns=column_names)

    mapstat = match_res['mapstat']
    if mapstat:
        # read_json() is given a file-like object: passing the JSON text
        # itself is deprecated in recent pandas releases.
        stat_json = io.StringIO(json.dumps(mapstat['stat']))
        player_map_df = pd.read_json(stat_json, orient='index')

        # Match-level values, repeated for every player row.
        t_str = datetime.utcfromtimestamp(match_res[key_timestamp]).strftime('%Y-%m-%d')
        paddings = [
            ('match_id', match_res['match_id']),
            ('winner', mapstat['winner']),
            ('winner_id', mapstat['winner_id']),
            ('total_rounds', mapstat['total_rounds']),
            ('mapstatid', mapstat['mapstatid']),
            ('win_rounds', mapstat['win_rounds']),
            ('map', mapstat['map']),
            ('date', t_str),
        ]
        for column_name, value in paddings:
            player_map_df = df_padding(player_map_df, column_name, value)

        # Concatenating onto the empty frame puts column_names first.
        player_match_df = pd.concat([player_match_df, player_map_df])
    return player_match_df

## main routine

In [11]:
# Columns of the output CSV, in file order.  match_list_scrape() writes the
# header row from this list and selects exactly these columns from every
# per-match frame before appending it.
output_df_column_names = [
     'player_id',
     'player',
     'match_id',
     'date',
     'kills',
     'hs',
     'assists',
     'flash_assists',
     'kdratio',
     'deaths',
     'kddiff',
     'adr',
     'fkdiff',
     'rating',
     'teamname',
     'winner',
     'winner_id',
     'total_rounds',
     'mapstatid',
     'win_rounds',
     'map']

In [12]:
def match_list_scrape(browser, match_urls):
    """Scrape every listed match and append its player rows to a CSV file.

    The file is named ``player_mapstat_<first>_<last>.csv`` after the UTC
    dates (``YYYYMMDD``) of the first and last entry of ``match_urls`` and is
    created in the current working directory.  Matches for which
    match_scrape() returns an empty result are skipped.

    Args:
        browser: Splinter browser handed on to match_scrape().
        match_urls: List of match-info dicts from match_list_get().

    Returns:
        None.  Nothing is written when ``match_urls`` is empty.
    """
    if len(match_urls) == 0:
        return

    start_time = time.time()
    # first and last entry's date go into the output filename
    first_day = datetime.utcfromtimestamp(match_urls[0]['ts']).strftime('%Y%m%d')
    last_day = datetime.utcfromtimestamp(match_urls[-1]['ts']).strftime('%Y%m%d')
    output_filename = 'player_mapstat_' + first_day + '_' + last_day + '.csv'

    # create the file with the header row only
    pd.DataFrame(columns=output_df_column_names).to_csv(output_filename, index=False)

    # append one block of player rows per match
    count = 0
    for match_info in match_urls:
        res = match_scrape(browser, match_info)
        if len(res) == 0:
            continue
        df = match_stat_df_get(res)
        # index=False keeps the rows aligned with the header, which was
        # written without an index column; the player id is already present
        # as the 'player_id' column.
        df[output_df_column_names].to_csv(output_filename, mode='a', header=False, index=False)
        count += 1
        if ((count % 10) == 0):
            elapsed_time = time.time() - start_time
            print(f'scraped {count} matches in {elapsed_time} seconds.')

    elapsed_time = time.time() - start_time
    print(f'Done. scraped {count} matches in {elapsed_time} seconds.')

# config and run

In [13]:
# Date boundaries ('YYYY/M/D'), listed newest first.  The main loop below
# pairs each date with the one following it and scrapes the matches between
# the two; match_list_scrape() writes one CSV file per pair.
# The commented-out entries are date ranges that are not part of this run.
dates = [
#    '2017/7/1',
#    '2017/6/15',
#    '2017/6/1',
#    '2017/5/15',
#    '2017/5/1',
#    '2017/4/15',
#    '2017/4/1',
#    '2017/3/15',
#    '2017/3/1',
#    '2017/2/15',
#    '2017/2/1',
#    '2017/1/15',
#    '2017/1/1'

# '2017/12/15',
 '2017/12/1',
 '2017/11/15',
 '2017/11/1',
 '2017/10/15',
 '2017/10/1',
 '2017/9/15',
 '2017/9/1',
 '2017/8/15',
 '2017/8/1',
 '2017/7/15',
 '2017/7/1'
]
# Listing page (site-relative URL) at which the first scan starts.
#next_url = '/stats/matches'
next_url = '/stats/matches?offset=40750'

In [14]:
# Main driver: print the wall-clock time, then scrape one date window per
# pair of consecutive entries in `dates`.
# current date and time
now = datetime.now()
now_str = now.strftime("%m/%d/%Y, %H:%M:%S")
print(f"now {now_str}")

start_date = None
for stop_date in dates:
    # The first entry only provides the initial upper bound.
    if not start_date:
        start_date = stop_date
        continue
    # NOTE(review): strptime() yields naive datetimes, which timestamp()
    # interprets in the local time zone, while the scraping functions format
    # timestamps with utcfromtimestamp() - confirm the boundaries are meant
    # to be local midnight.
    start_ts = datetime.timestamp(datetime.strptime(start_date, "%Y/%m/%d"))
    stop_ts = datetime.timestamp(datetime.strptime(stop_date, "%Y/%m/%d"))

    # next_url is carried over, so each window resumes on the listing page
    # where the previous one stopped.
    match_urls, next_url = match_list_get(browser, start_ts, stop_ts, next_url)
    print(f'total {len(match_urls)}, next_url={next_url}\n\n')


    # Scrape every match of the window into its own CSV file.
    match_list_scrape(browser, match_urls)

#    print(f'{start_date}, {stop_date}')
    # This window's lower bound is the next window's upper bound.
    start_date = stop_date

    # current date and time
    now = datetime.now()
    now_str = now.strftime("%m/%d/%Y, %H:%M:%S")
    print(f"now {now_str}\n\n")

now 06/21/2020, 18:53:05
Included 1st match timestamp: 2017/12/01 02:15:00
Excluded 1st match timestamp: 2017/11/15 08:00:00
total 658, next_url=/stats/matches?offset=41450


scraped 10 matches in 15.444824934005737 seconds.
scraped 20 matches in 31.19064998626709 seconds.
scraped 30 matches in 46.01625084877014 seconds.
scraped 40 matches in 62.49770975112915 seconds.
scraped 50 matches in 77.76841950416565 seconds.
scraped 60 matches in 91.26409864425659 seconds.
scraped 70 matches in 106.50798845291138 seconds.
scraped 80 matches in 121.56010866165161 seconds.
scraped 90 matches in 135.49291801452637 seconds.
scraped 100 matches in 149.17249727249146 seconds.
scraped 110 matches in 163.27400422096252 seconds.
scraped 120 matches in 176.48364448547363 seconds.
scraped 130 matches in 191.14807558059692 seconds.
scraped 140 matches in 205.88872480392456 seconds.
scraped 150 matches in 220.32271027565002 seconds.
scraped 160 matches in 236.21842646598816 seconds.
scraped 170 matches in 

scraped 910 matches in 1471.520277261734 seconds.
scraped 920 matches in 1488.4249353408813 seconds.
scraped 930 matches in 1503.6242582798004 seconds.
scraped 940 matches in 1520.4924938678741 seconds.
scraped 950 matches in 1538.3786249160767 seconds.
scraped 960 matches in 1554.6327874660492 seconds.
scraped 970 matches in 1570.7183527946472 seconds.
scraped 980 matches in 1586.9253089427948 seconds.
scraped 990 matches in 1604.24183177948 seconds.
scraped 1000 matches in 1619.1186492443085 seconds.
scraped 1010 matches in 1634.321040391922 seconds.
Done. scraped 1011 matches in 1635.5141372680664 seconds.
now 06/21/2020, 19:37:50


Included 1st match timestamp: 2017/11/01 02:40:00
Excluded 1st match timestamp: 2017/10/15 05:00:00
total 983, next_url=/stats/matches?offset=43450


scraped 10 matches in 16.056468725204468 seconds.
scraped 20 matches in 32.69865107536316 seconds.
scraped 30 matches in 48.04499268531799 seconds.
scraped 40 matches in 61.24613976478577 seconds.
scraped 5

scraped 460 matches in 758.5960397720337 seconds.
scraped 470 matches in 775.4023892879486 seconds.
scraped 480 matches in 794.195122718811 seconds.
scraped 490 matches in 809.8922953605652 seconds.
scraped 500 matches in 826.6014924049377 seconds.
scraped 510 matches in 842.4238829612732 seconds.
scraped 520 matches in 857.8937892913818 seconds.
scraped 530 matches in 874.1323208808899 seconds.
scraped 540 matches in 890.3810336589813 seconds.
scraped 550 matches in 907.5496261119843 seconds.
scraped 560 matches in 922.5836420059204 seconds.
scraped 570 matches in 937.8915450572968 seconds.
scraped 580 matches in 953.9149324893951 seconds.
scraped 590 matches in 970.5448579788208 seconds.
scraped 600 matches in 986.9479908943176 seconds.
scraped 610 matches in 1003.4608552455902 seconds.
scraped 620 matches in 1018.9324309825897 seconds.
scraped 630 matches in 1034.7910361289978 seconds.
scraped 640 matches in 1050.1492717266083 seconds.
scraped 650 matches in 1067.4566140174866 secon

scraped 40 matches in 53.72389030456543 seconds.
scraped 50 matches in 67.20447301864624 seconds.
scraped 60 matches in 81.8720293045044 seconds.
scraped 70 matches in 96.62696647644043 seconds.
scraped 80 matches in 111.25966811180115 seconds.
scraped 90 matches in 125.61813759803772 seconds.
scraped 100 matches in 139.44996237754822 seconds.
scraped 110 matches in 153.34534454345703 seconds.
scraped 120 matches in 166.6421937942505 seconds.
scraped 130 matches in 180.39696502685547 seconds.
scraped 140 matches in 194.9520959854126 seconds.
scraped 150 matches in 210.02984404563904 seconds.
scraped 160 matches in 222.85404539108276 seconds.
scraped 170 matches in 236.16945934295654 seconds.
scraped 180 matches in 251.05633687973022 seconds.
scraped 190 matches in 264.32267355918884 seconds.
scraped 200 matches in 278.15466237068176 seconds.
scraped 210 matches in 292.5275549888611 seconds.
scraped 220 matches in 306.1624834537506 seconds.
scraped 230 matches in 320.4932897090912 secon

scraped 900 matches in 1347.8016078472137 seconds.
scraped 910 matches in 1364.599235534668 seconds.
scraped 920 matches in 1380.7921946048737 seconds.
scraped 930 matches in 1393.6764028072357 seconds.
scraped 940 matches in 1408.146897315979 seconds.
scraped 950 matches in 1424.3039305210114 seconds.
scraped 960 matches in 1438.1468527317047 seconds.
scraped 970 matches in 1453.5372622013092 seconds.
scraped 980 matches in 1468.9443230628967 seconds.
scraped 990 matches in 1484.472290277481 seconds.
scraped 1000 matches in 1498.859309911728 seconds.
scraped 1010 matches in 1514.8940696716309 seconds.
Done. scraped 1014 matches in 1520.0054559707642 seconds.
now 06/21/2020, 21:42:45


Included 1st match timestamp: 2017/08/15 06:10:00
Excluded 1st match timestamp: 2017/07/31 23:40:00
total 476, next_url=/stats/matches?offset=47600


scraped 10 matches in 16.30454421043396 seconds.
scraped 20 matches in 31.02695083618164 seconds.
scraped 30 matches in 47.55993604660034 seconds.
scraped 

scraped 260 matches in 446.2573187351227 seconds.
scraped 270 matches in 460.30045461654663 seconds.
scraped 280 matches in 476.4118142127991 seconds.
scraped 290 matches in 490.6621825695038 seconds.
scraped 300 matches in 504.9419844150543 seconds.
scraped 310 matches in 519.2802481651306 seconds.
scraped 320 matches in 533.7038404941559 seconds.
scraped 330 matches in 550.8151776790619 seconds.
scraped 340 matches in 564.6412551403046 seconds.
scraped 350 matches in 580.7424464225769 seconds.
scraped 360 matches in 595.3854446411133 seconds.
scraped 370 matches in 610.1029553413391 seconds.
scraped 380 matches in 625.9483554363251 seconds.
scraped 390 matches in 641.9904406070709 seconds.
scraped 400 matches in 658.1928946971893 seconds.
scraped 410 matches in 673.1702256202698 seconds.
scraped 420 matches in 687.157763004303 seconds.
scraped 430 matches in 702.0441083908081 seconds.
scraped 440 matches in 717.6279864311218 seconds.
scraped 450 matches in 732.6269924640656 seconds.
