In [1]:
# Imports: standard library first, then third-party (pandas, BeautifulSoup, Splinter)
import io
import json
import os
import time
from datetime import datetime, timezone

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

In [2]:
# return a filename (with path) such that it is accessible. 
def get_fullname(filename):
    # as long as it is accessible.
    if os.path.isfile(filename):
        return filename

    # search current working directory.
    cur_working_dir = os.getcwd()
    for i in range(50):
        for (path, dir, files) in os.walk(cur_working_dir):
            if filename in files:
                return os.path.join(path, filename)

        # check parent directory
        parent = os.path.dirname(cur_working_dir)
        if cur_working_dir == parent:
            break;
        cur_working_dir = parent

    # did not found, simply return.
    print(f"file {filename} not found !!!")
    return filename


# Initiate headless driver for deployment
browser = Browser("chrome", executable_path=get_fullname("chromedriver.exe"), headless=True)
time.sleep(0.1)
#browser = Browser('chrome', executable_path=get_fullname("chromedriver.exe"))

In [3]:
# base url
hltv_url = 'https://www.hltv.org'

In [4]:
key_perf_table = 'perf_table'
key_AWP_kills = 'AWP_kills'
## top level
key_timestamp = 'ts'
key_map_stats = 'map_stats'
key_teams = 'teams'
key_player_st = 'match_player_st'

## Visit '/stats/matches'

In [5]:
# href="/stats/teams/4471/bemyfRAG"
# href="/stats/players/15090/PwnAlone"
def get_team_id(a):
    sl = a['href'].split('/')
    return sl[3]
get_player_id = get_team_id

def get_team_name(a):
    sl = a['href'].split('/')
    return sl[4]
get_player_name = get_team_name

#### only timestamp, map (if single map) and href are used later.

In [6]:
# Visit the site to Scrape
next_url = '/stats/matches'
match_urls = []
for cnt in range(0):
    # Visit the site to Scrape
    browser.visit(hltv_url + next_url)
    browser.is_element_present_by_tag('div.contentCol', wait_time=3)
    content = BeautifulSoup(browser.html, 'html.parser').find('div', attrs={"class": "contentCol"})

    # all matches in this page
    trs = content.tbody.find_all('tr')
    
    for tr in trs:
        match_info= {} 
        
        # data-unix
        td = tr.find('td', attrs={'class': 'date-col'})
        match_info['href'] = td.a['href']
        t = int(td.div['data-unix'])/1000
        match_info['ts'] = t
        t_str = datetime.utcfromtimestamp(t).strftime('%Y-%m-%d')
        # teams
        tds = tr.find_all('td', attrs={'class': 'team-col'})
        match_info['team1'] = tds[0].a.text
        match_info['team2'] = tds[1].a.text
        # map
        match_info['map'] = tr.find('div', attrs={'class': 'dynamic-map-name-full'}).text
        match_urls.append(match_info)        

    # ========================================================================
    # next page
    # ========================================================================
    # next page
    a = content.find('a', attrs={"class": "pagination-next"}, href=True)
    if not a:
        print(f'Done {len(match_urls)} matches scraping.')
        break
    next_url = a['href']
    
    # only during development
    if cnt == 10:
        print(f'Truncate the match lists !!!!!')
        print(f'Change next_url from: {next_url}')        
        next_url = '/stats/matches?offset=79700'
        print(f'                  to: {next_url}')   

In [7]:
# Peek at the first three collected match records.
match_urls[:3]

[]

In [8]:
## overwrite with two matches only
match_urls = [
    # single map match
    {'href': '/stats/matches/mapstatsid/104026/new-england-whalers-vs-levitate',
     'ts': 1591920300.0,
     'team1': 'New England Whalers',
     'team2': 'Levitate',
     'map': 'Dust2'},
    # multi-map match
    {'href': '/stats/matches/mapstatsid/103972/espada-vs-navi-junior',
        'ts': 1591884000.0,
     'team1': 'Espada',
     'team2': 'NAVI Junior',
     'map': 'Inferno'}
]
match_urls

[{'href': '/stats/matches/mapstatsid/104026/new-england-whalers-vs-levitate',
  'ts': 1591920300.0,
  'team1': 'New England Whalers',
  'team2': 'Levitate',
  'map': 'Dust2'},
 {'href': '/stats/matches/mapstatsid/103972/espada-vs-navi-junior',
  'ts': 1591884000.0,
  'team1': 'Espada',
  'team2': 'NAVI Junior',
  'map': 'Inferno'}]

#### AWP kills

In [9]:
def AWP_kills(performance):
    AWP_content = performance.find(id='AWP-content')
    trs = AWP_content.table.tbody.find_all('tr')

    # column 
    team1 = []
    for td in trs[0].find_all('td', attrs={'class': 'team1'}):
        team1.append(get_player_id(td.a))

    AWP_kills = {}
    for idx, tr in enumerate(trs):
        if idx == 0:
            continue
        tds = tr.find_all('td')
        row = {}
        for i, td in enumerate(tds):
            if (i == 0):
                player_id = get_player_id(td.a)
            else:
                row[team1[i - 1]] = td.text
        AWP_kills[player_id] = row
    # debug only
#    display(pd.read_json(json.dumps(AWP_kills), orient='index'))
    return {key_AWP_kills: AWP_kills}

#### player performance

In [10]:
def player_performance(performance):
    highlighted_players = performance.find('div', attrs='player-overview')      \
                                .find_all('div', attrs='highlighted-player')

    # 
    performance_table = {}
    for player in highlighted_players:
        row = {}

        # performance items
        fact = player.find('div', attrs='facts')
        json_data = json.loads(fact.div['data-fusionchart-config'])
        for item in json_data['dataSource']['data']:
            row[item['label']] = item['value']

        ## player_id
        player_id = '0'
        headline = player.find('div', attrs='headline')
        str = headline.a['href']
        for e in str.split('/'):
            if e.isnumeric():
                player_id = int(e)
        performance_table[player_id] = row

    # debug only
#    display(pd.read_json(json.dumps(performance_table), orient='index'))        
    return {key_perf_table: performance_table} 

In [11]:
def player_map_performance_get(performance):
    res_d = {}
    res_d.update(AWP_kills(performance))
    res_d.update(player_performance(performance))
    return res_d

#### player stat.

In [12]:
# parse player stat.
def player_stat(player_tr):
    items = ['player', 'kills', 'assists', 'deaths', 'kdratio', 'kddiff', 'adr', 'fkdiff', 'adr']
    player_st = {}
    # parse all as text for now.
    for item in items:
        player_st[item] = player_tr.find('td', attrs={"class": "st-" + item}).text

    # player
    st_player = player_tr.find('td', attrs={"class": "st-player"})
    player_id = get_player_id(player_tr.a)
    return {player_id : player_st}

def team_stat_scrape(st_table):
    # 
    teamname = st_table.find("th", attrs={"class": "st-teamname"}).text

    # team member st.
    player_stat_table = {}
    for tr in st_table.find("tbody").find_all("tr"):
        player_stat_table.update(player_stat(tr))

    # build team/teammate map
    teammates = {}
    for id in player_stat_table:
        player = {id: player_stat_table[id]['player']}
        teammates.update(player)
    
    # {teamname: {{id: name}, .., {id: name}}
    team = {teamname : teammates}
    return team, player_stat_table

def stats_table_scrape(stats_table):
    ### team
    teams = []
    player_stat_table = {}
    for stat_table in stats_table:
        team, player_st = team_stat_scrape(stat_table)
        teams.append(team)
        player_stat_table.update(player_st)

#    display(pd.read_json(json.dumps(teams[0]), orient='index'))
#    display(pd.read_json(json.dumps(teams[1]), orient='index'))
#    display(pd.read_json(json.dumps(player_stat_table), orient='index'))
    return teams, player_stat_table

### Entry point of the match scrape

In [13]:
def match_scrape(brower, match_info):
    # hold all output scrape results.
    match_result = {key_timestamp : match_info['ts']}

    # performance page
    sl = match_info['href'].split('/')
    sl.insert(3, 'performance')
    performance_url = '/'.join(sl)
    
    # match performace main page
    browser.visit(hltv_url + performance_url)
    browser.is_element_present_by_css("div.stats-match-performance", wait_time=3)
    performance = BeautifulSoup(browser.html, 'html.parser')            \
                .find('div', attrs={'class': 'stats-match-performance'})

    ## the match maps
    as_ = []
    match_maps = performance.find('div', attrs={'class': 'stats-match-maps'})
    if match_maps:
        ## now find the each map stats.
        as_ = match_maps.find_all('a', href=True)

    map_st_list = []
    if len(as_) == 0:
        print(f'Single map match')
        map_st = {}
        map_st['map'] = match_info['map']
        map_st['stat'] = player_map_performance_get(performance)
        map_st_list.append(map_st)
        ## 
        match_url = performance_url.replace('performance/', '')

    else:
        print(f'Multi-map match')
        perf_mapstat = []
        for a in as_:
            mapstat = {}
            mapstat['href'] = a['href']
            mapstat['map'] = a.find('div', attrs={'class': 'dynamic-map-name-full'}).text
            perf_mapstat.append(mapstat)
        print(mapstat)
        
        # save one webpage visit.
        for p_st in perf_mapstat:
            if (p_st["href"] != performance_url):
                continue
            print()
            print(f'map = {p_st["map"]}')
            map_st = {}
            map_st['map'] = p_st['map']
            map_st['stat'] = player_map_performance_get(performance)
            map_st_list.append(map_st)
            
        # visit other pages
        for p_st in perf_mapstat:
            if (p_st["href"] == performance_url):
                continue
            browser.visit(hltv_url + p_st["href"])
            browser.is_element_present_by_tag('div.stats-match-performance', wait_time=3)
            performance = BeautifulSoup(browser.html, 'html.parser')          \
                        .find('div', attrs={'class': 'stats-match-performance'})
            print()
            print(f'map = {p_st["map"]}:')
            map_st = {}
            map_st['map'] = p_st['map']
            map_st['stat'] = player_map_performance_get(performance)
            map_st_list.append(map_st)        
        ## 
        match_url = ''
        for p_st in perf_mapstat:
            if 'mapstatsid' not in p_st['href']:
                match_url = p_st['href'].replace('performance/', '')

    ## add to scrape result
    match_result[key_map_stats] = map_st_list
                
    ## match player stat.
    print(match_url)
    browser.visit(hltv_url + match_url)
    browser.is_element_present_by_tag('table.stats-table', wait_time=3)
    # Parse the HTML
    stats_table = BeautifulSoup(browser.html, 'html.parser')           \
                       .find_all("table", attrs={"class": "stats-table"})
    teams, player_st = stats_table_scrape(stats_table)

    ## add to scrape result
    match_result[key_teams] = teams
    match_result[key_player_st] = player_st
    
    return match_result

In [14]:
match_scratch_results = []
for match_info in match_urls:
    match_result = match_scrape(browser, match_info)
    match_scratch_results.append(match_result)

Single map match
/stats/matches/mapstatsid/104026/new-england-whalers-vs-levitate
Multi-map match
{'href': '/stats/matches/performance/mapstatsid/103983/navi-junior-vs-espada', 'map': 'Dust2'}

map = Inferno

map = Best of 5:

map = Overpass:

map = Dust2:
/stats/matches/74713/espada-vs-navi-junior


In [15]:
# Shut down the browser session; the cells below only read match_scratch_results.
browser.quit()

## Scrape is all done

In [16]:
def display_scrape_result(match_result):
    # match time
    t_str = datetime.utcfromtimestamp(match_result[key_timestamp]).strftime('%Y-%m-%d')
    print(f'match hold on {t_str}')
    
    # teams in the match
    for team in match_result[key_teams]:
        display(pd.read_json(json.dumps(team), orient='index'))

    # player match stat:
    print(f'players match stat:')
    player_match_st = match_result[key_player_st]
    display(pd.read_json(json.dumps(player_match_st), orient='index'))

    # map stat
    for map_st in match_result[key_map_stats]:
        # map
        map = map_st['map']
        # stat
        stat = map_st['stat']
        print(f"map '{map}' AWP_killing matrix:")
        display(pd.read_json(json.dumps(stat[key_AWP_kills]), orient='index'))
        print(f"map '{map}' performace table:")
        display(pd.read_json(json.dumps(stat[key_perf_table]), orient='index'))
        print()

    

### result decode

In [19]:
# Raw result dictionary of the first scraped match.
res = match_scratch_results[0]
print(res)

{'ts': 1591920300.0, 'map_stats': [{'map': 'Dust2', 'stat': {'AWP_kills': {'15090': {'16546': '2:0', '16453': '4:0', '16902': '2:2', '16647': '0:0', '17372': '3:0'}, '13250': {'16546': '0:0', '16453': '0:0', '16902': '0:3', '16647': '0:0', '17372': '0:0'}, '18115': {'16546': '0:0', '16453': '0:0', '16902': '1:3', '16647': '0:0', '17372': '0:0'}, '12102': {'16546': '0:0', '16453': '0:0', '16902': '0:3', '16647': '0:0', '17372': '0:0'}, '16318': {'16546': '0:0', '16453': '0:0', '16902': '0:1', '16647': '0:0', '17372': '0:0'}}, 'perf_table': {17372: {'KPR': '2.07', 'DPR': '1.16', 'KAST': '1.09', 'Impact': '2.01', 'ADR': '1.98', 'Rating 2.0': '1.66'}, 16647: {'KPR': '1.20', 'DPR': '1.18', 'KAST': '1.75', 'Impact': '1.06', 'ADR': '1.19', 'Rating 2.0': '1.28'}, 16546: {'KPR': '1.18', 'DPR': '1.39', 'KAST': '0.95', 'Impact': '0.91', 'ADR': '1.14', 'Rating 2.0': '1.11'}, 16453: {'KPR': '0.82', 'DPR': '1.08', 'KAST': '1.27', 'Impact': '1.16', 'ADR': '1.02', 'Rating 2.0': '1.07'}, 16902: {'KPR':

In [20]:
# Render the first match result as tables.
display_scrape_result(res)

match hold on 2020-06-12


Unnamed: 0,17372,16647,16546,16453,16902
Levitate,FaNg,Bwills,Sneaky,SPAMMER,KmZ


Unnamed: 0,15090,12102,16318,18115,13250
New England Whalers,PwnAlone,djay,BOOBIE,Rampage,ben1337


players match stat:


Unnamed: 0,player,kills,assists,deaths,kdratio,kddiff,adr,fkdiff
17372,FaNg,30 (22),1 (0),18,69.2%,12,115.0,3
16647,Bwills,20 (10),3 (0),18,92.3%,2,80.9,1
16546,Sneaky,20 (13),1 (0),15,65.4%,5,76.9,0
16453,SPAMMER,14 (10),13 (4),18,76.9%,-4,72.8,1
16902,KmZ,15 (2),5 (1),17,76.9%,-2,64.3,-1
15090,PwnAlone,19 (4),4 (0),21,73.1%,-2,85.3,0
12102,djay,22 (12),3 (3),17,65.4%,5,74.5,-1
16318,BOOBIE,17 (9),6 (0),20,57.7%,-3,79.1,-1
18115,Rampage,15 (10),6 (1),22,76.9%,-7,83.2,-1
13250,ben1337,13 (7),6 (3),20,61.5%,-7,55.4,-1


map 'Dust2' AWP_killing matrix:


Unnamed: 0,16546,16453,16902,16647,17372
15090,2:0,4:0,2:2,0:0,3:0
13250,0:0,0:0,0:3,0:0,0:0
18115,0:0,0:0,1:3,0:0,0:0
12102,0:0,0:0,0:3,0:0,0:0
16318,0:0,0:0,0:1,0:0,0:0


map 'Dust2' performace table:


Unnamed: 0,KPR,DPR,KAST,Impact,ADR,Rating 2.0
17372,2.07,1.16,1.09,2.01,1.98,1.66
16647,1.2,1.18,1.75,1.06,1.19,1.28
16546,1.18,1.39,0.95,0.91,1.14,1.11
16453,0.82,1.08,1.27,1.16,1.02,1.07
16902,0.81,1.22,1.27,0.73,0.82,0.97
15090,1.16,0.69,1.14,1.13,1.22,1.07
12102,1.25,1.15,0.95,0.91,1.0,1.05
16318,0.89,0.72,0.79,1.15,1.1,0.93
18115,0.92,0.55,1.25,0.56,1.19,0.89
13250,0.65,0.79,0.84,0.77,0.65,0.74





In [21]:
# Raw result dictionary of the second scraped match.
res = match_scratch_results[1]
print(res)

{'ts': 1591884000.0, 'map_stats': [{'map': 'Inferno', 'stat': {'AWP_kills': {'17008': {'19673': '0:0', '17305': '0:0', '9081': '0:0', '17306': '0:0', '8125': '0:0'}, '18225': {'19673': '0:0', '17305': '0:0', '9081': '0:0', '17306': '1:0', '8125': '0:0'}, '16871': {'19673': '2:0', '17305': '3:0', '9081': '2:0', '17306': '2:1', '8125': '2:0'}, '18987': {'19673': '0:0', '17305': '0:0', '9081': '0:0', '17306': '0:1', '8125': '0:0'}, '14175': {'19673': '0:0', '17305': '0:0', '9081': '0:2', '17306': '1:1', '8125': '0:0'}}, 'perf_table': {8125: {'KPR': '1.22', 'DPR': '0.77', 'KAST': '1.19', 'Impact': '1.14', 'ADR': '1.21', 'Rating 2.0': '1.11'}, 19673: {'KPR': '0.74', 'DPR': '0.66', 'KAST': '0.58', 'Impact': '0.48', 'ADR': '0.87', 'Rating 2.0': '0.67'}, 9081: {'KPR': '0.59', 'DPR': '0.45', 'KAST': '0.59', 'Impact': '0.87', 'ADR': '0.73', 'Rating 2.0': '0.65'}, 17306: {'KPR': '0.57', 'DPR': '0.41', 'KAST': '0.49', 'Impact': '0.56', 'ADR': '0.61', 'Rating 2.0': '0.53'}, 17305: {'KPR': '0.18', '

In [22]:
# Render the second match result as tables (one pair of tables per map entry).
display_scrape_result(res)

match hold on 2020-06-11


Unnamed: 0,8125,17306,9081,19673,17305
Espada,Dima,degster,S0tF1k,Patsi,FinigaN


Unnamed: 0,16871,18987,14175,17008,18225
NAVI Junior,Gospadarov,B1T,Aunkere,Topa,KAPACHO


players match stat:


Unnamed: 0,player,kills,assists,deaths,kdratio,kddiff,adr,fkdiff
8125,Dima,55 (35),12 (2),52,72.7%,3,75.7,2
17306,degster,51 (20),16 (4),54,66.2%,-3,79.8,4
9081,S0tF1k,47 (30),18 (7),55,62.3%,-8,67.8,-4
19673,Patsi,51 (24),10 (2),57,63.6%,-6,74.4,-1
17305,FinigaN,31 (18),13 (9),50,59.7%,-19,44.5,-4
16871,Gospadarov,60 (25),23 (14),49,75.3%,11,83.3,6
18987,B1T,64 (49),10 (6),47,68.8%,17,85.5,-4
14175,Aunkere,54 (31),9 (2),48,81.8%,6,77.6,5
17008,Topa,52 (35),16 (9),45,75.3%,7,70.4,-4
18225,KAPACHO,37 (25),14 (2),48,77.9%,-11,63.4,0


map 'Inferno' AWP_killing matrix:


Unnamed: 0,19673,17305,9081,17306,8125
17008,0:0,0:0,0:0,0:0,0:0
18225,0:0,0:0,0:0,1:0,0:0
16871,2:0,3:0,2:0,2:1,2:0
18987,0:0,0:0,0:0,0:1,0:0
14175,0:0,0:0,0:2,1:1,0:0


map 'Inferno' performace table:


Unnamed: 0,KPR,DPR,KAST,Impact,ADR,Rating 2.0
8125,1.22,0.77,1.19,1.14,1.21,1.11
19673,0.74,0.66,0.58,0.48,0.87,0.67
9081,0.59,0.45,0.59,0.87,0.73,0.65
17306,0.57,0.41,0.49,0.56,0.61,0.53
17305,0.18,0.67,0.44,0.38,0.23,0.38
16871,1.96,1.77,1.47,1.9,1.54,1.73
17008,1.35,2.06,1.89,1.37,1.37,1.61
18987,1.48,1.44,1.11,1.35,1.43,1.36
14175,1.26,1.59,1.47,1.11,1.18,1.32
18225,0.79,1.96,1.74,0.94,0.9,1.26



map 'Best of 5' AWP_killing matrix:


Unnamed: 0,19673,17305,9081,17306,8125
17008,0:0,0:0,0:0,0:2,0:0
18225,0:0,0:0,0:0,1:5,1:0
16871,6:0,6:1,6:1,6:5,4:0
18987,0:0,0:0,0:0,1:4,0:0
14175,0:0,0:0,1:2,2:4,0:0


map 'Best of 5' performace table:


Unnamed: 0,KPR,DPR,KAST,Impact,ADR,Rating 2.0
8125,1.13,1.07,1.14,1.28,1.03,1.13
17306,1.04,0.99,0.96,1.23,1.11,1.07
9081,0.92,0.95,0.87,1.08,0.88,0.94
19673,0.97,0.91,0.9,0.9,1.0,0.94
17305,0.48,1.2,0.8,0.67,0.46,0.72
16871,1.33,1.22,1.21,1.18,1.19,1.23
18987,1.32,1.23,1.03,1.19,1.24,1.2
14175,1.09,1.35,1.4,1.05,1.08,1.19
17008,0.98,1.43,1.21,1.14,0.93,1.14
18225,0.68,1.28,1.28,0.72,0.79,0.95



map 'Overpass' AWP_killing matrix:


Unnamed: 0,19673,17305,9081,17306,8125
17008,0:0,0:0,0:0,0:2,0:0
18225,0:0,0:0,0:0,0:3,1:0
16871,1:0,2:1,0:0,1:3,0:0
18987,0:0,0:0,0:0,0:2,0:0
14175,0:0,0:0,0:0,0:1,0:0


map 'Overpass' performace table:


Unnamed: 0,KPR,DPR,KAST,Impact,ADR,Rating 2.0
9081,1.9,1.52,1.34,1.71,1.84,1.66
17306,1.25,1.58,1.22,1.24,0.97,1.25
8125,0.95,1.11,1.25,1.03,0.94,1.06
19673,0.81,0.98,0.89,0.55,1.1,0.87
17305,0.56,1.45,1.06,0.54,0.46,0.81
14175,1.26,1.24,1.33,1.01,1.19,1.21
16871,1.04,1.0,1.18,0.95,1.12,1.06
18987,0.9,1.21,0.99,0.92,0.75,0.95
17008,0.94,0.86,0.87,0.81,1.06,0.91
18225,0.52,0.83,0.96,0.56,0.65,0.71



map 'Dust2' AWP_killing matrix:


Unnamed: 0,19673,17305,9081,17306,8125
17008,0:0,0:0,0:0,0:0,0:0
18225,0:0,0:0,0:0,0:2,0:0
16871,3:0,1:0,4:1,3:1,2:0
18987,0:0,0:0,0:0,1:1,0:0
14175,0:0,0:0,1:0,1:2,0:0


map 'Dust2' performace table:


Unnamed: 0,KPR,DPR,KAST,Impact,ADR,Rating 2.0
17306,1.32,0.98,1.24,1.74,1.82,1.42
8125,1.29,1.27,1.04,1.54,1.08,1.24
19673,1.33,1.06,1.23,1.48,1.07,1.24
17305,0.74,1.44,1.07,0.95,0.73,0.99
9081,0.51,0.89,0.8,0.62,0.43,0.65
18987,1.58,1.08,1.03,1.27,1.55,1.3
14175,0.88,1.23,1.43,1.01,0.98,1.1
17008,0.78,1.39,1.04,1.18,0.58,0.99
16871,1.1,0.94,1.06,0.76,1.0,0.97
18225,0.73,1.09,1.22,0.66,0.83,0.91



