In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
import os
import pandas as pd
import json
import time
from datetime import datetime

In [2]:
# Resolve a bare filename to a path that actually exists on disk.
def get_fullname(filename):
    """Return a usable path for `filename`.

    Lookup order:
      1. `filename` itself, if it already points at an existing file.
      2. The current working directory and everything below it.
      3. Each ancestor directory in turn (at most 50 levels), including
         everything below that ancestor.

    Returns the first match as a joined path. If nothing is found, a message
    is printed and `filename` is returned unchanged.
    """
    # Already reachable as given.
    if os.path.isfile(filename):
        return filename

    search_root = os.getcwd()
    searched_child = None  # directory fully searched in the previous iteration
    for _ in range(50):
        for path, subdirs, files in os.walk(search_root):
            if searched_child is not None and path == search_root:
                # The previous root was already walked without a match; removing
                # it from `subdirs` in place stops os.walk from descending into it again.
                child_name = os.path.basename(searched_child)
                if child_name in subdirs:
                    subdirs.remove(child_name)
            if filename in files:
                return os.path.join(path, filename)

        # Move one level up; stop once the filesystem root is reached.
        parent = os.path.dirname(search_root)
        if parent == search_root:
            break
        searched_child = search_root
        search_root = parent

    # Not found anywhere: report it and hand back the input unchanged.
    print(f"file {filename} not found !!!")
    return filename


# Start a headless Chrome session through splinter. The driver binary is
# located with get_fullname(), so "chromedriver.exe" may live in the working
# directory, below it, or under any ancestor directory.
browser = Browser("chrome", executable_path=get_fullname("chromedriver.exe"), headless=True)
# NOTE(review): presumably gives the driver a moment to finish starting — confirm it is needed.
time.sleep(1)
# Non-headless alternative (visible browser window), kept for local debugging:
#browser = Browser('chrome', executable_path=get_fullname("chromedriver.exe"))

In [3]:
# Base URL of the site; every relative path visited below is appended to it.
hltv_url = 'https://www.hltv.org'

## Visit '/stats/matches'

In [4]:
# Walk the paginated '/stats/matches' listing and collect one record per table row.
# Each record holds the row's stats link ('href'), its timestamp in seconds ('ts'),
# both team names and the map name.
MAX_LISTING_PAGES = 2000

# Development-only shortcut: after this page index, jump to a late offset so the
# run finishes quickly instead of paging through the whole listing.
DEV_TRUNCATE_AT_PAGE = 10
DEV_TRUNCATE_URL = '/stats/matches?offset=79700'

next_url = '/stats/matches'
match_urls = []
for cnt in range(MAX_LISTING_PAGES):
    page_url = hltv_url + next_url
    browser.visit(page_url)
    # 'div.contentCol' is a CSS selector (tag + class), so wait with the CSS
    # matcher, the same way the performance page is awaited elsewhere in this notebook.
    browser.is_element_present_by_css('div.contentCol', wait_time=3)
    content = BeautifulSoup(browser.html, 'html.parser').find('div', attrs={"class": "contentCol"})
    if content is None or content.tbody is None:
        # Fail with the offending URL instead of an opaque AttributeError;
        # everything collected so far is still available in match_urls.
        raise RuntimeError(f'No match table found at {page_url}')

    # all matches in this page
    for tr in content.tbody.find_all('tr'):
        date_td = tr.find('td', attrs={'class': 'date-col'})
        team_tds = tr.find_all('td', attrs={'class': 'team-col'})
        match_urls.append({
            'href': date_td.a['href'],
            # 'data-unix' is divided by 1000 to store seconds
            'ts': int(date_td.div['data-unix']) / 1000,
            'team1': team_tds[0].a.text,
            'team2': team_tds[1].a.text,
            'map': tr.find('div', attrs={'class': 'dynamic-map-name-full'}).text,
        })

    # next page: stop when the pagination link has no href
    a = content.find('a', attrs={"class": "pagination-next"}, href=True)
    if not a:
        print(f'Done {len(match_urls)} matches scraping.')
        break
    next_url = a['href']

    # only during development
    if cnt == DEV_TRUNCATE_AT_PAGE:
        print('Truncate the match lists !!!!!')
        print(f'Change next_url from: {next_url}')
        next_url = DEV_TRUNCATE_URL
        print(f'                  to: {next_url}')

Truncate the match lists !!!!!
Change next_url from: /stats/matches?offset=550
                  to: /stats/matches?offset=79700
Done 699 matches scraping.


In [5]:
# Peek at the first three scraped records to sanity-check their shape.
match_urls[:3]

[{'href': '/stats/matches/mapstatsid/104039/sharks-vs-9z',
  'ts': 1591928400.0,
  'team1': 'Sharks',
  'team2': '9z',
  'map': 'Nuke'},
 {'href': '/stats/matches/mapstatsid/104038/aa-vs-ex-morning-light',
  'ts': 1591927800.0,
  'team1': 'AA',
  'team2': 'ex-Morning Light',
  'map': 'Mirage'},
 {'href': '/stats/matches/mapstatsid/104033/teamone-vs-recon-5',
  'ts': 1591927800.0,
  'team1': 'TeamOne',
  'team2': 'Recon 5',
  'map': 'Nuke'}]

In [6]:
# Replace the scraped list with two hand-picked records so the per-match
# scraping below can be exercised quickly: one single-map match and one
# multi-map match.
match_urls = [
    # single map match
    {
        'href': '/stats/matches/mapstatsid/104026/new-england-whalers-vs-levitate',
        'ts': 1591920300.0,
        'team1': 'New England Whalers',
        'team2': 'Levitate',
        'map': 'Dust2',
    },
    # multi-map match
    {
        'href': '/stats/matches/mapstatsid/103972/espada-vs-navi-junior',
        'ts': 1591884000.0,
        'team1': 'Espada',
        'team2': 'NAVI Junior',
        'map': 'Inferno',
    },
]
match_urls

[{'href': '/stats/matches/mapstatsid/104026/new-england-whalers-vs-levitate',
  'ts': 1591920300.0,
  'team1': 'New England Whalers',
  'team2': 'Levitate',
  'map': 'Dust2'},
 {'href': '/stats/matches/mapstatsid/103972/espada-vs-navi-junior',
  'ts': 1591884000.0,
  'team1': 'Espada',
  'team2': 'NAVI Junior',
  'map': 'Inferno'}]

#### AWP kills

In [7]:
def AWP_kills(performance):
    """Print the AWP kill matrix found in a parsed performance section.

    The element with id 'AWP-content' is expected to hold a table whose first
    row lists the team1 players (cells with class 'team1') and whose remaining
    rows each start with a team2 player followed by one score cell per team1
    player.

    Parameters
    ----------
    performance : parsed HTML node exposing `find(id=...)`.

    Returns None; all results are printed. When the section or its table is
    missing a short message is printed instead of raising.
    """
    AWP_content = performance.find(id='AWP-content')
    table = getattr(AWP_content, 'table', None)
    tbody = getattr(table, 'tbody', None)
    rows = tbody.find_all('tr') if tbody is not None else []
    if not rows:
        print('AWP kills table not found')
        return

    header_row, *player_rows = rows

    # Header row: team1 player names.
    print('team1 players:', end=" ")
    for td in header_row.find_all('td', attrs={'class': 'team1'}):
        print(f'{td.a.text},', end=" ")
    print()

    # Remaining rows: first cell is the team2 player, the rest are scores.
    for tr in player_rows:
        tds = tr.find_all('td')
        if not tds:
            # Skip rows without cells so a stale name from the previous row is never reported.
            continue
        player = tds[0].a.text
        scores = [td.text for td in tds[1:]]
        print(f'team2 player = {player}, scores = {scores}')

#### player performance

In [8]:
def player_performance(performance):
    """Print one summary dict per highlighted player in a performance section.

    For every 'highlighted-player' block inside 'player-overview' the dict holds:
      * 'name'      - the `title` of the player's image,
      * 'player_id' - the last all-digit segment of the image `src` path,
                      or 0 when the path has no such segment,
      * one entry per item in the chart config's dataSource.data list
        (label -> value), read from the 'data-fusionchart-config' JSON.

    Returns None; each dict is printed.

    NOTE(review): recorded runs of this notebook show player_id 0 for several
    players, so the image path does not always carry an id.
    """
    overview = performance.find('div', attrs='player-overview')

    for card in overview.find_all('div', attrs='highlighted-player'):
        portrait = card.find('div', attrs='picture-and-chart').div.img
        record = {'name': portrait['title']}

        # Keep the last numeric path segment, falling back to 0.
        numeric_segments = [
            int(segment)
            for segment in portrait['src'].split('/')
            if segment.isnumeric()
        ]
        record['player_id'] = numeric_segments[-1] if numeric_segments else 0

        # Chart configuration is embedded as JSON in a data attribute.
        chart_holder = card.find('div', attrs='facts').div
        chart_config = json.loads(chart_holder['data-fusionchart-config'])
        record.update(
            (entry['label'], entry['value'])
            for entry in chart_config['dataSource']['data']
        )
        print(record)

In [9]:
def player_map_performance_get(performance):
    """Print the per-map player data found in one parsed performance section.

    Delegates to AWP_kills and then player_performance, passing `performance`
    through unchanged. Returns None.
    """
    AWP_kills(performance)
    player_performance(performance)

#### Player stats

In [10]:
# Parse one player's row of a stats table.
def player_stat(player_tr):
    """Return the stat cells of one player row as a dict of raw strings.

    Each value is the text of the `<td>` whose class is 'st-' + key. A cell
    that is not present in the row yields None instead of raising.

    Parameters
    ----------
    player_tr : parsed table row exposing `find('td', attrs=...)`.

    Returns
    -------
    dict mapping each stat name to the cell text (or None).
    """
    # The original list named 'adr' twice, which read the same cell twice.
    # NOTE(review): the final column is assumed to be 'st-rating' — confirm
    # against the page markup.
    items = ['player', 'kills', 'assists', 'deaths', 'kdratio', 'kddiff', 'adr', 'fkdiff', 'rating']
    player_st = {}
    # Values are kept as text for now; no numeric conversion is done here.
    for item in items:
        cell = player_tr.find('td', attrs={"class": "st-" + item})
        player_st[item] = cell.text if cell is not None else None
    return player_st

def team_stat(st_table):
    """Summarise one team's stats table.

    Returns a dict with:
      * 'teamname'  - text of the 'st-teamname' header cell,
      * 'teammates' - one player_stat() dict per row of the table body.
    """
    team_name = st_table.find("th", attrs={"class": "st-teamname"}).text
    player_rows = st_table.find("tbody").find_all("tr")
    return {
        'teamname': team_name,
        # team member stats, in table order
        'teammates': [player_stat(row) for row in player_rows],
    }

### Entry point for match scraping

In [11]:
def _load_match_performance(active_browser, url_path):
    """Visit hltv_url + url_path and return the parsed 'stats-match-performance' div (None if absent)."""
    active_browser.visit(hltv_url + url_path)
    # The selector is CSS (tag + class), so wait with the CSS matcher.
    active_browser.is_element_present_by_css('div.stats-match-performance', wait_time=3)
    return BeautifulSoup(active_browser.html, 'html.parser') \
        .find('div', attrs={'class': 'stats-match-performance'})


def match_scrape(brower, match_info):
    """Scrape and print performance data and team stats for one match record.

    Steps:
      1. Build the performance URL by inserting 'performance' as the fourth
         path segment of match_info['href'] and load that page.
      2. If the page lists several maps ('stats-match-maps' links), print the
         per-map player data for the already-loaded map first, then visit and
         print every other linked page. Otherwise treat it as a single-map match.
      3. Visit the match stats page and print team_stat() for every
         'stats-table' found there.

    Parameters
    ----------
    brower : splinter browser session used for all page visits. The misspelled
        name is kept so existing callers keep working.
    match_info : dict with at least an 'href' key (relative stats URL).

    Returns None; all results are printed.
    """
    # Use the session that was passed in rather than a module-level global.
    active_browser = brower

    # performance page
    path_parts = match_info['href'].split('/')
    path_parts.insert(3, 'performance')
    performance_url = '/'.join(path_parts)

    # match performance main page
    performance = _load_match_performance(active_browser, performance_url)
    if performance is None:
        print(f'No performance section found at {performance_url}')
        return

    ## the match maps
    match_maps = performance.find('div', attrs={'class': 'stats-match-maps'})
    map_links = match_maps.find_all('a', href=True) if match_maps else []

    # Default stats page: this record's own URL. The multi-map branch replaces
    # it with the overall-match link when one is listed.
    match_url = performance_url.replace('performance/', '')

    if len(map_links) == 0:
        print('Single map match')
        player_map_performance_get(performance)
    else:
        print('Multi-map match')
        perf_mapstat = [
            {
                'href': link['href'],
                'map': link.find('div', attrs={'class': 'dynamic-map-name-full'}).text,
            }
            for link in map_links
        ]
        # Show every discovered map link (the original printed only the last one).
        print(perf_mapstat)

        # The page for performance_url is already loaded: reuse it and save one visit.
        for p_st in perf_mapstat:
            if p_st["href"] != performance_url:
                continue
            print()
            print(f'map = {p_st["map"]}')
            player_map_performance_get(performance)

        # visit other pages
        for p_st in perf_mapstat:
            if p_st["href"] == performance_url:
                continue
            map_performance = _load_match_performance(active_browser, p_st["href"])
            print()
            print(f'map = {p_st["map"]}:')
            if map_performance is None:
                print(f'No performance section found at {p_st["href"]}')
                continue
            player_map_performance_get(map_performance)

        # A link without 'mapstatsid' is taken as the overall-match page; if
        # none exists the default above is kept instead of visiting the site root.
        for p_st in perf_mapstat:
            if 'mapstatsid' not in p_st['href']:
                match_url = p_st['href'].replace('performance/', '')

    ## match player stat.
    print(match_url)
    active_browser.visit(hltv_url + match_url)
    active_browser.is_element_present_by_css('table.stats-table', wait_time=3)
    # Parse the HTML
    stats_tables = BeautifulSoup(active_browser.html, 'html.parser') \
        .find_all("table", attrs={"class": "stats-table"})
    team_st = [team_stat(stat_table) for stat_table in stats_tables]
    print(team_st)

In [12]:
# Scrape every record in match_urls, reusing the single browser session.
for match_info in match_urls:
    match_scrape(browser, match_info)

Single map match
team1 players: Sneaky, SPAMMER, KmZ, Bwills, FaNg, 
team2 player = PwnAlone, scores = ['2:0', '4:0', '2:2', '0:0', '3:0']
team2 player = ben1337, scores = ['0:0', '0:0', '0:3', '0:0', '0:0']
team2 player = Rampage, scores = ['0:0', '0:0', '1:3', '0:0', '0:0']
team2 player = djay, scores = ['0:0', '0:0', '0:3', '0:0', '0:0']
team2 player = BOOBIE, scores = ['0:0', '0:0', '0:1', '0:0', '0:0']
{'name': "Justin 'FaNg' Coakley", 'player_id': 17372, 'KPR': '2.07', 'DPR': '1.16', 'KAST': '1.09', 'Impact': '2.01', 'ADR': '1.98', 'Rating 2.0': '1.66'}
{'name': "Brendan 'Bwills' Williams", 'player_id': 0, 'KPR': '1.20', 'DPR': '1.18', 'KAST': '1.75', 'Impact': '1.06', 'ADR': '1.19', 'Rating 2.0': '1.28'}
{'name': "David 'Sneaky' Polster", 'player_id': 16546, 'KPR': '1.18', 'DPR': '1.39', 'KAST': '0.95', 'Impact': '0.91', 'ADR': '1.14', 'Rating 2.0': '1.11'}
{'name': "Bilal 'SPAMMER' Ali", 'player_id': 0, 'KPR': '0.82', 'DPR': '1.08', 'KAST': '1.27', 'Impact': '1.16', 'ADR': '1.0


map = Dust2:
team1 players: Patsi, FinigaN, S0tF1k, degster, Dima, 
team2 player = Topa, scores = ['0:0', '0:0', '0:0', '0:0', '0:0']
team2 player = KAPACHO, scores = ['0:0', '0:0', '0:0', '0:2', '0:0']
team2 player = Gospadarov, scores = ['3:0', '1:0', '4:1', '3:1', '2:0']
team2 player = B1T, scores = ['0:0', '0:0', '0:0', '1:1', '0:0']
team2 player = Aunkere, scores = ['0:0', '0:0', '1:0', '1:2', '0:0']
{'name': "Abdul 'degster' Gasanov", 'player_id': 0, 'KPR': '1.32', 'DPR': '0.98', 'KAST': '1.24', 'Impact': '1.74', 'ADR': '1.82', 'Rating 2.0': '1.42'}
{'name': "Dmitriy 'Dima' Bandurka", 'player_id': 0, 'KPR': '1.29', 'DPR': '1.27', 'KAST': '1.04', 'Impact': '1.54', 'ADR': '1.08', 'Rating 2.0': '1.24'}
{'name': "Robert 'Patsi' Isyanov", 'player_id': 0, 'KPR': '1.33', 'DPR': '1.06', 'KAST': '1.23', 'Impact': '1.48', 'ADR': '1.07', 'Rating 2.0': '1.24'}
{'name': "Vladislav 'FinigaN' Usov", 'player_id': 0, 'KPR': '0.74', 'DPR': '1.44', 'KAST': '1.07', 'Impact': '0.95', 'ADR': '0.73', 

In [13]:
# Close the browser session now that scraping is finished.
browser.quit()