In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
import os
import pandas as pd
import json
import time
import re
from datetime import datetime

In [2]:
# return a filename (with path) such that it is accessible. 
def get_fullname(filename):
    # as long as it is accessible.
    if os.path.isfile(filename):
        return filename

    # search current working directory.
    cur_working_dir = os.getcwd()
    for i in range(50):
        for (path, dir, files) in os.walk(cur_working_dir):
            if filename in files:
                return os.path.join(path, filename)

        # check parent directory
        parent = os.path.dirname(cur_working_dir)
        if cur_working_dir == parent:
            break;
        cur_working_dir = parent

    # did not found, simply return.
    print(f"file {filename} not found !!!")
    return filename


# Initiate headless driver for deployment
browser = Browser("chrome", executable_path=get_fullname("chromedriver.exe"), headless=True)
#browser = Browser('chrome', executable_path=get_fullname("chromedriver.exe"))
time.sleep(0.1)

In [3]:
# base url
hltv_url = 'https://www.hltv.org'

In [4]:
key_perf_table = 'perf_table'
key_AWP_kills = 'AWP_kills'
## top level
key_timestamp = 'ts'
key_map_stats = 'map_stats'
key_teams = 'teams'
key_player_st = 'match_player_st'

## visist '/stats/matches'

In [5]:
# href="/stats/teams/4471/bemyfRAG"
# href="/stats/players/15090/PwnAlone"
def get_team_id(a):
    sl = a['href'].split('/')
    return sl[3]
get_player_id = get_team_id

def get_team_name(a):
    sl = a['href'].split('/')
    return sl[4]
get_player_name = get_team_name

#### only timestamp, map (if single map) and href are used later.

In [6]:
# Visit the site to Scrape
next_url = '/stats/matches'
match_urls = []
for cnt in range(100):
    # Visit the site to Scrape
    browser.visit(hltv_url + next_url)
    browser.is_element_present_by_tag('div.contentCol', wait_time=3)
    content = BeautifulSoup(browser.html, 'html.parser').find('div', attrs={"class": "contentCol"})

    # all matches in this page
    trs = content.tbody.find_all('tr')
    
    for tr in trs:
        match_info= {} 
        
        # data-unix
        td = tr.find('td', attrs={'class': 'date-col'})
        match_info['href'] = td.a['href']
        t = int(td.div['data-unix'])/1000
        match_info['ts'] = t
        t_str = datetime.utcfromtimestamp(t).strftime('%Y-%m-%d')
        # teams
        tds = tr.find_all('td', attrs={'class': 'team-col'})
        # team1
        match_info['team1'] = tds[0].a.text
        strs = re.findall("\d+", tds[0].span.text)
        match_info['score1'] = strs[0]
        # team2
        match_info['team2'] = tds[1].a.text
        strs = re.findall("\d+", tds[1].span.text)
        match_info['score2'] = strs[0]
        # map
        match_info['map'] = tr.find('div', attrs={'class': 'dynamic-map-name-full'}).text
        match_urls.append(match_info)        

    # ========================================================================
    # next page
    # ========================================================================
    # next page
    a = content.find('a', attrs={"class": "pagination-next"}, href=True)
    if not a:
        print(f'Done {len(match_urls)} matches scraping.')
        break
    next_url = a['href']
    
    # only during development
    if cnt == 10:
        print(f'Truncate the match lists !!!!!')
        print(f'Change next_url from: {next_url}')        
        next_url = '/stats/matches?offset=79700'
        print(f'                  to: {next_url}')   

Truncate the match lists !!!!!
Change next_url from: /stats/matches?offset=550
                  to: /stats/matches?offset=79700
Done 1058 matches scraping.


In [7]:
match_urls[:3]

[{'href': '/stats/matches/mapstatsid/104462/pact-vs-movistar-riders',
  'ts': 1592492400.0,
  'team1': 'PACT',
  'score1': '16',
  'team2': 'Movistar Riders',
  'score2': '12',
  'map': 'Nuke'},
 {'href': '/stats/matches/mapstatsid/104461/forze-vs-gambit-youngsters',
  'ts': 1592492400.0,
  'team1': 'forZe',
  'score1': '8',
  'team2': 'Gambit Youngsters',
  'score2': '16',
  'map': 'Overpass'},
 {'href': '/stats/matches/mapstatsid/104460/ago-vs-mens',
  'ts': 1592492400.0,
  'team1': 'AGO',
  'score1': '16',
  'team2': 'mens',
  'score2': '2',
  'map': 'Nuke'}]

In [8]:
match_urls = match_urls[:100]

#### player stat.

In [9]:
def percentage_get_(str):
    # 'nn.nn%'
    nums = re.findall('(\d*\.\d*\%)', str)
    if (nums):
        return float(nums[0].strip('%'))/100
    # 'nn.nn'
    nums = re.findall('(\d*\.\d*)', str)
    if (nums):
        return float(nums[0])
    # 'nn'
    nums = re.findall('(\d+)', str)    
    if (nums):
        return float(nums[0])
    return 0

# kratio
def kdratio_get(str):
    return round(percentage_get_(str), 4)

# assists
def assists_get(str):
    nums = re.findall("\d+", str)
    if len(nums) == 0:
        return '0', '0'
    elif len(nums) == 1:
        return nums[0], '0'
    return nums[0], nums[1]
# kills
def kills_get(str):
    return assists_get(str)

# mostly for 'adr'
def num_get(str):
    nums = re.findall("\d+", str)
    if len(nums):
        return nums[0]
    return '0'

In [10]:
# parse player stat.
def player_stat(player_tr):
#    items = ['player', 'kills', 'assists', 'deaths', 'kdratio', 'kddiff', 'adr', 'fkdiff', 'adr']
    player_st = {}
    # player
    st_player = player_tr.find('td', attrs={"class": "st-player"})
    player_id = get_player_id(st_player.a)
    player_st['player_id'] = player_id
    player_st['player'] = get_player_name(st_player.a) # st_player.text
    # parse 'kills'
    str = player_tr.find('td', attrs={"class": "st-kills"}).text
    player_st['kills'], player_st['hs'] = kills_get(str)
    # parse 'assists'
    str = player_tr.find('td', attrs={"class": "st-assists"}).text
    player_st['assists'], player_st['flash_assists'] = assists_get(str)
        
    # parse 'kdratio', remove '%' to float.
    player_st['kdratio'] = kdratio_get(str)
    # 'adr'
    item = 'adr'
    str = player_tr.find('td', attrs={"class": "st-" + item}).text
    player_st[item] = num_get(str)
    # parse all others.
    items = ['deaths', 'kddiff', 'fkdiff', 'rating']
    for item in items:
        player_st[item] = player_tr.find('td', attrs={"class": "st-" + item}).text

    return {player_id : player_st}

def team_stat_scrape(st_table):
    # 
    teamname = st_table.find("th", attrs={"class": "st-teamname"}).text

    # team member st.
    player_stat_table = {}
    for tr in st_table.find("tbody").find_all("tr"):
        player_stat_table.update(player_stat(tr))

    # build team/teammate map
    teammates = {}
    for id_ in player_stat_table:
        # insert 'teamname'
        player_stat_table[id_]['teamname'] = teamname
        player = {id_: player_stat_table[id_]['player']}
        teammates.update(player)
    
    # {teamname: {{id: name}, .., {id: name}}
    team = {teamname : teammates}
    return team, player_stat_table

def stats_table_scrape(stats_table):
    ### team
    teams = []
    player_stat_table = {}
    # print(stats_table)
    for stat_table in stats_table:
        team, player_st = team_stat_scrape(stat_table)
        teams.append(team)
        player_stat_table.update(player_st)

    #display(pd.read_json(json.dumps(teams[0]), orient='index'))
    #display(pd.read_json(json.dumps(teams[1]), orient='index'))
    #display(pd.read_json(json.dumps(player_stat_table), orient='index'))
    return teams, player_stat_table

def map_match_scrape(contentCol):
    mapstat = {}
    # mapstatid
    stats_table = contentCol.find_all('table', attrs={'class': 'stats-table'})
    teams, player_st = stats_table_scrape(stats_table)
    mapstat['team'] = teams
    mapstat['stat'] = player_st
    return mapstat

### entrance of match scrape

In [11]:
  
def match_scrape(brower, match_info):
    match_result = {}
    # hold all output scrape results.
    match_result.update(match_info)
    print(match_result)

    # match performace main page
    browser.visit(hltv_url + match_info['href'])
    browser.is_element_present_by_css("div.contentCol", wait_time=3)
    time.sleep(0.1)
    contentCol = BeautifulSoup(browser.html, 'html.parser')       \
                 .find('div', attrs={'class': 'contentCol'})

    # 'match_id' from 'match-page-link'
    match_page_link = contentCol.find('a', attrs={'class': 'match-page-link'})
    match_result['match_id'] = re.findall('/(\d+)/', match_page_link['href'])[0]
        
    # get all map's 'href'
    stats_match_maps = contentCol.find('div', attrs={'class': 'stats-match-maps'})
    as_ = []
    if stats_match_maps:
        as_ = stats_match_maps.find_all('a', href=True)

    perf_mapstat = []   
    if len(as_) == 0:
        print(f'single map')
    
        mapstat = {}
        mapstat['map'] = match_info['map']
        left = contentCol.find('div', attrs={'class': 'team-left'})
        right = contentCol.find('div', attrs={'class': 'team-right'})
        mapstat['total_rounds'] = int(left.div.text) + int(right.div.text)
        if (int(left.div.text) >= int(right.div.text)):
            winner = left
        else:
            winner = right
        mapstat['winner'] = get_team_name(winner.a)
        mapstat['winner_id'] = get_team_id(winner.a)
        mapstat['win_rounds'] = winner.div.text

        # mapstatid
        mapstat['mapstatid'] = re.findall("/mapstatsid/(\d+)/", match_info['href'])[0]
        # scrape all others
        mapstat.update(map_match_scrape(contentCol))
        perf_mapstat.append(mapstat)
    else:
        print(f'multi-map match')
        # get all map's 'href', 'score' and 'map'
        match_map_results = []
        for a in as_:
            map_res = {}
            map_res['href'] = a['href']
            scores = a.find('div', attrs={'class': 'stats-match-map-result-score'}).text
            nums = re.findall('\d+', scores)
            map_res['total_rounds'] = int(nums[0]) + int(nums[1])
            if (int(nums[0]) >= int(nums[1])):
                map_res['win_rounds'] = nums[0]
            else:
                map_res['win_rounds'] = nums[1]
            map_res['map'] = a.find('div', attrs={'class': 'dynamic-map-name-full'}).text

            winner = a.find('div', attrs={'class': 'stats-match-map-winner-logo-con'})
            # timing issue here !!!!
            #print(f'winner {winner}')
            map_res['winner_id'] = re.findall("\d+", winner.img['src'])[0]
            map_res['winner'] = winner.img['title']
            match_map_results.append(map_res)

        # scrape current webpage, save one visit
        for map_res in match_map_results:
            if map_res['href'] != match_info['href']:
                continue
            print(f'current page {map_res["href"]}')    
            mapstat = {}
            mapstat.update(map_res)
            # mapstatid
            mapstat['mapstatid'] = re.findall("/mapstatsid/(\d+)/", map_res['href'])[0]
            # scrape all others
            mapstat.update(map_match_scrape(contentCol))
            perf_mapstat.append(mapstat)

        # scrape all other webpages.
        for map_res in match_map_results:
            if map_res['href'] == match_info['href']:
                continue
            if 'mapstatsid' not in map_res['href']:
                continue

            # match performace main page
            browser.visit(hltv_url + map_res['href'])
            browser.is_element_present_by_css("div.contentCol", wait_time=3)
            contentCol = BeautifulSoup(browser.html, 'html.parser')       \
                             .find('div', attrs={'class': 'contentCol'})                

            print(f"'visit {map_res['href']}")            
            mapstat = {}
            mapstat.update(map_res)
            # mapstatid
            mapstat['mapstatid'] = re.findall("/mapstatsid/(\d+)/", map_res['href'])[0]
            # scrape all others
            mapstat.update(map_match_scrape(contentCol))
            perf_mapstat.append(mapstat)

    match_result['mapstat'] = perf_mapstat
    return match_result

## convert scrape result to pd

In [12]:
def df_padding(df, column_name, pad):
    df[column_name] = [pad] * df.shape[0]
    return df

def match_stat_df_get(match_res):
    # define the order.
    column_names = ['player_id','player','match_id','date',
                'kills', 'hs', 'assists', 'flash_assists', 'kdratio', 'deaths', 'kddiff', 'adr', 'fkdiff']
    player_match_df = pd.DataFrame(columns=column_names)
    
    for mapstat in match_res['mapstat']:
        player_map_df = pd.read_json(json.dumps(mapstat['stat']), orient='index')
        # padding
        player_map_df = df_padding(player_map_df, 'match_id', match_res['match_id'])
        player_map_df = df_padding(player_map_df, 'winner', mapstat['winner'])
        player_map_df = df_padding(player_map_df, 'winner_id', mapstat['winner_id'])
        
        player_map_df = df_padding(player_map_df, 'total_rounds', mapstat['total_rounds'])
        player_map_df = df_padding(player_map_df, 'mapstatid', mapstat['mapstatid'])
        
        player_map_df = df_padding(player_map_df, 'win_rounds', mapstat['win_rounds'])
        player_map_df = df_padding(player_map_df, 'map', mapstat['map'])
        
        t_str = datetime.utcfromtimestamp(match_res[key_timestamp]).strftime('%Y-%m-%d')
        player_map_df = df_padding(player_map_df, 'date', t_str)

        player_match_df = pd.concat([player_match_df, player_map_df])
    return player_match_df

## main routine

In [13]:
output_df_column_names = [
     'player_id',
     'player',
     'match_id',
     'date',
     'kills',
     'hs',
     'assists',
     'flash_assists',
     'kdratio',
     'deaths',
     'kddiff',
     'adr',
     'fkdiff',
     'rating',
     'teamname',
     'winner',
     'winner_id',
     'total_rounds',
     'mapstatid',
     'win_rounds',
     'map']

In [14]:
start_time = datetime.now()
print("Start Time =", start_time.strftime("%H:%M:%S"))

Start Time = 09:30:54


In [15]:
# create an empty file
filename = 'player_mapstat.csv'
df = pd.DataFrame(columns=output_df_column_names)
df.to_csv(filename, index=False)
# append the file.
for match_result in match_urls:
    res = match_scrape(browser, match_result)
    df = match_stat_df_get(res)
    df[output_df_column_names].to_csv(filename, mode='a', header=False)

{'href': '/stats/matches/mapstatsid/104462/pact-vs-movistar-riders', 'ts': 1592492400.0, 'team1': 'PACT', 'score1': '16', 'team2': 'Movistar Riders', 'score2': '12', 'map': 'Nuke'}
single map
{'href': '/stats/matches/mapstatsid/104461/forze-vs-gambit-youngsters', 'ts': 1592492400.0, 'team1': 'forZe', 'score1': '8', 'team2': 'Gambit Youngsters', 'score2': '16', 'map': 'Overpass'}
single map
{'href': '/stats/matches/mapstatsid/104460/ago-vs-mens', 'ts': 1592492400.0, 'team1': 'AGO', 'score1': '16', 'team2': 'mens', 'score2': '2', 'map': 'Nuke'}
single map
{'href': '/stats/matches/mapstatsid/104463/natus-vincere-vs-complexity', 'ts': 1592490600.0, 'team1': 'Natus Vincere', 'score1': '8', 'team2': 'Complexity', 'score2': '16', 'map': 'Dust2'}
single map
{'href': '/stats/matches/mapstatsid/104458/natus-vincere-vs-complexity', 'ts': 1592490600.0, 'team1': 'Natus Vincere', 'score1': '16', 'team2': 'Complexity', 'score2': '5', 'map': 'Nuke'}
single map
{'href': '/stats/matches/mapstatsid/10445

single map
{'href': '/stats/matches/mapstatsid/104428/buffdaddys-paypal-vs-levitate', 'ts': 1592438400.0, 'team1': "Buffdaddy's Paypal", 'score1': '1', 'team2': 'Levitate', 'score2': '16', 'map': 'Dust2'}
single map
{'href': '/stats/matches/mapstatsid/104427/evil-geniuses-vs-liquid', 'ts': 1592429400.0, 'team1': 'Evil Geniuses', 'score1': '16', 'team2': 'Liquid', 'score2': '6', 'map': 'Inferno'}
multi-map match
current page /stats/matches/mapstatsid/104427/evil-geniuses-vs-liquid
'visit /stats/matches/mapstatsid/104425/liquid-vs-evil-geniuses
'visit /stats/matches/mapstatsid/104426/evil-geniuses-vs-liquid
{'href': '/stats/matches/mapstatsid/104426/evil-geniuses-vs-liquid', 'ts': 1592429400.0, 'team1': 'Evil Geniuses', 'score1': '16', 'team2': 'Liquid', 'score2': '8', 'map': 'Dust2'}
multi-map match
current page /stats/matches/mapstatsid/104426/evil-geniuses-vs-liquid
'visit /stats/matches/mapstatsid/104425/liquid-vs-evil-geniuses
'visit /stats/matches/mapstatsid/104427/evil-geniuses-vs

'visit /stats/matches/mapstatsid/104394/hellraisers-vs-nemiga
{'href': '/stats/matches/mapstatsid/104397/spirit-vs-virtuspro', 'ts': 1592395200.0, 'team1': 'Spirit', 'score1': '22', 'team2': 'Virtus.pro', 'score2': '19', 'map': 'Train'}
multi-map match
current page /stats/matches/mapstatsid/104397/spirit-vs-virtuspro
'visit /stats/matches/mapstatsid/104390/spirit-vs-virtuspro
'visit /stats/matches/mapstatsid/104393/virtuspro-vs-spirit
{'href': '/stats/matches/mapstatsid/104393/virtuspro-vs-spirit', 'ts': 1592395200.0, 'team1': 'Virtus.pro', 'score1': '16', 'team2': 'Spirit', 'score2': '5', 'map': 'Overpass'}
multi-map match
current page /stats/matches/mapstatsid/104393/virtuspro-vs-spirit
'visit /stats/matches/mapstatsid/104390/spirit-vs-virtuspro
'visit /stats/matches/mapstatsid/104397/spirit-vs-virtuspro
{'href': '/stats/matches/mapstatsid/104390/spirit-vs-virtuspro', 'ts': 1592395200.0, 'team1': 'Spirit', 'score1': '16', 'team2': 'Virtus.pro', 'score2': '14', 'map': 'Dust2'}
multi-m

multi-map match
current page /stats/matches/mapstatsid/104355/cr4zy-vs-syman
'visit /stats/matches/mapstatsid/104360/syman-vs-cr4zy
'visit /stats/matches/mapstatsid/104363/syman-vs-cr4zy
{'href': '/stats/matches/mapstatsid/104362/lyngby-vikings-vs-ambush', 'ts': 1592332200.0, 'team1': 'Lyngby Vikings', 'score1': '8', 'team2': 'Ambush', 'score2': '16', 'map': 'Dust2'}
multi-map match
current page /stats/matches/mapstatsid/104362/lyngby-vikings-vs-ambush
'visit /stats/matches/mapstatsid/104356/lyngby-vikings-vs-ambush
'visit /stats/matches/mapstatsid/104359/ambush-vs-lyngby-vikings
{'href': '/stats/matches/mapstatsid/104359/ambush-vs-lyngby-vikings', 'ts': 1592332200.0, 'team1': 'Ambush', 'score1': '8', 'team2': 'Lyngby Vikings', 'score2': '16', 'map': 'Inferno'}
multi-map match
current page /stats/matches/mapstatsid/104359/ambush-vs-lyngby-vikings
'visit /stats/matches/mapstatsid/104356/lyngby-vikings-vs-ambush
'visit /stats/matches/mapstatsid/104362/lyngby-vikings-vs-ambush
{'href': '/

In [16]:
now = datetime.now()
print("End Time =", now.strftime("%H:%M:%S"))
sec = (now-start_time).total_seconds()
print(f'Total time {sec}')

End Time = 09:35:50
Total time 296.522404


In [17]:
# show the data
ddf = pd.read_csv(filename)
ddf

Unnamed: 0,player_id,player,match_id,date,kills,hs,assists,flash_assists,kdratio,deaths,...,adr,fkdiff,rating,teamname,winner,winner_id,total_rounds,mapstatid,win_rounds,map
15428,15428,Goofy,2342272,2020-06-18,29,7,5,0,5,16,...,113,1,1.57,PACT,PACT,8248,28.0,104462,16,Nuke
15940,15940,Sobol,2342272,2020-06-18,21,13,3,0,3,17,...,69,-1,1.01,PACT,PACT,8248,28.0,104462,16,Nuke
2826,2826,MINISE,2342272,2020-06-18,15,6,5,1,5,15,...,60,2,1.01,PACT,PACT,8248,28.0,104462,16,Nuke
11287,11287,darko,2342272,2020-06-18,14,5,7,2,7,14,...,51,-2,0.95,PACT,PACT,8248,28.0,104462,16,Nuke
9798,9798,lunAtic,2342272,2020-06-18,14,11,5,0,5,23,...,69,2,0.85,PACT,PACT,8248,28.0,104462,16,Nuke
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8324,8324,morelz,2342216,2020-06-16,21,9,2,1,2,21,...,70,-2,1.01,mens,Singularity,6978,25.0,104344,16,Train
8416,8416,ayken,2342216,2020-06-16,13,7,4,0,4,20,...,70,3,0.90,mens,Singularity,6978,25.0,104344,16,Train
5480,5480,jOELZ,2342216,2020-06-16,10,4,4,0,4,21,...,55,-1,0.70,mens,Singularity,6978,25.0,104344,16,Train
11938,11938,crank,2342216,2020-06-16,12,7,3,0,3,22,...,58,-1,0.65,mens,Singularity,6978,25.0,104344,16,Train


In [18]:
ddf.dtypes

player_id          int64
player            object
match_id           int64
date              object
kills              int64
hs                 int64
assists            int64
flash_assists      int64
kdratio            int64
deaths             int64
kddiff             int64
adr                int64
fkdiff             int64
rating           float64
teamname          object
winner            object
winner_id          int64
total_rounds     float64
mapstatid          int64
win_rounds         int64
map               object
dtype: object

## debug

### result decode