In [44]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json
import time
import random

### Get Data From Single Page

In [28]:
# Create headers to emulate browsing and send GET request to URL
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# requests applies no timeout by default, so an unresponsive server would hang this cell forever
cod_site_data = requests.get('https://callofdutyleague.com/en-us/match/3638', headers=headers, timeout=30)

# Stop here on a 4xx/5xx response instead of failing later while parsing an error page
cod_site_data.raise_for_status()

### Explore Content

After looking through the returned content, I found that the site is generated from JSON. When using the browser's inspector (e.g. Chrome DevTools), the HTML structure is clear, but this is not what is returned from the request.

The data is converted into a string and split in a manner that isolates the JSON. Python's `json` package is then used to convert the string to JSON, so the data can be accessed through its key-value structure.

In [29]:
# Stores content as bytes-like object
cod_site_content = cod_site_data.content

# Convert to BeautifulSoup object for parsing then convert to string.
# The parser is named explicitly: without it BeautifulSoup picks whichever parser happens
# to be installed (and warns about it), so results could differ between machines.
soup = BeautifulSoup(cod_site_content, 'html.parser')
string_soup = str(soup)

# Split the data at intersections of JSON object
string_soup_list = string_soup.split('type="application/json">')
if len(string_soup_list) < 2:
    # Give a readable error instead of a bare IndexError when the marker is absent
    raise ValueError('No application/json script tag found in the returned page')
string_soup_list_1 = string_soup_list[1].split('</script>')

# Load string data into JSON
content_dict = json.loads(string_soup_list_1[0])

### Parse Data from Content

Now that the JSON structure is determined, it is possible to extract the data through indexing.

In [30]:
match_id = 3638

# Locate wanted data: walk down to the match-level payload once and reuse it below
match_extended = content_dict['props']['pageProps']['blocks'][1]['cdlMatchDetail']['matchData']['matchExtended']
match_result = match_extended['result']

# Get Home and Away Team info
home_team = match_extended['homeTeamCard']['name']
home_team_id = match_extended['homeTeamCard']['id']
away_team = match_extended['awayTeamCard']['name']
away_team_id = match_extended['awayTeamCard']['id']

# Get total matches won by each team and final result
home_team_wins = match_result['homeTeamGamesWon']
away_team_wins = match_result['awayTeamGamesWon']
winning_team_id = match_result['winnerTeamId']
loser_team_id = match_result['loserTeamId']

In [31]:
# Locate the individual games within each match
match_games = content_dict['props']['pageProps']['blocks'][1]['cdlMatchDetail']['matchData']['matchGamesExtended']

# Get game-specific data (map, score, mode, etc) and store to list for appending to DataFrame
matches = [match_id, home_team, home_team_id, away_team, away_team_id, home_team_wins, away_team_wins, winning_team_id, loser_team_id]

for counter, game in enumerate(match_games, start=1):
    print(counter)
    match_no = game['matchGame']['number']
    match_map = game['matchGame']['map']
    mode = game['matchGame']['mode']
    locale = game['matchGame']['gameMap']['locale']
    try:
        home_score = game['matchGameResult']['hostGameScore']
        away_score = game['matchGameResult']['guestGameScore']
        winning_team = game['matchGameResult']['winnerTeamId']
        losing_team = game['matchGameResult']['loserTeamId']
    except (KeyError, TypeError):
        # A game without a usable result (key missing, or the result is not a dict) is
        # recorded as all zeros. Presumably these are games left unplayed once the series
        # was decided -- TODO confirm. Any other error is no longer silently swallowed.
        home_score = 0
        away_score = 0
        winning_team = 0
        losing_team = 0
    matches.extend([match_no, match_map, mode, locale, home_score, away_score, winning_team, losing_team])

1
2
3
4
5


### Save Content

In [33]:
# Column layout: nine match-level fields, then the same eight fields for each of games 1-5
match_columns = ['match_id', 'home_team', 'home_team_id', 'away_team', 'away_team_id', 'home_team_wins', 'away_team_wins', 'winning_team_id', 'losing_team_id']
game_fields = ['id', 'map', 'gametype', 'lang', 'home_score', 'away_score', 'winning_team', 'losing_team']
game_columns = [f'game_{game_no}_{field}' for game_no in range(1, 6) for field in game_fields]

# Convert to a one-row DataFrame straight from the list. Going through np.array first
# would coerce the numeric values in this mixed int/str list to strings, and the
# hard-coded reshape(1, 49) duplicated the column count by hand.
matches_df = pd.DataFrame([matches], columns=match_columns + game_columns)

# Store to CSV for later use
matches_df.to_csv('data/2020_reg_season_matches.csv', index=False)

### Get All Match IDs to Programmatically Get Pages

This is essentially the same process as above, but applied to the regular-season schedule page in order to collect the ID of every match in each series, so that each match page can then be requested programmatically.

In [9]:
# Request the schedule page. requests applies no timeout by default, so set one to keep an
# unresponsive server from hanging the cell, and fail fast on a 4xx/5xx response.
code_sites = requests.get('https://callofdutyleague.com/en-us/schedule?utm_source=cdlweb&utm_medium=navigationbar&utm_campaign=general', headers=headers, timeout=30)
code_sites.raise_for_status()

# Raw response body as bytes
all_matches = code_sites.content

In [10]:
# Parse with an explicitly named parser so the result does not depend on which optional
# parser libraries happen to be installed
match_soup = BeautifulSoup(all_matches, 'html.parser')

match_soup_string = str(match_soup)

# Isolate the JSON embedded in the page's application/json script tag
match_soup_list = match_soup_string.split('type="application/json">')
if len(match_soup_list) < 2:
    # Give a readable error instead of a bare IndexError when the marker is absent
    raise ValueError('No application/json script tag found in the schedule page')
match_soup_list_1 = match_soup_list[1].split('</script>')

# Drill down to the list of tabs that the next cell iterates over as one "series" each
reg_season = json.loads(match_soup_list_1[0])['props']['pageProps']['blocks'][2]['tabs']['tabs'][0]['blocks'][0]['tabs']['tabs']

In [11]:
# Gather the id of every finished match, one series tab at a time
match_ids = []

for series in reg_season:
    series_cards = series['blocks'][2]['tabs']['tabs'][0]['blocks'][0]['cdlMatchCards']
    match_ids.extend(card['match']['id'] for card in series_cards['finalMatches'])
    # Print the series title as a progress marker
    print(series['title'])

In [12]:
# Single-column frame of the collected match ids, written to CSV without the index
ids_df = pd.DataFrame({'ids': match_ids})
ids_df.to_csv('data/2020_reg_season_match_ids.csv', index=False)

In [34]:
# Reload the match ids from the CSV written above (returns a DataFrame with an 'ids' column)
ids = pd.read_csv('data/2020_reg_season_match_ids.csv')

In [35]:
# Flatten the 'ids' column into a plain Python list.
# NOTE: this rebinds `ids` from a DataFrame to a list, so the cell cannot be re-run on its own.
ids = ids['ids'].tolist()

In [40]:
# Inspect the loaded match ids (displays the entire list)
ids

[1979,
 1962,
 1982,
 1936,
 1969,
 1960,
 1932,
 1970,
 1942,
 1934,
 1948,
 1965,
 2804,
 2802,
 2803,
 2805,
 2846,
 2858,
 2847,
 2859,
 2881,
 2883,
 2887,
 2886,
 2888,
 2891,
 2889,
 2892,
 2890,
 2983,
 2984,
 3006,
 3007,
 3030,
 3041,
 3050,
 3051,
 3054,
 3266,
 3261,
 3267,
 3262,
 3268,
 3269,
 3263,
 3264,
 3270,
 3265,
 3271,
 3272,
 3273,
 3279,
 3274,
 3280,
 3275,
 3281,
 3276,
 3282,
 3277,
 3283,
 3278,
 3284,
 3285,
 3286,
 3555,
 3560,
 3561,
 3556,
 3562,
 3557,
 3563,
 3558,
 3564,
 3559,
 3565,
 3566,
 3567,
 3573,
 3574,
 3568,
 3569,
 3575,
 3570,
 3576,
 3571,
 3577,
 3572,
 3578,
 3579,
 3580,
 3586,
 3581,
 3587,
 3582,
 3588,
 3583,
 3589,
 3584,
 3590,
 3585,
 3591,
 3592,
 3593,
 3599,
 3594,
 3600,
 3595,
 3601,
 3596,
 3602,
 3597,
 3603,
 3598,
 3604,
 3605,
 3606,
 3607,
 3608,
 3612,
 3613,
 3614,
 3609,
 3615,
 3610,
 3616,
 3611,
 3617,
 3618,
 3619,
 3620,
 3625,
 3621,
 3626,
 3622,
 3627,
 3623,
 3628,
 3624,
 3629,
 3630,
 3631,
 3632,
 3651,

### Turn it all into Functions

In [49]:
def get_page_data(_id):
    """
    Trigger request to get url with appended match id and return the JSON
    embedded in the page's application/json script tag.
    
    INPUT: Integer id of match.
    
    OUTPUT: Parsed JSON (result of json.loads) of page data.
    
    RAISES: requests.exceptions.HTTPError on a 4xx/5xx response,
    requests.exceptions.Timeout if the server does not answer in time,
    ValueError if the page holds no embedded JSON or it cannot be decoded.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
               "Accept-Encoding":"gzip, deflate",
               "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
               "DNT":"1","Connection":"close",
               "Upgrade-Insecure-Requests":"1"}
    
    # requests applies no timeout by default; without one a stalled server blocks the whole crawl
    data = requests.get(f'https://callofdutyleague.com/en-us/match/{_id}', headers=headers, timeout=30)
    # Fail on error responses here rather than while parsing an error page further down
    data.raise_for_status()

    # Stores content as bytes-like object
    content = data.content

    # Convert to BeautifulSoup object for parsing then convert to string.
    # The parser is named explicitly so the result does not depend on which
    # optional parser libraries are installed.
    soup = BeautifulSoup(content, 'html.parser')
    string_soup = str(soup)

    # Split the data at intersections of JSON object
    string_soup_list = string_soup.split('type="application/json">')
    if len(string_soup_list) < 2:
        # Readable error instead of a bare IndexError when the marker is absent
        raise ValueError(f'No application/json script tag found for match {_id}')
    string_soup_list_1 = string_soup_list[1].split('</script>')

    # Load string data into JSON (json.JSONDecodeError is a ValueError subclass)
    content_dict = json.loads(string_soup_list_1[0])

    return content_dict

def parse_page_data(_id, data, verbose=True):
    """
    Parse through page data to collect desired data points.
    
    INPUT: Integer id of match, JSON object of page data, and an optional
    verbose flag (default True) that prints the running game number for
    each game parsed; pass False to silence it.
    
    OUTPUT: List of data points from page: nine match-level values
    (match id, home/away team name and id, games won by each side,
    winner and loser team id) followed by eight values per game
    (number, map, mode, locale, home score, away score, winner, loser).
    """
    
    match_id = _id
    
    # Locate wanted data: resolve the shared path once, then branch into its two parts
    match_data = data['props']['pageProps']['blocks'][1]['cdlMatchDetail']['matchData']
    content_dict = match_data['matchExtended']

    # Get Home and Away Team info
    home_team = content_dict['homeTeamCard']['name']
    home_team_id = content_dict['homeTeamCard']['id']
    away_team = content_dict['awayTeamCard']['name']
    away_team_id = content_dict['awayTeamCard']['id']

    # Get total matches won by each team and final result
    home_team_wins = content_dict['result']['homeTeamGamesWon']
    away_team_wins = content_dict['result']['awayTeamGamesWon']
    winning_team_id = content_dict['result']['winnerTeamId']
    loser_team_id = content_dict['result']['loserTeamId']

    # Locate the individual games within each match
    match_games = match_data['matchGamesExtended']

    # Get game-specific data (map, score, mode, etc) and store to list for appending to DataFrame
    matches = [match_id, home_team, home_team_id, away_team, away_team_id, home_team_wins, away_team_wins, winning_team_id, loser_team_id]
    
    # Iterate through each game in each match
    for counter, game in enumerate(match_games, start=1):
        if verbose:
            print(counter)
        match_game = game['matchGame']
        match_no = match_game['number']
        match_map = match_game['map']
        mode = match_game['mode']
        locale = match_game['gameMap']['locale']
        try:
            game_result = game['matchGameResult']
            home_score = game_result['hostGameScore']
            away_score = game_result['guestGameScore']
            winning_team = game_result['winnerTeamId']
            losing_team = game_result['loserTeamId']
        except (KeyError, TypeError):
            # No usable result (key missing, or the result is None / not a dict):
            # record the game as all zeros. Only these two lookup failures are
            # handled; anything else now propagates instead of being hidden.
            home_score = 0
            away_score = 0
            winning_team = 0
            losing_team = 0
        matches.extend([match_no, match_map, mode, locale, home_score, away_score, winning_team, losing_team])
    return matches
        

def store_match_data(df, match):
    """
    Append list of data points to Pandas DataFrame of previous match data.
    
    The row is added in place under a new index label. The label is
    len(df) when that label is free (the normal 0..n-1 index); if it is
    already taken -- e.g. the frame was sliced or filtered and kept its old
    labels -- the next label after the current maximum is used instead, so
    an existing row is never overwritten.
    
    INPUT: Pandas DataFrame of previously collected match data,
    list of newly collected match data (one value per column).
    
    OUTPUT: Update Pandas DataFrame of match data (the same object as df).
    """
    next_label = len(df)
    if next_label in df.index:
        # df.loc[existing_label] = ... would replace that row rather than append
        next_label = df.index.max() + 1
    df.loc[next_label] = match
    return df

def collect_all_pages(ids, df):
    """
    Fetch, parse and store the page of every match id in turn.
    
    A random 5-12 second pause is taken between consecutive requests;
    no pause is taken before the first request or after the last one.
    
    INPUT: Iterable of integer match ids, Pandas DataFrame of previously
    collected match data (rows are appended to it in place).
    
    OUTPUT: Pandas DataFrame of match data including the new rows.
    """
    match_data_df = df
    for position, _id in enumerate(ids):
        if position:
            # Wait only between requests, so the final page does not cost an extra delay
            time.sleep(random.randint(5,12))
        page_data = get_page_data(_id)
        match_data = parse_page_data(_id, page_data)
        match_data_df = store_match_data(match_data_df, match_data)
    return match_data_df

In [54]:
# Load the match rows saved so far from CSV.
# NOTE(review): this cell's execution count (54) is higher than that of the cells below it
# (51, 53, 50), i.e. the cells were run out of order -- confirm a top-to-bottom run reproduces the output.
match_df = pd.read_csv('data/2020_reg_season_matches.csv')

In [51]:
# Scrape every match id in `ids` and append one row per match to match_df.
# Slow by design: collect_all_pages pauses a random 5-12 seconds as it works through the ids.
match_df = collect_all_pages(ids, match_df)

1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5
1
2
3
4
5


In [53]:
# Write the collected matches to the same CSV path that was read earlier, without the index
match_df.to_csv('data/2020_reg_season_matches.csv', index=False)

In [50]:
# Pause for a random whole number of seconds between 4 and 10 (randint bounds are inclusive).
# NOTE(review): nothing else happens in this cell -- presumably a leftover trial of the crawl delay; confirm it can be removed.
time.sleep(random.randint(4,10))

In [55]:
# Inspect the collected matches
match_df

Unnamed: 0,match_id,home_team,home_team_id,away_team,away_team_id,home_team_wins,away_team_wins,winning_team_id,losing_team_id,game_1_id,...,game_4_winning_team,game_4_losing_team,game_5_id,game_5_map,game_5_gametype,game_5_lang,game_5_home_score,game_5_away_score,game_5_winning_team,game_5_losing_team
0,3638,Atlanta FaZe,7,Minnesota Røkkr,12,3,0,7,12,0,...,0,0,4,Rammaza,Search and Destroy,en-us,0,0,0,0
1,1979,Dallas Empire,6,Chicago Huntsmen,8,1,3,8,6,0,...,8,6,4,Piccadilly,Search and Destroy,en-us,0,0,0,0
2,1979,Dallas Empire,6,Chicago Huntsmen,8,1,3,8,6,0,...,8,6,4,Piccadilly,Search and Destroy,en-us,0,0,0,0
3,1962,Florida Mutineers,9,Seattle Surge,16,3,2,9,16,0,...,9,16,4,Rammaza,Search and Destroy,en-us,6,0,9,16
4,1982,Minnesota Røkkr,12,Los Angeles Guerrillas,11,3,1,12,11,0,...,12,11,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,3642,Dallas Empire,6,Florida Mutineers,9,3,0,6,9,0,...,0,0,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0
166,3637,Chicago Huntsmen,8,OpTic Gaming Los Angeles,14,1,3,14,8,0,...,14,8,4,Gun Runner,Search and Destroy,en-us,0,0,0,0
167,3643,Dallas Empire,6,Toronto Ultra,17,1,3,17,6,0,...,17,6,4,Piccadilly,Search and Destroy,en-us,0,0,0,0
168,3644,Atlanta FaZe,7,OpTic Gaming Los Angeles,14,3,0,7,14,0,...,0,0,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0


In [59]:
# Drop repeated matches by key instead of slicing off the first two rows by position.
# The positional slice removed two more rows every time the cell was re-run; this form is
# idempotent. keep='last' retains the later copy of each match. Index labels are preserved.
# NOTE(review): in the displayed frame rows 1 and 2 are both match 1979; row 0 (3638) is
# presumably repeated further down as well -- confirm, otherwise that row is now kept.
match_df = match_df.drop_duplicates(subset='match_id', keep='last')

In [60]:
# Inspect the frame after removing the leading rows
match_df

Unnamed: 0,match_id,home_team,home_team_id,away_team,away_team_id,home_team_wins,away_team_wins,winning_team_id,losing_team_id,game_1_id,...,game_4_winning_team,game_4_losing_team,game_5_id,game_5_map,game_5_gametype,game_5_lang,game_5_home_score,game_5_away_score,game_5_winning_team,game_5_losing_team
2,1979,Dallas Empire,6,Chicago Huntsmen,8,1,3,8,6,0,...,8,6,4,Piccadilly,Search and Destroy,en-us,0,0,0,0
3,1962,Florida Mutineers,9,Seattle Surge,16,3,2,9,16,0,...,9,16,4,Rammaza,Search and Destroy,en-us,6,0,9,16
4,1982,Minnesota Røkkr,12,Los Angeles Guerrillas,11,3,1,12,11,0,...,12,11,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0
5,1936,London Royal Ravens,10,New York Subliners,13,3,0,10,13,0,...,0,0,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0
6,1969,Toronto Ultra,17,Seattle Surge,16,3,2,17,16,0,...,17,16,4,Rammaza,Search and Destroy,en-us,6,5,17,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,3642,Dallas Empire,6,Florida Mutineers,9,3,0,6,9,0,...,0,0,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0
166,3637,Chicago Huntsmen,8,OpTic Gaming Los Angeles,14,1,3,14,8,0,...,14,8,4,Gun Runner,Search and Destroy,en-us,0,0,0,0
167,3643,Dallas Empire,6,Toronto Ultra,17,1,3,17,6,0,...,17,6,4,Piccadilly,Search and Destroy,en-us,0,0,0,0
168,3644,Atlanta FaZe,7,OpTic Gaming Los Angeles,14,3,0,7,14,0,...,0,0,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0


In [61]:
# Display the rows ordered by match_id. The sorted copy is not assigned, so match_df keeps its order.
match_df.sort_values('match_id')

Unnamed: 0,match_id,home_team,home_team_id,away_team,away_team_id,home_team_wins,away_team_wins,winning_team_id,losing_team_id,game_1_id,...,game_4_winning_team,game_4_losing_team,game_5_id,game_5_map,game_5_gametype,game_5_lang,game_5_home_score,game_5_away_score,game_5_winning_team,game_5_losing_team
8,1932,OpTic Gaming Los Angeles,14,Paris Legion,15,2,3,15,14,0,...,14,15,4,Rammaza,Search and Destroy,en-us,3,6,15,14
11,1934,OpTic Gaming Los Angeles,14,Chicago Huntsmen,8,0,3,8,14,0,...,0,0,4,St. Petrograd,Search and Destroy,en-us,0,0,0,0
5,1936,London Royal Ravens,10,New York Subliners,13,3,0,10,13,0,...,0,0,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0
10,1942,London Royal Ravens,10,Paris Legion,15,0,3,15,10,0,...,0,0,4,Gun Runner,Search and Destroy,en-us,0,0,0,0
12,1948,New York Subliners,13,Atlanta FaZe,7,1,3,7,13,0,...,7,13,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,3654,Paris Legion,15,Los Angeles Guerrillas,11,3,1,15,11,0,...,15,11,4,Gun Runner,Search and Destroy,en-us,0,0,0,0
152,3655,New York Subliners,13,Paris Legion,15,0,3,15,13,0,...,0,0,4,Arklov Peak,Search and Destroy,en-us,0,0,0,0
154,3656,London Royal Ravens,10,Paris Legion,15,2,3,15,10,0,...,15,10,4,Gun Runner,Search and Destroy,en-us,5,6,15,10
155,3657,Dallas Empire,6,Florida Mutineers,9,3,0,6,9,0,...,0,0,4,Gun Runner,Search and Destroy,en-us,0,0,0,0


In [62]:
# List the distinct match ids present in the frame
match_df['match_id'].unique()

array([1979, 1962, 1982, 1936, 1969, 1960, 1932, 1970, 1942, 1934, 1948,
       1965, 2804, 2802, 2803, 2805, 2846, 2858, 2847, 2859, 2881, 2883,
       2887, 2886, 2888, 2891, 2889, 2892, 2890, 2983, 2984, 3006, 3007,
       3030, 3041, 3050, 3051, 3054, 3266, 3261, 3267, 3262, 3268, 3269,
       3263, 3264, 3270, 3265, 3271, 3272, 3273, 3279, 3274, 3280, 3275,
       3281, 3276, 3282, 3277, 3283, 3278, 3284, 3285, 3286, 3555, 3560,
       3561, 3556, 3562, 3557, 3563, 3558, 3564, 3559, 3565, 3566, 3567,
       3573, 3574, 3568, 3569, 3575, 3570, 3576, 3571, 3577, 3572, 3578,
       3579, 3580, 3586, 3581, 3587, 3582, 3588, 3583, 3589, 3584, 3590,
       3585, 3591, 3592, 3593, 3599, 3594, 3600, 3595, 3601, 3596, 3602,
       3597, 3603, 3598, 3604, 3605, 3606, 3607, 3608, 3612, 3613, 3614,
       3609, 3615, 3610, 3616, 3611, 3617, 3618, 3619, 3620, 3625, 3621,
       3626, 3622, 3627, 3623, 3628, 3624, 3629, 3630, 3631, 3632, 3651,
       3646, 3652, 3647, 3653, 3648, 3654, 3649, 36

In [63]:
# Save the trimmed frame to the same CSV path used above (overwriting it), without the index
match_df.to_csv('data/2020_reg_season_matches.csv', index=False)