In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json

### Get Data From Single Page

In [2]:
# Create headers to emulate browsing and send GET request to URL

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# requests never times out by default, so an explicit timeout keeps the cell
# from hanging indefinitely on an unresponsive server.
cod_site_data = requests.get('https://callofdutyleague.com/en-us/match/3638', headers=headers, timeout=30)

# Fail here, with the HTTP status, rather than later while parsing an error page.
cod_site_data.raise_for_status()

### Explore Content

After looking through the returned content, I found that the site is generated from JSON. When using the Google Chrome Inspector, the HTML structure is clear, but this is not what is returned from the request.

The data is converted into a string and split in a manner that isolates the JSON. Python's `json` package is then used to parse that string, so the data can be accessed through its key-value structure.

In [9]:
# Stores content as bytes-like object
cod_site_content = cod_site_data.content

# Convert to BeautifulSoup object for parsing then convert to string.
# The parser is named explicitly so the result does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(cod_site_content, 'html.parser')
string_soup = str(soup)

# Split the data at intersections of JSON object
string_soup_list = string_soup.split('type="application/json">')
if len(string_soup_list) < 2:
    # Without this guard a missing marker shows up as an unexplained IndexError
    raise ValueError('No application/json script tag found in the match page')
string_soup_list_1 = string_soup_list[1].split('</script>')

# Load string data into JSON
content_dict = json.loads(string_soup_list_1[0])

### Parse Data from Content

Now that the JSON structure is determined, it is possible to extract the data through indexing.

In [None]:
# Locate wanted data: walk the long path into the page JSON once and reuse it
match_extended = content_dict['props']['pageProps']['blocks'][1]['cdlMatchDetail']['matchData']['matchExtended']
home_team_card = match_extended['homeTeamCard']
away_team_card = match_extended['awayTeamCard']
match_result = match_extended['result']

# Get Home and Away Team info
home_team = home_team_card['name']
home_team_id = home_team_card['id']
away_team = away_team_card['name']
away_team_id = away_team_card['id']

# Get total matches won by each team and final result
home_team_wins = match_result['homeTeamGamesWon']
away_team_wins = match_result['awayTeamGamesWon']
winning_team_id = match_result['winnerTeamId']
loser_team_id = match_result['loserTeamId']

In [None]:
# Locate the individual games within each match
match_games = content_dict['props']['pageProps']['blocks'][1]['cdlMatchDetail']['matchData']['matchGamesExtended']

# Get game-specific data (map, score, mode, etc) and store to list for appending to DataFrame.
# The list starts with the 8 match-level fields; each game then adds 8 more.
matches = [home_team, home_team_id, away_team, away_team_id, home_team_wins, away_team_wins, winning_team_id, loser_team_id]

for game in match_games:
    match_game = game['matchGame']
    match_no = match_game['number']
    match_map = match_game['map']
    mode = match_game['mode']
    locale = match_game['gameMap']['locale']
    try:
        game_result = game['matchGameResult']
        home_score = game_result['hostGameScore']
        away_score = game_result['guestGameScore']
        winning_team = game_result['winnerTeamId']
        losing_team = game_result['loserTeamId']
    except (KeyError, TypeError):
        # A game whose result is missing (KeyError) or null (TypeError) is
        # recorded with zeros; any other error is a real problem and propagates.
        home_score = 0
        away_score = 0
        winning_team = 0
        losing_team = 0
    matches.extend([match_no, match_map, mode, locale, home_score, away_score, winning_team, losing_team])

### Save Content

In [None]:
# Column layout: 8 match-level fields followed by 8 fields for each of 5 games
MAX_GAMES = 5
match_columns = ['home_team', 'home_team_id', 'away_team', 'away_team_id', 'home_team_wins', 'away_team_wins', 'winning_team_id', 'losing_team_id']
game_fields = ['id', 'map', 'gametype', 'lang', 'home_score', 'away_score', 'winning_team', 'losing_team']
matches_columns = match_columns + [
    'match_{}_{}'.format(game_no, field)
    for game_no in range(1, MAX_GAMES + 1)
    for field in game_fields
]

# The row must supply exactly one value per column
if len(matches) != len(matches_columns):
    raise ValueError(
        'Expected {} values ({} games) but got {}'.format(len(matches_columns), MAX_GAMES, len(matches))
    )

# Convert to DataFrame. Building it straight from the list (one row) keeps each
# value's own type; a NumPy array of mixed strings and numbers would coerce
# every value to a string.
matches_df = pd.DataFrame([matches], columns=matches_columns)

# Store to CSV for later use
matches_df.to_csv('data/2020_reg_season_matches.csv', index=False)

### Get All Match IDs to Programmatically Get Pages

This is essentially the same process as above, applied to the regular season schedule page in order to collect the ID of every match in each series, so that each match's page can then be requested.

In [9]:
# Request the schedule page; the timeout prevents an indefinite hang and
# raise_for_status stops the cell on an HTTP error instead of parsing an error page.
code_sites = requests.get('https://callofdutyleague.com/en-us/schedule?utm_source=cdlweb&utm_medium=navigationbar&utm_campaign=general', headers=headers, timeout=30)
code_sites.raise_for_status()
all_matches = code_sites.content

In [10]:
# Name the parser explicitly so the result does not depend on which optional
# parsers (lxml, html5lib) happen to be installed.
match_soup = BeautifulSoup(all_matches, 'html.parser')

match_soup_string = str(match_soup)

# Isolate the text between the JSON script tag's opening and its closing tag
match_soup_list = match_soup_string.split('type="application/json">')
if len(match_soup_list) < 2:
    # Without this guard a missing marker shows up as an unexplained IndexError
    raise ValueError('No application/json script tag found in the schedule page')
match_soup_list_1 = match_soup_list[1].split('</script>')

# Walk down to the list of tabs that the next cell iterates over as series
reg_season = json.loads(match_soup_list_1[0])['props']['pageProps']['blocks'][2]['tabs']['tabs'][0]['blocks'][0]['tabs']['tabs']

In [11]:
# Collect the id of every match listed under each series' `finalMatches`
match_ids = []

for series in reg_season:
    series_cards = series['blocks'][2]['tabs']['tabs'][0]['blocks'][0]['cdlMatchCards']
    match_ids.extend(entry['match']['id'] for entry in series_cards['finalMatches'])
    # Print the series title once its ids have been collected
    print(series['title'])

In [12]:
# Save the collected match ids; index=False keeps the row index out of the
# file so it holds only the `ids` column when it is read back.
ids_df = pd.DataFrame({
    'ids': match_ids
})

ids_df.to_csv('data/2020_reg_season_match_ids.csv', index=False)

In [16]:
# Load the saved match ids CSV into a DataFrame
ids = pd.read_csv('data/2020_reg_season_match_ids.csv')

In [17]:
# Replace the DataFrame with a plain Python list of the values in its `ids` column
ids = ids['ids'].tolist()

### Turn it all into Functions

In [24]:
def get_page_data(_id):
    """
    Trigger request to get the match url with the match id appended, then
    extract the JSON embedded in the returned page.
    
    INPUT: Integer id of match.
    
    OUTPUT: JSON object (dict) of page data.
    
    RAISES: requests.HTTPError if the response status is an error;
            ValueError if the page contains no application/json script tag.
    """
    # `headers` is the browser-like header dict defined earlier in the notebook.
    # The timeout keeps a stalled connection from blocking forever.
    url = 'https://callofdutyleague.com/en-us/match/{}'.format(_id)
    data = requests.get(url, headers=headers, timeout=30)
    data.raise_for_status()

    # Stores content as bytes-like object
    content = data.content

    # Convert to BeautifulSoup object for parsing then convert to string.
    # The parser is explicit so results do not depend on installed extras.
    soup = BeautifulSoup(content, 'html.parser')
    string_soup = str(soup)

    # Split the data at intersections of JSON object
    string_soup_list = string_soup.split('type="application/json">')
    if len(string_soup_list) < 2:
        raise ValueError('No application/json script tag found for match {}'.format(_id))
    string_soup_list_1 = string_soup_list[1].split('</script>')

    # Load string data into JSON and hand it back to the caller
    return json.loads(string_soup_list_1[0])

def parse_page_data(data):
    """
    Parse through page data to collect desired data points.
    
    INPUT: JSON object (dict) of page data.
    
    OUTPUT: Flat list of data points from page: 8 match-level values
            (home/away team name and id, games won by each, winner and
            loser ids) followed by 8 values per game (number, map, mode,
            locale, home score, away score, winner id, loser id).
    
    RAISES: KeyError / IndexError / TypeError if the page data does not have
            the expected structure. A missing or null per-game result is
            not an error; that game's four result values are recorded as 0.
    """
    # Locate wanted data (read from the argument, not from a notebook global)
    match_data = data['props']['pageProps']['blocks'][1]['cdlMatchDetail']['matchData']
    match_extended = match_data['matchExtended']
    home_team_card = match_extended['homeTeamCard']
    away_team_card = match_extended['awayTeamCard']
    match_result = match_extended['result']

    # Match-level fields: team info, games won by each team and final result
    matches = [
        home_team_card['name'],
        home_team_card['id'],
        away_team_card['name'],
        away_team_card['id'],
        match_result['homeTeamGamesWon'],
        match_result['awayTeamGamesWon'],
        match_result['winnerTeamId'],
        match_result['loserTeamId'],
    ]

    # Iterate through each game in each match
    for game in match_data['matchGamesExtended']:
        match_game = game['matchGame']
        game_fields = [
            match_game['number'],
            match_game['map'],
            match_game['mode'],
            match_game['gameMap']['locale'],
        ]
        try:
            game_result = game['matchGameResult']
            result_fields = [
                game_result['hostGameScore'],
                game_result['guestGameScore'],
                game_result['winnerTeamId'],
                game_result['loserTeamId'],
            ]
        except (KeyError, TypeError):
            # Result missing (KeyError) or null (TypeError): record zeros
            result_fields = [0, 0, 0, 0]
        matches.extend(game_fields + result_fields)
    return matches
        

def store_match_data(df, match):
    """
    Add one match's list of data points to a Pandas DataFrame as a row.
    
    INPUT: Pandas DataFrame of previous match data, and the list of data
           points for one match.
    
    OUTPUT: The same DataFrame object, modified in place, with the row set.
    """
    # The row is stored under the label equal to the current number of rows
    df.loc[len(df)] = match
    return df