# Exploration sandbox of MLS Fantasy stats page

In [1]:
import pandas as pd
import numpy as np
import time
import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from typing import List, Tuple

import matplotlib.pyplot as plt
%matplotlib inline

### Exploration of the mls fantasy pages.

#### Pages needed:
    1. Player pages - ex. for Christian Roldan: https://fantasy.mlssoccer.com/#stats-center/player-profile/172762
    2. All players page - https://fantasy.mlssoccer.com/#stats-center
        - Will need to use Selenium to navigate team by team to their pages, and then to pick each player
    3. Standing page? - https://www.mlssoccer.com/standings
    4. Expected goals/expected assists page (potentially from American Soccer Analysis)? https://www.americansocceranalysis.com/player-xg-2019

In [2]:
def clean_data(text: str) -> List[str]:
    """Split raw page text into a list of its non-empty fields.

    Tabs, newlines and backslashes act as field separators. Runs of
    separators are collapsed, so no empty strings are produced. Spaces are
    not separators, so multi-word fields (and any leading spaces) are kept.

    A hyphen that starts a field is emitted immediately as its own
    one-character field, so ``"-2"`` yields ``['-', '2']``.

    The final field is returned even when ``text`` does not end with a
    separator.
    """
    separators = ('\t', '\n', '\\')
    row = []
    word = ''
    for char in text:
        if char in separators:
            # a separator closes the field collected so far, if any
            if word:
                row.append(word)
                word = ''
            continue
        word += char
        # a lone leading hyphen is a field of its own
        if word == '-':
            row.append(word)
            word = ''
    # flush the last field when the text does not end with a separator
    if word:
        row.append(word)
    return row

In [3]:
def get_weekly_info(page_table_obj, week: int) -> List[str]:
    """Placeholder for building one week's combined row of data.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    # Draft implementation, kept for reference:
    #return clean_data(page_table_obj[week]) + [int(entry) for entry in clean_data(page_table_obj[week + 37])[0::2]]
    return None

## NEED TO FIX, I THINK I'M PASSING IN A LIST, NOT A STRING

In [4]:
def get_player_salary(player_info: List[str]) -> float:
    '''Return the numeric part of the player's salary text as a float.

    player_info is the list of cleaned player-info fields, not a raw
    string. The salary text is read from index 2 and looks like
    ' | $ 9.5m' (the separators may be non-breaking spaces).

    Raises AssertionError if that field is not a string or is too short to
    be salary text, and ValueError if its last token is not a number.
    '''
    salary_text = player_info[2]
    assert isinstance(salary_text, str), "input needs to be a string"
    assert len(salary_text) > 3, "input is not the salary text "

    # split() with no argument splits on any whitespace, including
    # non-breaking spaces, and ignores trailing whitespace
    tokens = salary_text.split()
    amount = tokens[-1] if tokens else ''
    # drop the unit suffix only when it is present, so that no digit is
    # lost when the suffix is missing
    if amount[-1:] in ('m', 'M'):
        amount = amount[:-1]
    return float(amount.lstrip('$'))

In [5]:
def get_player_team(player_info: List[str]) -> str:
    '''Return the player's team name.

    player_info is the list of cleaned player-info fields, not a raw
    string. The team is the field at index 0 and is returned unchanged.

    Raises AssertionError if that field is not a string or is empty.
    '''
    team = player_info[0]
    assert isinstance(team, str), "input needs to be a string"
    assert len(team) > 0, "input cannot be empty"

    return team

In [6]:
def get_player_position(player_info: List[str]) -> str:
    '''Return the player's position as its initial letter.
    G is for Goalkeeper
    D is for Defender
    M is for Midfielder
    F is for Forward

    player_info is the list of cleaned player-info fields, not a raw
    string. The position is read from index 1.

    Raises AssertionError if that field is not a string or is blank.
    '''
    position = player_info[1]
    assert isinstance(position, str), "input needs to be a string"
    # cleaned fields can keep leading spaces, so strip before taking the
    # initial; otherwise the "initial" could be a space
    position = position.strip()
    assert len(position) > 0, "input cannot be empty"

    return position[0]

#### Start player page exploration using Christian Roldan
https://fantasy.mlssoccer.com/#stats-center/player-profile/172762

Using Selenium, per advice from Miles

In [23]:
browser = Chrome()

In [24]:
page_link = 'https://fantasy.mlssoccer.com/#stats-center/player-profile/172762'

In [25]:
browser.get(page_link)
time.sleep(5)

In [26]:
html = browser.page_source

In [27]:
# create the soup object that holds the text from the page
soup = BeautifulSoup(html, 'html.parser')

In [28]:
# select code from the page that describes info about the player, store in a list
players = soup.select('div.player-info-wrapper')

In [29]:
# create a list of text from players object
player_text = [player.text for player in players]

In [30]:
player_text

['\n\n\n\n\nSeattle Sounders FC\n\n\t\t\t\t\tMidfielder\n\t\t\t\t\t\n\xa0|\xa0$ 9.5m\n\n\nWeek change: -\nSeason change: -\n\n\n More info\n\n\n']

In [31]:
# when processing the player_text list, the output is a list of 6 strings: team, position, playing
# status and salary (the '| $ 9.5m' field), weekly salary change, season salary change, and the
# "More info" button text, which is not useful for the purpose of this project
clean_data(player_text[0])

['Seattle Sounders FC',
 'Midfielder',
 '\xa0|\xa0$ 9.5m',
 'Week change: -',
 'Season change: -',
 ' More info']

In [17]:
for i, info in enumerate(clean_data(player_text[0])):
    print(f"index {i} --> " + info + "\n")

index 0 --> Seattle Sounders FC

index 1 --> Midfielder

index 2 -->  | $ 9.5m

index 3 --> Week change: -

index 4 --> Season change: -

index 5 -->  More info



In [240]:
get_player_salary(clean_data(player_text[0])), type(get_player_salary(clean_data(player_text[0])))

(9.8, float)

In [18]:
get_player_team(clean_data(player_text[0]))

'Seattle Sounders FC'

In [19]:
get_player_position(clean_data(player_text[0]))

'M'

In [20]:
clean_data(player_text[0])[0]

'Seattle Sounders FC'

In [None]:
def get_player_info(raw_player_data: List[str]) -> Tuple[str, float]:
    '''Placeholder: intended to break the raw player data out into the team position, player salary,
    weekly salary change, and season salary change.

    Not implemented yet; the argument is ignored and None is returned.
    '''
    return None

In [21]:
# select code from the page that gives the players first initial and last name
# will likely use the ID from the page URL as the identifier for the player, rather than the name
names = soup.select('div.profile-name-wrapper')

In [22]:
name_text = [name.text for name in names]

In [23]:
clean_data(name_text[0])

['C. Roldan']

In [24]:
# table grabs all the data by row/game in the table of how points are calculated
table = soup.select('div.row-table')

In [25]:
# table_text is converting the beautifulsoup object of each row into a list of texts for later cleaning
table_text = [stats.text for stats in table]

In [29]:
# table_text holds every row of the fantasy data table. On the page it looks like two side-by-side tables,
# with row 1 lining up with row 38, and every remaining row n lining up with row (n + 37)
# the 0th index of table_text is the column header for all the team games
# it appears the first half of table_text is the left table, which labels each row of the right table.
# index 1 through 34 is the game info: game week, @ or vs reflecting away or home game, the opposing team, and
# total fantasy points for the week
# index 35 is the left table 'all'
# index 36 is the right table header hover view of the column categories
# index 37 is the right table header initials of the column categories
# index 38 through 71 are the right table data, including category totals, and then fantasy team totals in ()
# index 72 is the last row and is the aggregated season totals for all the categories and the fantasy points
table_text[72]

'\n\n\n\t\t\t\t\t\t\t\t2538\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t(57)\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t6\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t(30)\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t5\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t(15)\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t9\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t(9)\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t41\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t4\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t(-4)\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t1\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t(-3)\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t49\n\t\t\t\t\t\t\t\n

In [30]:
clean_data(table_text[1]) + [int(entry) for entry in clean_data(table_text[38])[0::2]]

['1',
 'vs',
 'FC Cincinnati',
 '5',
 90,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 80,
 3,
 1,
 0,
 2,
 1,
 2,
 3,
 0,
 0,
 1,
 0]

In [31]:
clean_data(table_text[3])

['3', '@', 'Chicago Fire', '4']

In [32]:
for i, text in enumerate(table_text):
    print(f"row {i} --> {clean_data(text)} \n")

row 0 --> ['RD', 'OPPONENT', 'PTS'] 

row 1 --> ['1', 'vs', 'FC Cincinnati', '5'] 

row 2 --> ['2', 'vs', 'Colorado Rapids', '7'] 

row 3 --> ['3', '@', 'Chicago Fire', '4'] 

row 4 --> ['5', '@', 'Vancouver Whitecaps FC', '8'] 

row 5 --> ['6', 'vs', 'Real Salt Lake', '7'] 

row 6 --> ['7', 'vs', 'Toronto FC', '9'] 

row 7 --> ['8', '@', 'Los Angeles FC', '5'] 

row 8 --> ['9', 'vs', 'San Jose Earthquakes', '3'] 

row 9 --> ['9', 'vs', 'Los Angeles FC', '-', '2'] 

row 10 --> ['10', '@', 'Minnesota United FC', '10'] 

row 11 --> ['11', 'vs', 'Houston Dynamo', '9'] 

row 12 --> ['12', 'vs', 'Orlando City SC', '3'] 

row 13 --> ['12', '@', 'Philadelphia Union', '4'] 

row 14 --> ['13', '@', 'Sporting Kansas City', '5'] 

row 15 --> ['14', '@', 'FC Dallas', '2'] 

row 16 --> ['15', '@', 'Montreal Impact', '0'] 

row 17 --> ['17', 'vs', 'Vancouver Whitecaps FC', '0'] 

row 18 --> ['18', '@', 'New York City FC', '0'] 

row 19 --> ['18', '@', 'Columbus Crew SC', '0'] 

row 20 --> ['19', 'vs

In [34]:
t36 = clean_data(table_text[36])

In [35]:
t36[1::2]

['MIN',
 'GF',
 'A',
 'CS',
 'PS',
 'PE',
 'PM',
 'GA',
 'SV',
 'Y',
 'R',
 'OG',
 'T',
 'P',
 'KP',
 'CRS',
 'BC',
 'CL',
 'BLK',
 'INT',
 'BR',
 'ELG',
 'OGA',
 'SH',
 'WF']

In [36]:
table_text[38]

'\n\n\n\t\t\t\t\t\t\t\t90\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t(2)\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t1\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\t2\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\t-

In [37]:
t38 = clean_data(table_text[38])

In [42]:
t38[0::2]

['90',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '2',
 '80',
 '3',
 '1',
 '0',
 '2',
 '1',
 '2',
 '3',
 '0',
 '0',
 '1',
 '0']

In [43]:
t1 = clean_data(table_text[1])

In [44]:
t1

['1', 'vs', 'FC Cincinnati', '5']

In [45]:
t2 = clean_data(table_text[2])

In [46]:
t2

['2', 'vs', 'Colorado Rapids', '7']

In [47]:
t1 + t38[0::2]

['1',
 'vs',
 'FC Cincinnati',
 '5',
 '90',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '2',
 '80',
 '3',
 '1',
 '0',
 '2',
 '1',
 '2',
 '3',
 '0',
 '0',
 '1',
 '0']

In [48]:
clean_data(table_text[1])

['1', 'vs', 'FC Cincinnati', '5']

In [50]:
len(clean_data(table_text[38])[0::2])

25

In [51]:
clean_data(table_text[8]) + clean_data(table_text[45])[0::2]

['9',
 'vs',
 'San Jose Earthquakes',
 '3',
 '90',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '2',
 '0',
 '0',
 '0',
 '0',
 '4',
 '56',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '2',
 '0',
 '0',
 '1',
 '0']

In [54]:
clean_data(table_text[9]) + clean_data(table_text[46])[0::2]

['9',
 'vs',
 'Los Angeles FC',
 '-',
 '2',
 '18',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '2',
 '0',
 '0',
 '0',
 '2',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

In [61]:
(len(clean_data(table_text[8]) + clean_data(table_text[45])[0::2]),
len(clean_data(table_text[9]) + clean_data(table_text[46])[0::2]),
len(clean_data(table_text[10]) + clean_data(table_text[47])[0::2]),
len(clean_data(table_text[11]) + clean_data(table_text[48])[0::2]),
len(clean_data(table_text[12]) + clean_data(table_text[49])[0::2]),
 len(clean_data(table_text[13]) + clean_data(table_text[50])[0::2]))

(29, 30, 29, 29, 29, 29)

In [66]:
len(clean_data(table_text[52]))

50

In [76]:
clean_data(table_text[52])[0::2]

['90',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '2',
 '0',
 '0',
 '0',
 '0',
 '1',
 '64',
 '1',
 '0',
 '0',
 '0',
 '1',
 '1',
 '4',
 '0',
 '0',
 '0',
 '2']

In [100]:
# this tells me that .split('-') breaks a string into a list at every hyphen, whereas .strip('-') removes
# leading and trailing hyphens from the string. This is helpful to know for later, when I will remove stats
# for a category that is basically n/a; the website puts a hyphen as a placeholder.

'--'.split('-'), ''.strip('-')

(['', '', ''], '')

In [77]:
# data for games start at index 38
game_one = table_text[38]

In [78]:
# 
game_two = table_text[39]

In [79]:
# create a variable game_one_data which removes \n and \t and \\ from the row data
game_one_data = clean_data(game_one)

In [80]:
game_two_data = clean_data(game_two)

In [81]:
len(game_one_data)

50

In [82]:
game_one_data[::2] # these are the actual totals for each category

['90',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '2',
 '80',
 '3',
 '1',
 '0',
 '2',
 '1',
 '2',
 '3',
 '0',
 '0',
 '1',
 '0']

In [83]:
game_one_data[1::2] # these are the fantasy points for each category total

['(2)',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '(2)',
 '(1)',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-']

In [84]:
game_one_points = [int(point[1:-1])
                  for point in game_one_data[1::2] if point.strip('-')]

In [85]:
game_one_points

[2, 2, 1]

In [86]:
sum(game_one_points)

5

In [87]:
game_two_data[1::2]

['(2)',
 '-',
 '-',
 '(1)',
 '-',
 '-',
 '-',
 '-',
 '-',
 '(-1)',
 '-',
 '-',
 '-',
 '(2)',
 '-',
 '-',
 '-',
 '-',
 '(1)',
 '-',
 '(2)',
 '-',
 '-',
 '-',
 '-']

In [88]:
# game_one_points = [int(point[1])
#                   for point in game_one_data[1::2] if point.strip('-')]

game_two_points = [int(point[1:-1]) for point in game_two_data[1::2] if point.strip('-')]

In [89]:
game_two_points, sum(game_two_points)

([2, 1, -1, 2, 1, 2], 7)

In [90]:
game_three = table_text[40]

In [91]:
game_three_data = clean_data(game_three)

In [92]:
game_three_points = [int(point[1:-1])
                     for point in game_three_data[1::2] if point.strip('-')]

In [93]:
game_three_points, sum(game_three_points)

([2, -1, 1, 1, 1], 4)

In [101]:
sum(game_one_points)

5

- Process will be
- Go through each of the second half of the season for the points totals to parse points
- will need to capture both the counts totals for the categories, and the points totals
- will want to model the counts totals and then have rules-based to calculate the expected fantasy points from it

In [127]:
# row 36 is the header row, so each game row and its row of statistical categories and associated points
# are 37 rows apart in the data: to line them up, pair row n with row n + 37
table_text[36]

'\n\nMinutes played\n\t\t\t\t\t\t\tMIN\n\t\t\t\t\t\t\n\nGoals\n\t\t\t\t\t\t\tGF\n\t\t\t\t\t\t\n\nGoal assist\n\t\t\t\t\t\t\tA\n\t\t\t\t\t\t\n\nClean Sheet\n\t\t\t\t\t\t\tCS\n\t\t\t\t\t\t\n\nPenalty save\n\t\t\t\t\t\t\tPS\n\t\t\t\t\t\t\n\nPenalty earned\n\t\t\t\t\t\t\tPE\n\t\t\t\t\t\t\n\nPenalty miss\n\t\t\t\t\t\t\tPM\n\t\t\t\t\t\t\n\nGoals Conceded\n\t\t\t\t\t\t\tGA\n\t\t\t\t\t\t\n\nSaves\n\t\t\t\t\t\t\tSV\n\t\t\t\t\t\t\n\nYellow card\n\t\t\t\t\t\t\tY\n\t\t\t\t\t\t\n\nRed card\n\t\t\t\t\t\t\tR\n\t\t\t\t\t\t\n\nOwn goal\n\t\t\t\t\t\t\tOG\n\t\t\t\t\t\t\n\nTackles\n\t\t\t\t\t\t\tT\n\t\t\t\t\t\t\n\nPasses\n\t\t\t\t\t\t\tP\n\t\t\t\t\t\t\n\nKey Pass\n\t\t\t\t\t\t\tKP\n\t\t\t\t\t\t\n\nCrosses\n\t\t\t\t\t\t\tCRS\n\t\t\t\t\t\t\n\nBig Chance created\n\t\t\t\t\t\t\tBC\n\t\t\t\t\t\t\n\n Clearance\n\t\t\t\t\t\t\tCL\n\t\t\t\t\t\t\n\nBlocked passes\n\t\t\t\t\t\t\tBLK\n\t\t\t\t\t\t\n\nInterceptions\n\t\t\t\t\t\t\tINT\n\t\t\t\t\t\t\n\nRecovered balls\n\t\t\t\t\t\t\tBR\n\t\t\t\t\t\t\n\nError leading to 

In [113]:
clean_data(table_text[36])

['Minutes played',
 'MIN',
 'Goals',
 'GF',
 'Goal assist',
 'A',
 'Clean Sheet',
 'CS',
 'Penalty save',
 'PS',
 'Penalty earned',
 'PE',
 'Penalty miss',
 'PM',
 'Goals Conceded',
 'GA',
 'Saves',
 'SV',
 'Yellow card',
 'Y',
 'Red card',
 'R',
 'Own goal',
 'OG',
 'Tackles',
 'T',
 'Passes',
 'P',
 'Key Pass',
 'KP',
 'Crosses',
 'CRS',
 'Big Chance created',
 'BC',
 ' Clearance',
 'CL',
 'Blocked passes',
 'BLK',
 'Interceptions',
 'INT',
 'Recovered balls',
 'BR',
 'Error leading to a goal',
 'ELG',
 'Own Goals Assist',
 'OGA',
 'Shots',
 'SH',
 'Was Fouled',
 'WF']

In [116]:
# Here I am just going through the header for the table, and pairing the acronym visible in the table with the
# actual title for the category shown when you hover the mouse over the acronym. I think I want to keep the
# acronyms and then have a dict/list for reference.

for category, acronym in zip(clean_data(table_text[36])[0::2], clean_data(table_text[36])[1::2]):
    print(category, acronym)

Minutes played MIN
Goals GF
Goal assist A
Clean Sheet CS
Penalty save PS
Penalty earned PE
Penalty miss PM
Goals Conceded GA
Saves SV
Yellow card Y
Red card R
Own goal OG
Tackles T
Passes P
Key Pass KP
Crosses CRS
Big Chance created BC
 Clearance CL
Blocked passes BLK
Interceptions INT
Recovered balls BR
Error leading to a goal ELG
Own Goals Assist OGA
Shots SH
Was Fouled WF


In [125]:
#clean_data(table_text[72])
clean_data(table_text[35]) + clean_data(table_text[72])[0::2]

['all',
 '-',
 '2538',
 '6',
 '5',
 '9',
 '0',
 '0',
 '0',
 '41',
 '0',
 '4',
 '1',
 '0',
 '49',
 '1583',
 '33',
 '12',
 '5',
 '37',
 '17',
 '19',
 '198',
 '0',
 '0',
 '31',
 '62']

In [128]:
len(clean_data(table_text[35]) + clean_data(table_text[72])[0::2])

27

In [146]:
clean_data(table_text[0]) + clean_data(table_text[36])[1::2]

['RD',
 'OPPONENT',
 'PTS',
 'MIN',
 'GF',
 'A',
 'CS',
 'PS',
 'PE',
 'PM',
 'GA',
 'SV',
 'Y',
 'R',
 'OG',
 'T',
 'P',
 'KP',
 'CRS',
 'BC',
 'CL',
 'BLK',
 'INT',
 'BR',
 'ELG',
 'OGA',
 'SH',
 'WF']

#### Now we will make a dataframe with Christian Roldan's data. This is essentially the cleaning step for when the data is downloaded. I will have to think more about how all of this is stored.

* Of note will be cleaning up the column names and fixing the case when there are negative points for the week and an extra column is added.

In [229]:
cr_data = [clean_data(table_text[i]) + clean_data(table_text[i + 37])[0::2] for i in range(1,36)]    

In [230]:
def update_negative_scores(game: List) -> List:
    '''Repair a row whose negative score was split into a lone hyphen followed by its digits.

    When the item at index 3 is a hyphen, it is merged with the item at index 4 into a single
    negative value, which shortens the row by one so it matches the length of the other rows.
    The row is modified in place and the same list is returned.
    '''
    sign_was_split = game[3] == '-'
    if sign_was_split:
        # swap the ['-', digits] pair for the single joined value
        game[3:5] = ['-' + game[4]]
    return game

In [231]:
row8 = cr_data[8]

In [232]:
row8

['9',
 'vs',
 'Los Angeles FC',
 '-',
 '2',
 '18',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '2',
 '0',
 '0',
 '0',
 '2',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

In [233]:
update_negative_scores(row8)

['9',
 'vs',
 'Los Angeles FC',
 '-2',
 '18',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '2',
 '0',
 '0',
 '0',
 '2',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

In [234]:
len(row8), len(cr_data[3])

(29, 29)

In [235]:
row7 = cr_data[7]

In [236]:
row7, len(row7)

(['9',
  'vs',
  'San Jose Earthquakes',
  '3',
  '90',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '2',
  '0',
  '0',
  '0',
  '0',
  '4',
  '56',
  '0',
  '1',
  '0',
  '1',
  '0',
  '0',
  '2',
  '0',
  '0',
  '1',
  '0'],
 29)

In [237]:
update_negative_scores(row7)

['9',
 'vs',
 'San Jose Earthquakes',
 '3',
 '90',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '2',
 '0',
 '0',
 '0',
 '0',
 '4',
 '56',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '2',
 '0',
 '0',
 '1',
 '0']

In [238]:
len(_)

29

In [200]:
cr_data[8]

['9',
 'vs',
 'Los Angeles FC',
 '-',
 '2',
 '18',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '2',
 '0',
 '0',
 '0',
 '2',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

In [195]:
row8[4:] == cr_data[8][4:]

True

In [184]:
for game in cr_data:
    print(game[3], game[4], game[5], type(game[3]), type(game), len(game))

5 90 0 <class 'str'> <class 'list'> 29
7 90 0 <class 'str'> <class 'list'> 29
4 90 0 <class 'str'> <class 'list'> 29
8 90 0 <class 'str'> <class 'list'> 29
7 90 0 <class 'str'> <class 'list'> 29
9 90 1 <class 'str'> <class 'list'> 29
5 90 0 <class 'str'> <class 'list'> 29
3 90 0 <class 'str'> <class 'list'> 29
- 2 18 <class 'str'> <class 'list'> 30
10 90 1 <class 'str'> <class 'list'> 29
9 90 1 <class 'str'> <class 'list'> 29
3 90 0 <class 'str'> <class 'list'> 29
4 90 0 <class 'str'> <class 'list'> 29
5 90 0 <class 'str'> <class 'list'> 29
2 90 0 <class 'str'> <class 'list'> 29
0 - - <class 'str'> <class 'list'> 29
0 - - <class 'str'> <class 'list'> 29
0 - - <class 'str'> <class 'list'> 29
0 - - <class 'str'> <class 'list'> 29
8 90 0 <class 'str'> <class 'list'> 29
6 90 0 <class 'str'> <class 'list'> 29
4 90 0 <class 'str'> <class 'list'> 29
3 90 0 <class 'str'> <class 'list'> 29
6 90 0 <class 'str'> <class 'list'> 29
3 90 0 <class 'str'> <class 'list'> 29
7 90 0 <class 'str'> <class 

In [136]:
df = pd.DataFrame(cr_data)

In [148]:
col = clean_data(table_text[0]) + clean_data(table_text[36])[1::2]

In [151]:
col.append('n/a')

In [154]:
col.append('n/a 2')

In [155]:
df.columns = col

In [160]:
df.head()

Unnamed: 0,RD,OPPONENT,PTS,MIN,GF,A,CS,PS,PE,PM,...,CL,BLK,INT,BR,ELG,OGA,SH,WF,n/a,n/a 2
0,1,vs,FC Cincinnati,5,90,0,0,0,0,0,...,0,2,1,2,3,0,0,1,0,
1,2,vs,Colorado Rapids,7,90,0,0,1,0,0,...,0,1,2,1,15,0,0,1,3,
2,3,@,Chicago Fire,4,90,0,0,0,0,0,...,0,0,2,0,10,0,0,0,3,
3,5,@,Vancouver Whitecaps FC,8,90,0,0,1,0,0,...,0,2,1,0,18,0,0,1,3,
4,6,vs,Real Salt Lake,7,90,0,1,1,0,0,...,0,1,1,1,8,0,0,2,1,


In [166]:
col[2:] = col[1:-1]

In [168]:
col[1] = 'HOME/AWAY'

In [170]:
df.columns = col

In [172]:
df

Unnamed: 0,RD,HOME/AWAY,OPPONENT,PTS,MIN,GF,A,CS,PS,PE,...,BC,CL,BLK,INT,BR,ELG,OGA,SH,WF,n/a
0,1,vs,FC Cincinnati,5,90,0,0,0,0,0,...,0,2,1,2,3,0,0,1,0,
1,2,vs,Colorado Rapids,7,90,0,0,1,0,0,...,0,1,2,1,15,0,0,1,3,
2,3,@,Chicago Fire,4,90,0,0,0,0,0,...,0,0,2,0,10,0,0,0,3,
3,5,@,Vancouver Whitecaps FC,8,90,0,0,1,0,0,...,0,2,1,0,18,0,0,1,3,
4,6,vs,Real Salt Lake,7,90,0,1,1,0,0,...,0,1,1,1,8,0,0,2,1,
5,7,vs,Toronto FC,9,90,1,0,0,0,0,...,1,1,0,0,5,0,0,3,0,
6,8,@,Los Angeles FC,5,90,0,0,0,0,0,...,0,1,0,0,6,0,0,0,5,
7,9,vs,San Jose Earthquakes,3,90,0,0,0,0,0,...,0,1,0,0,2,0,0,1,0,
8,9,vs,Los Angeles FC,-,2,18,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0.0
9,10,@,Minnesota United FC,10,90,1,0,0,0,0,...,0,2,0,0,7,0,0,2,6,
