In [1]:
import bs4
import pandas as pd
import requests
import sqlite3
import time

In [2]:
# make connection to db
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable so the notebook can run on another machine.
try:
    con = sqlite3.connect('/Users/qlanners/projects/mlb_salary_comp_db/mlb_salary_comp.db')
except sqlite3.Error:
    # Catch only sqlite failures (a bare `except` would also trap things like
    # KeyboardInterrupt) and re-raise rather than calling exit(), so the
    # session stays alive and the traceback shows why the connection failed.
    print('Error connecting to db.')
    raise

In [3]:
# match full team names to abbreviations
# One row per abbreviation: every full name listed in a row (current or
# historical) maps to that row's abbreviation.
teams = {
    full_name: abbrev
    for abbrev, full_names in [
        ('LAA', ['Anaheim Angels', 'Los Angeles Angels of Anaheim',
                 'Los Angeles Angels', 'California Angels']),
        ('ARI', ['Arizona Diamondbacks']),
        ('ATL', ['Atlanta Braves']),
        ('BAL', ['Baltimore Orioles']),
        ('BOS', ['Boston Red Sox']),
        ('CHC', ['Chicago Cubs']),
        ('CHW', ['Chicago White Sox']),
        ('CIN', ['Cincinnati Reds']),
        ('CLE', ['Cleveland Indians']),
        ('COL', ['Colorado Rockies']),
        ('DET', ['Detroit Tigers']),
        ('MIA', ['Florida Marlins', 'Miami Marlins']),
        ('HOU', ['Houston Astros']),
        ('KCR', ['Kansas City Royals']),
        ('LAD', ['Los Angeles Dodgers']),
        ('MIL', ['Milwaukee Brewers']),
        ('MIN', ['Minnesota Twins']),
        ('MON', ['Montreal Expos']),
        ('NYM', ['New York Mets']),
        ('NYY', ['New York Yankees']),
        ('OAK', ['Oakland Athletics']),
        ('PHI', ['Philadelphia Phillies']),
        ('PIT', ['Pittsburgh Pirates']),
        ('SDP', ['San Diego Padres']),
        ('SEA', ['Seattle Mariners']),
        ('SFG', ['San Francisco Giants']),
        ('STL', ['St. Louis Cardinals']),
        ('TBR', ['Tampa Bay Devil Rays', 'Tampa Bay Rays']),
        ('TEX', ['Texas Rangers']),
        ('TOR', ['Toronto Blue Jays']),
        ('WSN', ['Washington Nationals']),
    ]
    for full_name in full_names
}

In [4]:
# match old abbrevs to current
# Start from an identity mapping over every abbreviation listed here, then
# overwrite the three codes that map to a different one. Updating existing
# keys keeps the key order unchanged.
# NOTE(review): '2TM'/'3TM'/'4TM' are presumably multi-team markers; they are
# kept as-is -- confirm.
teams_abbrevs = {
    code: code
    for code in [
        'SEA', 'ANA', 'DET', 'TOR', 'OAK', 'BOS', 'CHW', 'KCR', 'CLE', 'MIN',
        'BAL', '2TM', 'NYY', 'TEX', 'TBD', 'MIL', 'LAD', 'CHC', 'PHI', 'FLA',
        'SDP', 'ARI', 'PIT', 'STL', 'MON', 'COL', '3TM', 'NYM', 'CIN', 'HOU',
        'ATL', 'SFG', 'LAA', 'WSN', 'TBR', 'MIA', '4TM',
    ]
}
teams_abbrevs.update({'ANA': 'LAA', 'TBD': 'TBR', 'FLA': 'MIA'})

In [5]:
def make_camel_case(cols):
    """Convert an iterable of column names to camelCase.

    Each name is lower-cased, underscores are treated as word breaks, every
    word is capitalised with ``str.title`` and the spaces are removed; the
    first character is then lower-cased. Because ``str.title`` capitalises a
    letter that follows any non-letter, names such as ``'2b'`` or
    ``'dfs(dk)'`` come out as ``'2B'`` and ``'dfs(Dk)'``.

    Args:
        cols: iterable of strings (e.g. a DataFrame's ``columns``).

    Returns:
        list of str, one converted name per input name, in the same order.
        An empty name is returned unchanged instead of raising IndexError.
    """
    camel_names = []
    for col in cols:
        pascal = col.lower().replace('_', ' ').title().replace(' ', '')
        # Slicing (not indexing) keeps an empty name from raising.
        camel_names.append(pascal[:1].lower() + pascal[1:])
    return camel_names

In [6]:
# bbr meta
# Load the meta CSV, camelCase its headers, parse birthdates and swap full
# team names for abbreviations, then write the result to the `meta` table.
meta = pd.read_csv('bbr/data/meta.csv')
meta.columns = make_camel_case(meta.columns)
meta = meta.assign(
    birthdate=pd.to_datetime(meta['birthdate']),
    # Index the dict per value (rather than passing the dict to map) so an
    # unmapped team name raises KeyError instead of silently becoming NaN.
    team=meta['team'].map(lambda full_name: teams[full_name]),
)
# NOTE(review): no if_exists is passed, so pandas' default ('fail') applies
# and this raises if the table already exists -- confirm for re-runs.
meta.to_sql('meta', con, index=False)

In [7]:
# bbr free_agents
# Load the free-agent CSV, camelCase its headers, parse the date columns and
# turn the 'yes'-valued flag columns into booleans, then write `free_agents`.
fa = pd.read_csv('bbr/data/free_agents.csv')
fa.columns = make_camel_case(fa.columns)

# These date columns are parsed strictly: an unparseable value raises.
for strict_dt_col in ('arrivalDt', 'birthdate', 'seasonStartDt'):
    fa[strict_dt_col] = pd.to_datetime(fa[strict_dt_col])
# freeAgentDt is the one column where unparseable values become NaT.
fa['freeAgentDt'] = pd.to_datetime(fa['freeAgentDt'], errors='coerce')

# Anything other than the exact string 'yes' becomes False.
for flag_col in ('wasDrafted', 'isFreeAgentNow'):
    fa[flag_col] = fa[flag_col].eq('yes')

fa.to_sql('free_agents', con, index=False)

In [8]:
# bbr player_value_batting
# Load the batting player-value CSV, camelCase and tidy its headers, map
# team codes through teams_abbrevs, then write the `pv_batting` table.
pv_batting = pd.read_csv('bbr/data/player_value_batting.csv')
pv_batting.columns = make_camel_case(pv_batting.columns)
pv_batting = pv_batting.rename(columns={
    '162Wl%': 'winLossAvgSeason',
    'tm': 'team',
    'waawl%': 'waawlPerc',
})
# Per-value dict indexing: an unknown code raises KeyError.
pv_batting['team'] = pv_batting['team'].map(lambda code: teams_abbrevs[code])
pv_batting.to_sql('pv_batting', con, index=False)

In [9]:
# bbr player_value_pitching
# Load the pitching player-value CSV, camelCase and tidy its headers, map
# team codes through teams_abbrevs, then write the `pv_pitching` table.
pv_pitching = pd.read_csv('bbr/data/player_value_pitching.csv')
pv_pitching.columns = make_camel_case(pv_pitching.columns)
pv_pitching = pv_pitching.rename(columns={
    '162Wl%': 'winLossAvgSeason',
    'tm': 'team',
    'waawl%': 'waawlPerc',
})
# Per-value dict indexing: an unknown code raises KeyError.
pv_pitching['team'] = pv_pitching['team'].map(lambda code: teams_abbrevs[code])
pv_pitching.to_sql('pv_pitching', con, index=False)

In [10]:
# baseball almanac salaries
# Load the salaries CSV, camelCase its headers, swap full team names for
# abbreviations and turn the 'yes'-valued flags into booleans, then write
# the `bba_salaries` table.
bba_salaries = pd.read_csv('baseball_almanac/data/salaries.csv')
bba_salaries.columns = make_camel_case(bba_salaries.columns)
bba_salaries = bba_salaries.assign(
    # Per-value dict indexing: an unmapped team name raises KeyError.
    team=bba_salaries['team'].map(lambda full_name: teams[full_name]),
    # Anything other than the exact string 'yes' becomes False.
    allStar=bba_salaries['allStar'].eq('yes'),
    worldSeries=bba_salaries['worldSeries'].eq('yes'),
)
bba_salaries.to_sql('bba_salaries', con, index=False)

In [11]:
# update games_batting game logs
# Read the stored batting game logs, camelCase the headers, then give the
# awkward ones (unnamed, digit-led, parenthesised) plain names.
games_batting = pd.read_sql_query("SELECT * FROM games_batting", con)
games_batting.columns = make_camel_case(games_batting.columns)
games_batting = games_batting.rename(columns={
    'playerId': 'nameKey',
    'unnamed:5': 'away',
    '2B': 'double',
    '3B': 'triple',
    'tm': 'team',
    'dfs(Dk)': 'dfsDk',
    'dfs(Fd)': 'dfsFd',
})

In [12]:
# check all team abbrevs are covered
# Warn if any distinct team code in the batting logs is missing from
# teams_abbrevs. This only prints; the team column itself is not modified.
if any(code not in teams_abbrevs for code in games_batting['team'].unique()):
    print('CAUTION: TEAM MISMATCH')

In [13]:
# break date into month and day
# Each date is split on a plain space: first piece -> month, second -> day
# (with a trailing '\xa0susp' marker removed from the day).
gb_dates = [stamp.split(' ') for stamp in games_batting['date']]
games_batting['month'] = [pieces[0] for pieces in gb_dates]
games_batting['day'] = [pieces[1].replace('\xa0susp', '') for pieces in gb_dates]

# break result into separate columns
# 'rslt' is split at the comma into the outcome and a '-'-separated score,
# giving rows of [outcome, teamRuns, oppRuns].
gb_rslt = []
for raw_result in games_batting['rslt']:
    outcome_and_score = raw_result.split(',')
    gb_rslt.append([outcome_and_score[0]] + outcome_and_score[1].split('-'))
for position, new_col in enumerate(['result', 'teamRuns', 'oppRuns']):
    games_batting[new_col] = [row[position] for row in gb_rslt]

# The raw columns have been replaced by the pieces extracted above.
games_batting = games_batting.drop(columns=['index', 'date', 'rslt'])

In [14]:
# convert column data types
# Integer-like columns are parsed and downcast to the smallest integer dtype
# that fits; the float columns are parsed with pandas' default numeric dtype.
cols_to_ints = ['day', 'rk', 'gcar', 'teamRuns', 'oppRuns', 'pa', 'ab', 'r', 'h', 'double', 'triple', 'hr', 'rbi', 'bb', 'ibb', 
                'so', 'hbp', 'sh', 'sf', 'roe', 'gdp', 'sb', 'cs', 'bop']
# 'bop' is already handled with the integer columns, so it is not repeated
# here. The DFS columns use the names the load cell's rename gives them
# ('dfsDk'/'dfsFd'); the pre-rename 'dfs(Dk)'/'dfs(Fd)' no longer exist and
# would raise KeyError.
cols_to_floats = ['ba', 'obp', 'slg', 'ops', 'ali', 'wpa', 're24', 'dfsDk', 'dfsFd']

for c in cols_to_ints:
    games_batting[c] = pd.to_numeric(games_batting[c], downcast='integer')

for c in cols_to_floats:
    games_batting[c] = pd.to_numeric(games_batting[c])

In [15]:
# reorder cols
# Select the batting columns in their final order and write them to a new
# table. The DFS columns are listed under the names the load cell's rename
# gives them ('dfsDk'/'dfsFd'); selecting the pre-rename 'dfs(Dk)'/'dfs(Fd)'
# would raise KeyError.
gb_col_order = ['nameKey', 'year', 'month', 'day', 'rk', 'gcar', 'gtm', 
                'team', 'away', 'opp', 'result', 'teamRuns', 'oppRuns', 'inngs',
                'pa', 'ab', 'r', 'h', 'double', 'triple', 'hr', 'rbi', 'bb', 'ibb',
                'so', 'hbp', 'sh', 'sf', 'roe', 'gdp', 'sb', 'cs', 'ba', 'obp', 'slg',
                'ops', 'bop', 'ali', 'wpa', 're24', 'dfsDk', 'dfsFd', 'pos',]
games_batting = games_batting[gb_col_order]
# NOTE(review): no if_exists is passed, so pandas' default ('fail') applies
# and this raises if 'games_batting_new' already exists -- confirm for re-runs.
games_batting.to_sql('games_batting_new', con, index=False)

In [16]:
# update games_fielding game logs
# Read the stored fielding game logs, camelCase the headers, then rename the
# columns whose generated names are unclear.
games_fielding = pd.read_sql_query("SELECT * FROM games_fielding", con)
games_fielding.columns = make_camel_case(games_fielding.columns)
games_fielding = games_fielding.rename(columns={
    'playerId': 'nameKey',
    'unnamed:4': 'away',
    'tm': 'team',
    # NOTE(review): snake_case, unlike the other camelCase names.
    'inn': 'field_inngs',
})

In [17]:
# check all team abbrevs are covered
# Warn if any distinct team code in the fielding logs is missing from
# teams_abbrevs. This only prints; the team column itself is not modified.
if any(code not in teams_abbrevs for code in games_fielding['team'].unique()):
    print('CAUTION: TEAM MISMATCH')

In [18]:
# break date into month and day
# Each date is split on a plain space: first piece -> month, second -> day
# (with a trailing '\xa0susp' marker removed from the day).
gf_dates = [stamp.split(' ') for stamp in games_fielding['date']]
games_fielding['month'] = [pieces[0] for pieces in gf_dates]
games_fielding['day'] = [pieces[1].replace('\xa0susp', '') for pieces in gf_dates]

# break result into separate columns
# 'rslt' is split at the comma into the outcome and a '-'-separated score,
# giving rows of [outcome, teamRuns, oppRuns].
gf_rslt = []
for raw_result in games_fielding['rslt']:
    outcome_and_score = raw_result.split(',')
    gf_rslt.append([outcome_and_score[0]] + outcome_and_score[1].split('-'))
for position, new_col in enumerate(['result', 'teamRuns', 'oppRuns']):
    games_fielding[new_col] = [row[position] for row in gf_rslt]

# The raw columns have been replaced by the pieces extracted above.
games_fielding = games_fielding.drop(columns=['index', 'date', 'rslt'])

In [19]:
# convert column data types
# Integer-like columns are parsed and downcast to the smallest integer dtype
# that fits; the float column is parsed with pandas' default numeric dtype.
cols_to_ints = ['day', 'rk', 'teamRuns', 'oppRuns', 'bf', 'po', 'a', 'e', 'ch', 'dp']
cols_to_floats = ['field_inngs']

games_fielding = games_fielding.assign(**{
    name: pd.to_numeric(games_fielding[name], downcast='integer')
    for name in cols_to_ints
})
games_fielding = games_fielding.assign(**{
    name: pd.to_numeric(games_fielding[name])
    for name in cols_to_floats
})

In [20]:
# reorder cols
# Select the fielding columns in their final order (a missing column raises
# KeyError) and write them to a new table.
gf_col_order = [
    'nameKey', 'year', 'month', 'day', 'rk', 'gtm',
    'team', 'away', 'opp', 'result', 'teamRuns', 'oppRuns',
    'inngs', 'bf', 'field_inngs',
    'po', 'a', 'e', 'ch', 'dp', 'pos',
]
games_fielding = games_fielding.loc[:, gf_col_order]
games_fielding.to_sql('games_fielding_new', con, index=False)

In [21]:
# update games_pitching game logs
# Read the stored pitching game logs, camelCase the headers, then give the
# awkward ones (unnamed, digit-led, parenthesised) plain names.
games_pitching = pd.read_sql_query("SELECT * FROM games_pitching", con)
games_pitching.columns = make_camel_case(games_pitching.columns)
games_pitching = games_pitching.rename(columns={
    'playerId': 'nameKey',
    'unnamed:5': 'away',
    '2B': 'double',
    '3B': 'triple',
    'tm': 'team',
    'dfs(Dk)': 'dfsDk',
    'dfs(Fd)': 'dfsFd',
})

In [22]:
# check all team abbrevs are covered
# Warn if any distinct team code in the pitching logs is missing from
# teams_abbrevs. This only prints; the team column itself is not modified.
if any(code not in teams_abbrevs for code in games_pitching['team'].unique()):
    print('CAUTION: TEAM MISMATCH')

In [23]:
# break date into month and day
# Pitching dates are split on a non-breaking space; anything from '(' onward
# in the day piece is discarded.
gp_dates = [d.split(u'\xa0') for d in games_pitching['date']]
games_pitching['month'] = [d[0] for d in gp_dates]
games_pitching['day'] = [d[1].split('(')[0] for d in gp_dates]

# break result into separate columns
# 'rslt' is split at the comma into the outcome and a '-'-separated score,
# giving rows of [outcome, teamRuns, oppRuns].
gp_rslt = [d.split(',') for d in games_pitching['rslt']]
gp_rslt = [[g[0]] + g[1].split('-') for g in gp_rslt]
games_pitching['result'] = [d[0] for d in gp_rslt]
games_pitching['teamRuns'] = [d[1] for d in gp_rslt]
games_pitching['oppRuns'] = [d[2] for d in gp_rslt]

# break decision into separate columns
# A decision looks like 'X(a-b)' or 'X(a)': the text before '(' is the
# decision, 'a' goes to winsAfter and 'b' (when a '-' is present) goes to
# lossesAfter. Without a '-', lossesAfter is None.
decisions = []
winsAfter = []
lossesAfter = []
# Iterate the 'dec' column directly; iterrows() would build a Series for
# every row just to read this one value.
for this_dec in games_pitching['dec']:
    # Treat NaN like None/'' (no decision); NaN is truthy and would otherwise
    # fail on .split().
    if pd.isna(this_dec) or not this_dec:
        decisions.append(None)
        winsAfter.append(None)
        lossesAfter.append(None)
        continue
    decisions.append(this_dec.split('(')[0])
    winsAfter.append(this_dec.split('(')[1].split('-')[0].replace(')', ''))
    # [:-1] drops the final character of the piece after '-' (the closing paren).
    lossesAfter.append(this_dec.split('-')[1][:-1] if '-' in this_dec else None)
games_pitching['decision'] = decisions
games_pitching['winsAfter'] = winsAfter
games_pitching['lossesAfter'] = lossesAfter

# The raw columns have been replaced by the pieces extracted above.
games_pitching.drop(['index', 'date', 'rslt', 'dec'], axis=1, inplace=True)

In [24]:
# convert column data types
# Integer-like columns are parsed and downcast to the smallest integer dtype
# that fits; the float columns are parsed with unparseable values coerced to NaN.
cols_to_ints = ['day', 'rk', 'gcar', 'teamRuns', 'oppRuns', 'winsAfter', 'lossesAfter', 
                'dr', 'h', 'r', 'er', 'bb', 'so', 'hr', 'hbp', 'bf', 'pit', 'str', 'stl', 'sts',
                'gb', 'fb', 'ld', 'pu', 'unk', 'gsc', 'ir', 'is', 'sb', 'cs', 'po', 'ab', 
                'double', 'triple', 'ibb', 'gdp', 'sf', 'roe']
# The DFS columns use the names the load cell's rename gives them
# ('dfsDk'/'dfsFd'); the pre-rename 'dfs(Dk)'/'dfs(Fd)' no longer exist and
# would raise KeyError.
cols_to_floats = ['ip', 'era', 'ali', 'wpa', 're24', 'dfsDk', 'dfsFd']

for c in cols_to_ints:
    games_pitching[c] = pd.to_numeric(games_pitching[c], downcast='integer')

for c in cols_to_floats:
    games_pitching[c] = pd.to_numeric(games_pitching[c], errors='coerce')

In [25]:
# reorder cols
# Select the pitching columns in their final order and write them to a new
# table. The DFS columns are listed under the names the load cell's rename
# gives them ('dfsDk'/'dfsFd'); selecting the pre-rename 'dfs(Dk)'/'dfs(Fd)'
# would raise KeyError.
gp_col_order = ['nameKey', 'year', 'month', 'day', 'rk', 'gcar', 'gtm', 'team',
                'away', 'opp', 'result', 'teamRuns', 'oppRuns', 'decision',
                'winsAfter', 'lossesAfter', 'inngs', 'dr', 'ip', 'h', 'r', 'er',
                'bb', 'so', 'hr', 'hbp', 'era', 'bf', 'pit', 'str', 'stl', 'sts',
                'gb', 'fb', 'ld', 'pu', 'unk', 'gsc', 'ir', 'is', 'sb', 'cs', 
                'po', 'ab', 'double', 'triple', 'ibb', 'gdp', 'sf', 'roe', 'ali', 
                'wpa', 're24', 'dfsDk', 'dfsFd', 'entered', 'exited']
games_pitching = games_pitching[gp_col_order]
# NOTE(review): no if_exists is passed, so pandas' default ('fail') applies
# and this raises if 'games_pitching_new' already exists -- confirm for re-runs.
games_pitching.to_sql('games_pitching_new', con, index=False)