In [1]:
!pip install pygmo

ERROR: Could not find a version that satisfies the requirement pygmo
ERROR: No matching distribution found for pygmo


In [3]:
from pathlib import Path
import re

import numpy as np
import pandas as pd

try:
    import pygmo
except ImportError:
    pass

In [4]:
# specify the appropriate path
base = Path.home()

In [17]:
# useful for toggling display of data
DEBUG = 1
N_SIMULATIONS = 100
N_BEST_LINEUPS = 150

In [6]:
def fc_lineups_as_binary(df, mapping):
    """Converts dataframe of names to np.ndarray of binary values
       which correspond to indexes in the player projections dataframe"""
    idx = df.applymap(lambda x: mapping.get(x)).values
    return np.sum((np.arange(idx.max() + 1) == idx[...,None]-1).astype(int), axis=1)


def index_mapping(df, col='player'):
    """Creates dict of column_value: index value
       pandas returns dict of index to column value
       so we have to reverse it to get correct mapping
    """
    return {v: k for k, v in df.loc[:, [col]].to_dict()[col].items()}


def parse_fc_names(s):
    """Takes a string and returns a tuple of int, str"""
    patt = re.compile(r'(.*?)[(]{1}(\d+)[)]{1}')
    m = re.search(patt, s)
    name, pid = m.groups()
    return int(pid), name.strip()
    

def process_fc_lineups(fn):
    """Processes lineup file from fantasycruncher
    
    Args:
        fn (str): the full path of the lineup file
        
    Returns:
    
    """   
    # each cell in lineup file has names in format John Smith (111334)
    # we want to convert this to a str
    df = pd.read_csv(fn)
    return df.applymap(lambda x: parse_fc_names(x)[1])


In [7]:
# STEP ONE: load players
# has columns for player, mean, std
playerfn = base / 'mlbprojections.csv'
players = pd.read_csv(playerfn)

if DEBUG:
    display(players)

Unnamed: 0,player,mean,std
0,Mookie Betts,11.043333,8.069701
1,Bryce Harper,10.113333,7.758110
2,Corey Seager,9.800000,7.024382
3,Shohei Ohtani,9.163333,7.390410
4,Justin Turner,9.140000,7.079116
...,...,...,...
270,Casey Mize,11.466667,7.939764
271,Sam Hentges,9.730000,7.741987
272,Zach Davies,9.360000,7.942880
273,James Kaprielian,10.106667,7.697991


In [8]:
# STEP TWO: read in lineups
lineupfn = base / 'Crunch_1_20210512.csv'
lineups = process_fc_lineups(lineupfn)

if DEBUG:
    display(lineups)

Unnamed: 0,P,P.1,C,1B,2B,3B,SS,OF,OF.1,OF.2
0,Danny Duffy,Max Fried,Andrew Knapp,Rhys Hoskins,Marwin Gonzalez,Alec Bohm,Xander Bogaerts,J.D. Martinez,Bryce Harper,Andrew McCutchen
1,Gerrit Cole,Julio Urias,Manny Pina,Dan Vogelbach,Jose Altuve,Alex Bregman,Carlos Correa,Lorenzo Cain,Kyle Tucker,Chas McCormick
2,Gerrit Cole,Brandon Woodruff,Danny Jansen,Shohei Ohtani,Marcus Semien,Philip Gosselin,Jose Iglesias,Jared Walsh,Teoscar Hernandez,Taylor Ward
3,Gerrit Cole,Max Fried,Yasmani Grandal,Jose Abreu,Jean Segura,Yoan Moncada,Danny Mendick,Bryce Harper,Andrew McCutchen,Andrew Vaughn
4,Gerrit Cole,Julio Urias,Danny Jansen,Carlos Santana,Marcus Semien,Cavan Biggio,Bo Bichette,Andrew Benintendi,Jorge Soler,Lourdes Gurriel
...,...,...,...,...,...,...,...,...,...,...
1417,Gerrit Cole,Brandon Woodruff,Sean Murphy,Matt Olson,Jean Segura,Alec Bohm,Elvis Andrus,Bryce Harper,Mark Canha,Stephen Piscotty
1418,Gerrit Cole,Danny Duffy,Christian Vazquez,Rhys Hoskins,Nick Madrigal,Jon Berti,Xander Bogaerts,J.D. Martinez,Hunter Renfroe,Franchy Cordero
1419,Gerrit Cole,Brandon Woodruff,Christian Vazquez,Mitch Moreland,Jordy Mercer,Starlin Castro,Trea Turner,Juan Soto,Andrew McCutchen,Andrew Stevenson
1420,Gerrit Cole,Brandon Woodruff,Drew Butera,Jared Walsh,Marcus Semien,Philip Gosselin,Jose Iglesias,Bryce Harper,Juan Soto,Juan Lagares


In [9]:
# STEP THREE: drop players that aren't in a lineup
# looks like 275 in pool, 160 used in lineups
players_in_lineup = np.unique(lineups.values)
players_used = (
  players
  .loc[players['player'].isin(players_in_lineup), :]
  .reset_index(drop=True)
)

if DEBUG:
    display(players_used)

Unnamed: 0,player,mean,std
0,Mookie Betts,11.043333,8.069701
1,Bryce Harper,10.113333,7.758110
2,Corey Seager,9.800000,7.024382
3,Shohei Ohtani,9.163333,7.390410
4,Justin Turner,9.140000,7.079116
...,...,...,...
155,Eduardo Rodriguez,16.400000,8.117445
156,Hyun-Jin Ryu,16.210000,8.140971
157,Max Fried,15.590000,7.986163
158,Dallas Keuchel,12.170000,7.978417


In [10]:
# STEP FOUR: create name-index mapping
player_mapping = index_mapping(players_used, col='player')

if DEBUG:
    print(len(list(player_mapping.keys())))

160


In [11]:
# STEP FIVE: convert lineups to np.ndarray
# shape of lineup_ax_idx is (len(lineups), len(players_used))
# so in this example, it is (1422, 160)
lineup_matrix = fc_lineups_as_binary(lineups, player_mapping)

if DEBUG:
    print(lineup_matrix.shape)

(1422, 160)


In [13]:
# STEP SIX: get randomized projections
# projection ranges is np.ndarray of shape (n_simulations, len(players_used))
projection_ranges = np.random.normal(players_used['mean'], 
                                     players_used['std'], 
                                     size=(N_SIMULATIONS, len(players_used)))

if DEBUG:
    print(projection_ranges.shape)

(100, 160)


In [14]:
# STEP SEVEN: get lineup scores with randomized projections
lineup_scores = np.array([np.sum((lineup_matrix[i,:] * projection_ranges), axis=1)
                 for i in np.arange(lineup_matrix.shape[0])])

if DEBUG:
    print(lineup_scores.shape)

(1422, 100)


In [15]:
# STEP EIGHT: summarize lineups
lusumm = pd.DataFrame({'lu_mean': np.mean(lineup_scores, axis=1), 
                       'lu_75': np.percentile(lineup_scores, 75, axis=1),
                       'lu_max': np.max(lineup_scores, axis=1)})

if DEBUG:
    display(lusumm)

Unnamed: 0,lu_mean,lu_75,lu_max
0,101.111293,117.457761,156.233146
1,82.810097,97.764073,138.916748
2,92.829803,106.607806,157.546521
3,85.506175,97.620616,145.100660
4,81.384206,93.754038,122.483062
...,...,...,...
1417,96.318335,114.437308,144.365210
1418,81.951933,96.898150,138.649561
1419,87.666509,101.721442,125.790100
1420,90.791572,106.900639,162.812925


In [18]:
# STEP NINE: pick 150 best lineups
try:
    idx_best = pygmo.select_best_N_mo(0 - lusumm.values, 150)
except NameError:
    print('pygmo not installed - Display top lineups by average\n\n')
    idx_best = (-lusumm['lu_mean'].values).argsort()[:N_BEST_LINEUPS]
display(pd.concat([lineups.loc[idx_best], lusumm.loc[idx_best]], axis=1))

pygmo not installed - Display top lineups by average




Unnamed: 0,P,P.1,C,1B,2B,3B,SS,OF,OF.1,OF.2,lu_mean,lu_75,lu_max
41,Brandon Woodruff,Hyun-Jin Ryu,Andrew Knapp,Rhys Hoskins,Jean Segura,Rafael Devers,Didi Gregorius,Bryce Harper,Alex Verdugo,Franchy Cordero,113.341038,126.194040,172.796683
973,Brandon Woodruff,Zack Wheeler,Andrew Knapp,Jesus Aguilar,Isan Diaz,Hunter Dozier,Miguel Rojas,Corey Dickerson,Adam Duvall,Pavin Smith,111.618252,124.500923,162.354694
826,Brandon Woodruff,Julio Urias,Martin Maldonado,Rhys Hoskins,Jose Altuve,Alex Bregman,Carlos Correa,Teoscar Hernandez,Kyle Tucker,Franchy Cordero,111.303289,125.215223,166.889530
1194,Brandon Woodruff,Julio Urias,Martin Maldonado,Pavin Smith,Jean Segura,Alex Bregman,Carlos Correa,Yordan Alvarez,Kyle Tucker,Taylor Ward,111.027849,124.645240,166.587531
831,Brandon Woodruff,Danny Duffy,Stephen Vogt,Asdrubal Cabrera,Marwin Gonzalez,Rafael Devers,Tyler Wade,J.D. Martinez,Alex Verdugo,Franchy Cordero,110.109579,125.720168,178.940464
...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,Gerrit Cole,Brandon Woodruff,Drew Butera,Shohei Ohtani,Whit Merrifield,Alec Bohm,Jose Iglesias,Jared Walsh,Andrew McCutchen,Jon Jay,96.387023,110.413996,159.519287
1324,Gerrit Cole,Brandon Woodruff,Salvador Perez,Dan Vogelbach,Kolten Wong,Travis Shaw,Joshua Rojas,Juan Soto,Jackie Bradley Jr.,Avisail Garcia,96.379571,110.758078,152.284296
567,Gerrit Cole,Brandon Woodruff,Salvador Perez,Matt Olson,Jean Segura,Jeimer Candelario,Elvis Andrus,Ramon Laureano,Mark Canha,Stephen Piscotty,96.368911,112.601573,144.839180
886,Gerrit Cole,Brandon Woodruff,Jake Rogers,Asdrubal Cabrera,Isan Diaz,Rafael Devers,Xander Bogaerts,Alex Verdugo,Hunter Renfroe,Franchy Cordero,96.325434,108.777759,143.930713
