In [2]:
# set up imports and load data
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import statsmodels.api as sm
import statsmodels.stats.api as sms
import scipy.stats as stats
import pandas as pd
import os
import sys
sys.path.append('..')
from constants import *

# Season/week being projected: stamped onto the salary rows below so they can
# be matched against the current-week rows of hf.
CUR_YEAR = 2015
CUR_WEEK = 9

# Per-player quantities that later cells model as a function of points per game.
quants = {'STD', 'SKEW', 'PZ'}

# Historical rows (rows with any missing value dropped) stacked on the current projections.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
hf = pd.concat([pd.read_pickle('../data/histdata').dropna(), pd.read_pickle('../data/curprojs')])

# Opponent is the third space-separated token of Matchup; 'NA' when Matchup is not a str.
hf['Opp'] = hf['Matchup'].apply(lambda m: m.split(' ')[2] if isinstance(m, str) else 'NA')

# Current salaries, keyed the same way as hf (PID / Week / Year).
sals = pd.read_csv('../data/DKSalariesCurrent.csv')
sals['PID'] = sals.apply(lambda r: generate_pid(r['Name'], r['Position']), axis=1)
sals['Week'] = CUR_WEEK
sals['Year'] = CUR_YEAR

# The left merge yields Salary_x (from hf) and Salary_y (from sals); keep the
# hf value where present and fall back to the salary file otherwise.
hf = hf.merge(sals[['PID', 'Week', 'Year', 'Salary']], on=['PID', 'Week', 'Year'], how='left')
hf['Salary_x'] = hf['Salary_x'].fillna(hf['Salary_y'])
# drop() returns a new frame; the result must be assigned or Salary_y survives.
hf = hf.drop('Salary_y', axis=1)
hf = hf.rename(columns={'Salary_x': 'Salary'})

pids = hf['PID'].unique()
pos = hf['Pos'].unique()

# Number of rows per player, computed in one pass instead of one scan per PID.
pid_counts = hf['PID'].value_counts()
nums = {pid: int(pid_counts.get(pid, 0)) for pid in pids}

In [3]:
# calc sample mean, skew and std for every player with 10+ games in our dataset
# PZ is the share of non-null games with exactly 0 points; MEAN / SKEW / STD are
# computed over the non-zero games only.
pf = pd.DataFrame(pids, columns=['PID'])
# one value_counts pass instead of scanning hf once per player
pf['N'] = pf['PID'].map(hf['PID'].value_counts())
pf['Pos'] = pf['PID'].apply(lambda r: r.split('_')[-1])
# .copy() so the columns added below land on an independent frame, not a slice
pf = pf[pf['N'] >= 10].copy()

stat_rows = []
for pid in pf['PID']:
    dat = hf.loc[hf['PID'] == pid, 'Points'].dropna().values
    if dat.size == 0:
        # no usable games at all: nothing can be estimated for this player
        stat_rows.append((np.nan, np.nan, np.nan, np.nan))
        continue
    pct_zero = 1 - np.count_nonzero(dat) * 1. / dat.size
    nonzero = dat[dat != 0]
    mean = nonzero.mean() if nonzero.size else np.nan
    skew = stats.skew(nonzero) if nonzero.size else np.nan
    # ddof=1 is the unbiased estimate; it needs at least two observations
    std = nonzero.std(ddof=1) if nonzero.size > 1 else np.nan
    stat_rows.append((pct_zero, mean, skew, std))

pf = pd.concat(
    [pf, pd.DataFrame(stat_rows, columns=['PZ', 'MEAN', 'SKEW', 'STD'], index=pf.index)],
    axis=1)

In [118]:
# fit and output functions that model skew, stdev, and pctzero dependent on ppg
# One CSV row per (position, quantity) holding the quadratic coefficients in
# descending power order, matching the header columns 2,1,0.
outp = ['Pos,Quantity,2,1,0\n']
for p in pos:
    # rows with a NaN statistic would turn the whole least-squares fit into NaN
    thispos = pf[pf['Pos'] == p].dropna(subset=['MEAN', 'STD', 'SKEW', 'PZ'])
    for quantity in ('STD', 'SKEW', 'PZ'):
        # polyfit always returns deg+1 coefficients, whereas poly1d.c drops
        # leading zeros and can therefore be shorter than the three fields
        # the format string expects.
        coeffs = np.polyfit(thispos['MEAN'], thispos[quantity], 2)
        outp.append('{},{},{},{},{}\n'.format(p, quantity, *coeffs))
with open('params.csv', 'w') as wr:
    wr.writelines(outp)

In [4]:
# Read the fitted coefficients back from params.csv rather than reusing them
# in memory: this allows manual adjustments to the file, which are necessary
# because some of the fits are really bad on the high/low sides.
# Each row becomes a poly1d that gives stdev, skew or pctzero as a function of ppg.
# upper_extr prevents a function from sloping back down (or up) after hitting an extremum.
funcs = dict()
ff = pd.read_csv('params.csv')
def upper_extr(poly, mn, mx):
    """Flatten a quadratic to the right of its extremum.

    poly   -- np.poly1d to wrap.
    mn, mx -- bounds of the x-range of interest.

    Returns `poly` itself when it is not a true quadratic (fewer than three
    coefficients or a zero leading coefficient) or when its extremum lies
    outside [mn, mx]. Otherwise returns a callable that takes a sequence of
    x values and returns an ndarray: poly(x) for x <= x_extr, and the constant
    poly(x_extr) for x > x_extr.
    """
    c = poly.c
    if len(c) != 3 or c[0] == 0:
        return poly
    # vertex of a*x^2 + b*x + c sits at -b / (2a)
    x_extr = -0.5 * c[1] / c[0]
    val_extr = poly(x_extr)
    if x_extr < mn or x_extr > mx:
        return poly

    def clamped(lis):
        # ndarray result, same as calling a poly1d on a sequence, so callers
        # can index it on both Python 2 and Python 3
        xs = np.asarray(lis, dtype=float)
        return np.where(xs > x_extr, val_extr, poly(xs))

    return clamped
    
# Build funcs[position][quantity]: the (possibly clamped) polynomial for each
# quantity, with the clamp range taken from the observed MEAN values.
for p in pos:
    funcs[p] = {}
    thispos = pf[pf['Pos'] == p]
    mn = thispos['MEAN'].min()
    mx = thispos['MEAN'].max()
    for q in quants:
        # one combined mask; chaining ff[mask1][mask2] applies a full-length
        # mask to an already filtered frame and relies on implicit reindexing
        row_mask = (ff['Pos'] == p) & (ff['Quantity'] == q)
        coeffs = ff.loc[row_mask, ['2', '1', '0']].values[0]
        funcs[p][q] = upper_extr(np.poly1d(coeffs), mn, mx)
        # disabled diagnostic plot of the fit against the per-player data:
        # print p, q
        # x = np.linspace(mn, mx, 200)
        # y = funcs[p][q](x)
        # plt.scatter(thispos['MEAN'], thispos[q])
        # plt.plot(x, y)
        # plt.show()



In [5]:
# this loads player point projections and raw correlations from file, and creates a big correl matrix
# players who appear later in the list get their skew adjusted downwards by the cholesky correl operation
projs = pd.read_csv(os.path.expanduser('~/Dropbox/DFS/data/contests/Week{}/playervals.csv'.format(CUR_WEEK)))
corrf = pd.read_csv(os.path.expanduser('~/Dropbox/DFS/data/players/pos_correls.csv'))
projs.columns = ['PID', 'points']

# current-week rows only, via one combined mask (chained hf[m1][m2] applies a
# full-length mask to an already filtered frame)
cur_rows = (hf['Week'] == CUR_WEEK) & (hf['Year'] == CUR_YEAR)
# .copy() so the columns added below are written to an independent frame
hf1 = hf[cur_rows].merge(projs, on=['PID'])[['PID', 'Pos', 'Team', 'Opp', 'points']].copy()

# adds each player's projected (from the function) stdev, skew and pz to the dataframe hf1
for q in quants:
    hf1[q] = [funcs[ppos][q]([ppts])[0] for ppos, ppts in zip(hf1['Pos'], hf1['points'])]

hf1 = hf1.merge(sals[['PID', 'Salary']], on=['PID'])
# projected points per salary dollar
hf1['PPD'] = hf1['points'] / hf1['Salary']
nplyrs = hf1.shape[0]
# creates big correlation matrix
# Symmetric nplyrs x nplyrs matrix with a unit diagonal. Off-diagonal entries are
# filled from corrf only for the player pairs matched below; all others stay 0.

# first-listed CORR per position pair, looked up once instead of filtering corrf per pair
pair_corr = corrf.drop_duplicates(subset='POS_PR').set_index('POS_PR')['CORR'].to_dict()
# plain arrays avoid building two row Series for every (i, j) pair;
# hf1 comes straight out of a merge, so positions coincide with its index labels
pos_of = hf1['Pos'].values
team_of = hf1['Team'].values
opp_of = hf1['Opp'].values

correls = np.eye(nplyrs)
for i in range(nplyrs):
    for j in range(i + 1, nplyrs):
        i_dst = pos_of[i] == 'DST'
        j_dst = pos_of[j] == 'DST'
        # same team, different positions, and neither is DST
        teammates = (team_of[i] == team_of[j] and pos_of[i] != pos_of[j]
                     and not i_dst and not j_dst)
        # opponents where exactly one of the two is a DST
        dst_vs_opp = team_of[i] == opp_of[j] and (i_dst != j_dst)
        if teammates or dst_vs_opp:
            correls[i, j] = pair_corr[pos_of[i] + '_' + pos_of[j]]
            correls[j, i] = correls[i, j]
#np.savetxt(os.path.expanduser('~/Dropbox/DFS/data/players/correls.csv'), correls, delimiter=',', fmt='%.4f')
# does monte carlo sample from skewed normal, and induces correlation using cholesky decomposition
import skew_normal as sn
import scipy.linalg as linalg

mc_num = 100000
samples = []
player_idx = {}
for i, row in hf1.iterrows():
    # standardised draw; skew is limited to +/-0.98
    sample = sn.random_skewnormal(mean=0., stdev=1.,
                                  skew=max(-0.98, min(0.98, row['SKEW'])), size=(mc_num))
    # PZ comes from a fitted polynomial and can fall outside [0, 1]; clamp both
    # sides so the Bernoulli probability stays valid (a NaN PZ still raises).
    pct_zero = min(max(row['PZ'], 0.), 1.)
    bern = np.random.binomial(1, 1 - pct_zero, size=(mc_num))
    # NOTE(review): a zeroed draw is rescaled around 'points' below, so it does
    # not end up at 0 fantasy points -- confirm this is the intended model.
    samples.append(np.multiply(sample, bern).reshape(-1, 1))
    player_idx[row['PID']] = i
indep = np.hstack(samples)

# default cholesky() returns the upper factor U with correls = U^T U, so
# right-multiplying independent columns by U induces the target correlation
cholesky = linalg.cholesky(correls)
induc = np.dot(indep, cholesky)
# rescale every player's column to its projected stdev and mean, in place
induc *= hf1['STD'].values
induc += hf1['points'].values

In [None]:
# this segment randomly samples rosters. it's a terrible idea, the space is just too big
# The whole sampler is wrapped in a string literal, so running this cell executes nothing.
# NOTE(review): the only assignment to `lineups` visible in this notebook is inside
# this disabled string, yet later cells read `lineups` -- confirm where it is defined.
"""
import random
cutoff = 1.
num_lineups = 100000
pool={}
ranges={}
for p in pos:
    tmp = hf1[hf1['Pos']==p].sort_values('PPD', ascending=False)
    pool[p] = tmp.sort_values('PPD', ascending=False).iloc[0:int(tmp.shape[0]*cutoff)]['PID']
    ranges[p] = xrange(pool[p].shape[0])
pos_lims = map(lambda l: dict(pr for pr in l),
               [POSITION_LIMITS_RB_MAX, POSITION_LIMITS_TE_MAX, POSITION_LIMITS_WR_MAX])
lineups = []
for i in xrange(num_lineups):
    salary = 100000
    while (salary > 50000):
        lineup = [0] * nplyrs
        lim = pos_lims[np.random.randint(3)]
        for p in lim:
            for idx in map(lambda n: player_idx[n], list(pool[p].iloc[random.sample(ranges[p], lim[p])])):
                lineup[idx] = 1
        salary = sum(map(lambda i: lineup[i] * hf1.loc[i]['Salary'], range(nplyrs)))
        ev = sum(map(lambda i: lineup[i] * hf1.loc[i]['points'], range(nplyrs)))
    lineups.append(lineup)
lineups = np.vstack(lineups)
"""

49200 124.306329718
hi
50000 120.844825432
hi
49600 113.210709806
hi
49700 110.514394889


In [None]:
# simulated total per (lineup, draw): rows are lineups, columns are Monte Carlo draws
outcomes = np.dot(lineups, induc.T)

In [13]:
# Expected value of each lineup: sum of projected points over its non-zero slots.
proj_points = hf1['points'].values  # looked up once rather than via hf1.loc per slot
evs = []
for i in range(lineups.shape[0]):
    chosen = np.flatnonzero(lineups[i])
    evs.append(sum(proj_points[j] for j in chosen))
# call form prints identically on Python 2 and also parses on Python 3
print(sorted(evs, reverse=True)[:100])

[131.25904533155193, 129.84304894927484, 128.2948112877877, 128.14444350966178, 127.81010943631179, 127.1237145064332, 127.05536856933766, 127.03348967086018, 126.81678828813347, 126.69021712293443, 126.4523754574931, 125.9534443427156, 125.90015564169273, 125.71736533015903, 125.65965381138446, 125.65910972226222, 125.63316350962459, 125.60375200015619, 125.58713757858214, 125.30557455828162, 125.26374888113764, 125.20804108620825, 124.92273701800482, 124.91482139373976, 124.82361966400205, 124.81779312378522, 124.64972557280463, 124.58954624110416, 124.52386250853645, 124.46145818806338, 124.32541509876179, 124.16078894935451, 124.15007696889795, 124.01862441970108, 123.98344819786072, 123.71716914665603, 123.65688697385363, 123.62101973426125, 123.60734086824642, 123.58846281100398, 123.46304028715657, 123.45867181243442, 123.27161047369682, 123.20639614129783, 123.12405293753544, 123.10385775107343, 123.05766249180431, 123.02486599333281, 122.92947672656419, 122.91921576080387, 122

In [14]:
# show hf1's column labels (a bare last expression is rendered as the cell output)
hf1.columns

Index([u'PID', u'Pos', u'Team', u'Opp', u'points', u'PZ', u'STD', u'SKEW',
       u'Salary', u'PPD'],
      dtype='object')