In [1]:
import pandas as pd
import os
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
import numpy as np 
import matplotlib

# Locate the project relative to where the notebook is running: the project
# root is taken to be the parent of the (resolved) current working directory.
cwd = os.getcwd()
project_dir = Path(cwd).resolve().parents[0]

# Both data directories are built as plain strings ending in a separator.
interim_data_dir, processed_data_dir = (
    os.path.join(project_dir, subdir)
    for subdir in ('data/interim/', 'data/processed/')
)

# Load the processed dataset that the following cells aggregate.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
main_data_path = Path(processed_data_dir) / 'main_data.pkl'
main_data = pd.read_pickle(main_data_path)

In [2]:
# Average 'Win' within each (spot, home) combination.
spot_home_keys = ['spot', 'home']
df = main_data.groupby(by=spot_home_keys)['Win'].mean()

In [3]:
# Display the grouped 'Win' means ordered from lowest to highest.
df.sort_values(ascending=True)

spot  home
9.0   1.0     0.386525
      0.0     0.389615
8.0   1.0     0.575659
      0.0     0.580675
7.0   1.0     0.617503
      0.0     0.619882
6.0   1.0     0.643859
      0.0     0.647692
5.0   0.0     0.668384
      1.0     0.669178
4.0   0.0     0.687553
      1.0     0.689345
2.0   0.0     0.695093
      1.0     0.696133
1.0   0.0     0.701226
3.0   0.0     0.702874
1.0   1.0      0.70522
3.0   1.0     0.706829
Name: Win, dtype: Float64

In [4]:
# Average 'Win' for each value of 'spot'.
spot_keys = ['spot']
df2 = main_data.groupby(by=spot_keys)['Win'].mean()

In [5]:
# Display the per-spot 'Win' means ordered from lowest to highest.
df2.sort_values(ascending=True)

spot
9.0    0.388087
8.0    0.578169
7.0    0.618693
6.0    0.645776
5.0    0.668781
4.0    0.688449
2.0    0.695613
1.0    0.703223
3.0    0.704851
Name: Win, dtype: Float64

In [6]:
# Average 'Win' for each value of 'home'.
home_keys = ['home']
df3 = main_data.groupby(by=home_keys)['Win'].mean()

In [7]:
# Display the per-'home' 'Win' means ordered from lowest to highest.
df3.sort_values(ascending=True)

home
0.0    0.633293
1.0     0.63357
Name: Win, dtype: Float64

In [12]:
# Average 'Win' for each ParkID.
park_keys = ['ParkID']
df4 = main_data.groupby(by=park_keys)['Win'].mean()

# Print via to_string() so every park is listed instead of a truncated repr.
parks_ranked = df4.sort_values(ascending=True)
print(parks_ranked.to_string())

ParkID
FTB01         0.5
HON01    0.518519
SYD01    0.527778
JER02     0.52809
OMA01    0.555556
HOU01    0.578609
SAN02    0.582759
NYC20    0.585174
LOS03     0.59416
WAS10    0.595894
MIA02    0.598803
TOK01         0.6
NYC17    0.600426
SFO02    0.601208
HOU02    0.602617
SEA01    0.603283
SFO03    0.605619
SAN01      0.6063
MIL06    0.607326
BAL11     0.60748
MON01    0.607686
SFO01      0.6102
MON02    0.611149
STL10    0.612578
CIN09    0.613791
SJU01    0.614155
WAS11    0.615474
MIA01    0.616139
KAN05    0.616162
ATL02    0.616292
PHI12    0.616326
CIN08    0.616552
ATL03    0.616861
PIT08    0.617956
LOS01    0.617963
PHI13    0.618156
OAK01    0.619766
STL09    0.620403
CLE07    0.622073
PIT07    0.623502
ATL01    0.624376
HOU03    0.625076
SEA03    0.625737
PHO01    0.626627
STP01    0.627982
NYC16    0.628152
CHI11    0.631758
MIL05    0.631929
LOS02    0.632082
ANA01    0.632129
CHI10    0.632449
MIN02    0.633666
NYC21    0.634583
ARL01    0.635176
BOS08    0.637754
DET

In [19]:
# For each (ParkID, spot, home) group: the mean of 'Win' and the number of
# non-null 'Win' values in the group.
park_spot_home_keys = ['ParkID', 'spot', 'home']
df6 = main_data.groupby(by=park_spot_home_keys)['Win'].agg(['mean', 'count'])

# Keep only groups with at least this many observations, then print the full
# table ordered by mean (to_string() avoids row truncation).
min_group_size = 300
well_sampled = df6.loc[df6['count'] >= min_group_size]
print(well_sampled.sort_values(by='mean').to_string())

                      mean  count
ParkID spot home                 
MIA02  9.0  1.0   0.202144    653
SAN02  9.0  1.0   0.202483   1289
PIT08  9.0  1.0    0.21014   1499
MIA01  9.0  1.0   0.216876   1434
ATL02  9.0  0.0   0.230576   1596
WAS10  9.0  1.0   0.230932    944
PHI13  9.0  1.0   0.231052   1359
SAN01  9.0  1.0   0.232095   2611
SFO03  9.0  1.0   0.233904   1693
MIL06  9.0  1.0   0.234463   1770
WAS11  9.0  0.0   0.236493    981
ATL02  9.0  1.0   0.236859   1579
PHI13  9.0  0.0   0.238129   1390
NYC20  9.0  0.0   0.238716    997
CIN08  9.0  1.0   0.240664   2410
SAN02  9.0  0.0   0.242193   1313
MON02  9.0  1.0   0.243137   2040
NYC20  9.0  1.0   0.244054    967
SFO03  9.0  0.0   0.247086   1716
MIA02  9.0  0.0   0.247734    662
MIA01  9.0  0.0    0.24983   1469
PHO01  9.0  0.0   0.251121   1784
STL10  9.0  0.0   0.251313   1142
LOS03  9.0  0.0   0.251667   5400
CIN09  9.0  1.0   0.252212   1356
WAS11  9.0  1.0   0.252826    973
PHI12  9.0  0.0   0.253626   2551
MIL06  9.0  0.