# Calculate log loss offline



## Formula

$$\text{LogLoss} = -\frac{1}{n} \sum_{i=1}^{n} \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right]$$


## Definitions
n is the number of games played

yhat is the predicted probability that team 1 beats team 2

y is 1 if team 1 wins, 0 if team 2 wins

log() is the natural (base e) logarithm

In [1]:
import numpy as np

def logloss(y,yhat,eps=1e-15):
    """
    Calculate the log-likelihood term of 1 prediction.

    Returns y*log(yhat) + (1-y)*log(1-yhat), which is always <= 0. The
    LogLoss score is the negated mean of this value over all games.

    Parameters
    ----------
    y : 1 if team 1 won, 0 if team 2 won.
    yhat : predicted probability that team 1 beats team 2.
    eps : yhat is clipped to [eps, 1-eps] before taking logs, so that a
        prediction of exactly 0 or 1 gives a large finite penalty instead
        of nan (0 * log(0)) or -inf.
    """
    yhat = np.clip(yhat, eps, 1 - eps)
    return y*np.log(yhat) + (1-y)*np.log(1-yhat)


In [2]:
# Sanity check: the more confident we are in a prediction that turns out to be
# right, the closer the value is to 0 (i.e. the lower the log loss) - Correct!
# print is called as a function so the cell runs on both Python 2 and Python 3.
print(logloss(1,0.5))
print(logloss(1,0.983))

-0.6931471805599453
-0.017146158834970514


In [3]:
## Stage 1

import pandas as pd
# Load the baseline submission (one row per matchup ID with its predicted probability).
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
sub = pd.read_csv(
    filepath_or_buffer='/Users/dtan/Code/kaggle-ncaa-madness-men/output/baseline-2018-03-11.csv'
)

In [4]:
# Preview the first five submission rows.
sub.head(n=5)

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.695652
1,2014_1107_1112,0.121212
2,2014_1107_1113,0.258065
3,2014_1107_1124,0.172043
4,2014_1107_1140,0.258065


In [5]:
# Load the historical tournament results (one row per game, winner and loser IDs).
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
results = pd.read_csv(
    filepath_or_buffer='/Users/dtan/Code/kaggle-ncaa-madness-men/data/stage1/DataFiles/NCAATourneyCompactResults.csv'
)

In [6]:
# Preview the first five tournament result rows.
results.head(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


## Test the accuracy of my logloss script by measuring logloss of Stage1 baseline prediction (1985-2013) on 2017's results

In [7]:
# Keep only the 2017 tournament games. read_csv parses Season as an integer,
# so compare against the integer 2017; comparing an integer column with the
# string '2017' depends on the pandas version/engine and may match nothing.
r17=results.query("Season==2017")
r17.head(3)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
2050,2017,134,1243,95,1448,88,N,0
2051,2017,134,1291,67,1309,66,N,0
2052,2017,135,1413,67,1300,63,N,0


In [38]:
# Keep only the 2017 matchups. IDs look like 'Season_Team1_Team2', so anchor
# the match at the start of the ID rather than searching anywhere inside it.
# On recent pandas versions the .query(..) form of this filter is:
#   sub.query("ID.str.startswith('2017_')", engine='python')

s17=sub[sub['ID'].str.startswith('2017_')]
s17.head(3)

Unnamed: 0,ID,Pred
6834,2017_1112_1116,0.660606
6835,2017_1112_1124,0.642857
6836,2017_1112_1137,0.807407


In [9]:
def parse_id1(string1):
    """
    Return the middle field of a 'Season_Team1_Team2' ID as an int.

    Raises ValueError if the ID does not split into exactly 3 fields.
    """
    _season, first_team, _second_team = string1.split('_')
    return int(first_team)

def parse_id2(string1):
    """
    Return the last field of a 'Season_Team1_Team2' ID as an int.

    Raises ValueError if the ID does not split into exactly 3 fields.
    """
    _season, _first_team, second_team = string1.split('_')
    return int(second_team)

In [39]:
# Split each ID into its two team ids.
# Despite the names, wid/lid are simply the first and second team in the ID;
# which of them actually won is worked out later from the results.

s17 = (
    s17
    .assign(wid=lambda frame: frame.ID.apply(parse_id1))
    .assign(lid=lambda frame: frame.ID.apply(parse_id2))
)
s17.head(3)

Unnamed: 0,ID,Pred,wid,lid
6834,2017_1112_1116,0.660606,1112,1116
6835,2017_1112_1124,0.642857,1112,1124
6836,2017_1112_1137,0.807407,1112,1137


In [12]:
def get_y(wid,lid,games=None):
    """
    Look up the outcome of the wid-vs-lid matchup in a results frame.

    Return 1 if wid beat lid -> exactly 1 row has WTeamID==wid and LTeamID==lid
    Return 0 if lid beat wid -> exactly 1 row has WTeamID==lid and LTeamID==wid
    Return -1 if the 2 teams never met

    Parameters
    ----------
    wid, lid : team ids (anything int() accepts).
    games : results frame with WTeamID and LTeamID columns; defaults to the
        notebook-level r17 frame.
    """
    if games is None:
        games = r17

    # Compare as integers: the id columns are numeric, and matching them
    # against quoted string literals is not reliable across pandas versions.
    wid, lid = int(wid), int(lid)

    def played_once(winner, loser):
        # True when exactly one game has this winner/loser pairing.
        mask = (games['WTeamID'] == winner) & (games['LTeamID'] == loser)
        return int(mask.sum()) == 1

    if played_once(wid, lid):
        return 1
    elif played_once(lid, wid):
        return 0
    else:
        return -1


In [40]:
# Preview the first three 2017 result rows.
r17.head(n=3)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
2050,2017,134,1243,95,1448,88,N,0
2051,2017,134,1291,67,1309,66,N,0
2052,2017,135,1413,67,1300,63,N,0


In [41]:
# Outcome for every 2017 matchup in the submission: 1, 0, or -1 (never played).
y = [get_y(first_team, second_team)
     for first_team, second_team in zip(s17['wid'], s17['lid'])]


In [42]:
# Attach the outcomes, then keep only the matchups that were actually played.
s17['y']=y
s17=s17.drop(s17[s17.y<0].index) # Drop all -1 (pairs that never met)

# Check that we have as many y as results
# print is called as a function so the cell runs on both Python 2 and Python 3.
print(s17.y.value_counts())
s17.head(3)

0    34
1    33
Name: y, dtype: int64


Unnamed: 0,ID,Pred,wid,lid,y
6869,2017_1112_1315,0.939655,1112,1315,1
6882,2017_1112_1388,0.797872,1112,1388,1
6900,2017_1112_1462,0.915254,1112,1462,0


## Calculating Total Log Loss

In [43]:
# Sum the per-game terms over every played 2017 matchup, then negate and
# average over the number of games to get the LogLoss score.
total_log_loss=0.0

for index,row in s17.iterrows():
    log_loss=logloss(row['y'],row['Pred'])
    total_log_loss+=log_loss

score=(-1.0/len(s17))*total_log_loss
# print is called as a function so the cell runs on both Python 2 and Python 3.
print('Final Log Loss Score={}'.format(score))

Final Log Loss Score=0.558911410387


In [None]:
## Looks accurate

## Calculating Log Loss of Baseline-2 model on SBNation's predictions