In [1]:
import numpy as np
import pandas as pd

### Download and Load the Raw Races Dataset

In [2]:
# Location of the raw results file; it is fetched over HTTP each time this cell runs.
url = "http://personal.psu.edu/drh20/code/btmatlab/nascar2002.txt"
# Parse the file as single-space-delimited text into a DataFrame.
# Later cells read the "DriverID" and "Race" columns from this frame.
races_trimmed = pd.read_table(url, sep=" ")

### Create the Complete Dataset

- Most drivers do not compete in every race.
- We need to create a dataset with every combination of drivers and races so that we can take pairwise differences consistently.

In [4]:
# Build the cartesian product of unique drivers and races so that every
# (driver, race) pair has a row, even when the driver did not enter the race.
# Missing ids are dropped *before* sorting: NaN compares False against every
# value, so sorting a sequence that still contains it gives an unreliable order.
unq_drivers = sorted(races_trimmed["DriverID"].dropna().unique())
unq_races = sorted(races_trimmed["Race"].dropna().unique())

# One single-column frame per key
unq_drivers_df = pd.DataFrame(unq_drivers, columns=["DriverID"])
unq_races_df = pd.DataFrame(unq_races, columns=["Race"])

# Cross join through a constant helper key: every driver row matches every
# race row on `foo`, after which the helper column is discarded.
# `drop(columns=...)` is used because the positional `axis` argument of
# DataFrame.drop is no longer accepted by pandas 2.x.
all_driver_race_combs = (
    unq_drivers_df.assign(foo=1)
    .merge(unq_races_df.assign(foo=1), on="foo")
    .drop(columns="foo")
)
all_driver_race_combs = all_driver_race_combs.sort_values(by=["Race", "DriverID"])

# Left join the observed results onto the full grid; (driver, race) pairs with
# no recorded result keep their row and get NaN in the result columns.
all_driver_race_mg = pd.merge(
    all_driver_race_combs,
    races_trimmed,
    how="left",
    on=["DriverID", "Race"],
)

### For a single race calculate the pairwise differences

In [42]:
def pawise_ind(val):
    """Turn a single pairwise difference into a 0/1 indicator.

    NaN is passed through unchanged; any other value maps to 1 when it is
    negative and to 0 otherwise.
    """
    if not np.isnan(val):
        # Multiplying the boolean by 1 turns it into an integer flag.
        return (val < 0) * 1
    return val

def difference_matrix(a):
    """Return the matrix of all pairwise differences of a 1D sequence.

    Entry ``[i, j]`` of the result equals ``a[i] - a[j]``, so the output is
    square with one row and one column per element of ``a``.

    Source: https://stackoverflow.com/questions/22863547/numpy-compute-all-possible-differences-in-an-array-at-fixed-distance
    """
    n_items = len(a)
    # Column vector of the inputs; broadcasting it against its own transpose
    # (a row vector) produces every pairwise combination.
    column = np.reshape(a, (n_items, 1))
    row = np.transpose(column)
    return column - row

def get_single_race_pwise(race_num, src_dat):
    """Get all pairwise result differences between drivers for a single race.

    Parameters
    ----------
    race_num : scalar
        Value of the "Race" column identifying the race to extract.
    src_dat : pandas.DataFrame
        Frame with "Race" and "DriverID" columns. The quantity that is
        differenced is taken from the column at position 2.
        NOTE(review): this presumably is the first result column that follows
        the DriverID/Race keys -- confirm against the frame's column order.

    Returns
    -------
    numpy.ndarray
        Square matrix whose ``[i, j]`` entry is the value for driver ``i``
        minus the value for driver ``j``, with drivers ordered by ascending
        "DriverID". The raw differences are returned; no indicator
        transformation is applied.
    """
    # Filter the frame that was actually passed in (not a notebook global), so
    # the function gives the right answer for whatever data the caller supplies.
    race_res = src_dat[src_dat["Race"] == race_num]
    # Sorting by driver keeps row/column positions consistent across races.
    race_res = race_res.sort_values(by=['DriverID'])
    race_res_val = race_res.values[:, 2]

    return difference_matrix(race_res_val)

def get_all_pwise_comb(src_dat):
    """Stack the per-race pairwise difference matrices for every race.

    The distinct values of the "Race" column are visited in ascending order,
    one matrix is computed per race, and the matrices are stacked depth-wise
    so the last axis of the result indexes the race.
    """
    race_ids = list(src_dat["Race"].unique())
    race_ids.sort()

    per_race = []
    for race_id in race_ids:
        per_race.append(get_single_race_pwise(race_num=race_id, src_dat=src_dat))
    return np.dstack(per_race)

## Get pairwise differences for all Drivers for a single race

In [49]:
# Pairwise difference matrix for the race whose "Race" value is 16.0, computed
# from the merged driver/race frame.
race_16_diffs = get_single_race_pwise(race_num=16.0, src_dat=all_driver_race_mg)
# Trailing expression: the cell displays the matrix dimensions.
race_16_diffs.shape

(83, 83)

## Get pairwise differences for all competitors and all races

In [47]:
# Pairwise difference matrices for every race in the merged frame, stacked
# along a third axis (one slice per race).
all_pwise = get_all_pwise_comb(src_dat=all_driver_race_mg)
# Trailing expression: the cell displays the stacked array's dimensions.
all_pwise.shape

(83, 83, 36)