In [1]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [2]:
import os
import glob

def find_csv_files(root_dir):
    """Recursively collect the paths of files whose names end in 'test.csv'.

    Parameters
    ----------
    root_dir : str
        Directory to search; all of its subdirectories are searched too.

    Returns
    -------
    list of str
        Matching file paths, each joined onto the directory it was found in,
        in the order the directories are visited by ``os.walk``.
    """
    # This will hold all the paths to the files
    csv_files = []

    # Walk through all the directories and subdirectories
    for dirpath, _dirnames, _filenames in os.walk(root_dir):
        # Escape the directory part so glob metacharacters in a folder name
        # (e.g. 'run[1]') are matched literally; only the filename pattern
        # is meant to act as a wildcard.
        pattern = os.path.join(glob.escape(dirpath), '*test.csv')
        csv_files.extend(glob.glob(pattern))

    return csv_files

# Root folder that is searched recursively for '*test.csv' files; change this
# path to point at a different dataset export.
root_directory = 'data/RaceMultiOutputModelRandomized/'
files = find_csv_files(root_directory)
print(files)

['data/RaceMultiOutputModelRandomized/position/Lap7/1copies_X_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap7/1copies_y_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap9/1copies_X_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap9/1copies_y_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap8/1copies_X_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap8/1copies_y_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap6/1copies_X_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap6/1copies_y_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap1/1copies_X_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap1/1copies_y_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap1/3copies_X_test.csv', 'data/RaceMultiOutputModelRandomized/position/Lap1/3copies_y_test.csv', 'data/RaceMultiOutputModelRandomized/position/Laps0to10/3copies_X_test.csv', 'data/RaceMultiOutputModelRandomized/position/Laps0to10/3c

In [3]:
# Gather the 'copy_id' column of every single-copy ('1copies') test file into
# one frame, using the file path as the column name.
# NOTE(review): assigning a Series as a column aligns on the frame's index, so
# files with different row counts would be truncated or NaN-padded — confirm
# that all '1copies' files have the same length.
race_ids = pd.DataFrame()
for file in files:
    if '1copies' in file:
        race_ids[file] = pd.read_csv(file)['copy_id']

In [4]:
def model(X):
    """Baseline predictor: for each row of `X`, return the column index of its smallest value."""
    per_row_values = X.values
    return per_row_values.argmin(axis=1)

def process_Y(y):
    """Reduce the target frame `y` to one label per row: the column index of the row minimum."""
    targets = y.values
    return targets.argmin(axis=1)

# Baseline Model -- argmin lap n
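Use the positions at lap n as the prediction of the finishing position. Only first place is predicted: the entry with the smallest position value at lap n (the `argmin`) is taken as the predicted winner.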

In [5]:
def calculate_fp_accuracy(X, y):
    """Return the fraction of rows where `model(X)` equals `process_Y(y)`."""
    is_correct = model(X) == process_Y(y)
    return np.mean(is_correct)

In [6]:
def _read_without_copy_id(path_template, lap):
    """Read the CSV at `path_template` formatted with `lap`, minus its 'copy_id' column."""
    return pd.read_csv(path_template.format(lap)).drop(columns=['copy_id'])


# Baseline accuracy on the train / valid / test splits, one row per lap (Lap1..Lap10).
fp_accuracy = pd.DataFrame(columns=['lap','train','valid','test'])
for lap_data in map('Lap{}'.format, range(1, 11)):

    X_train = _read_without_copy_id('data/RaceMultiOutputModelRandomized/position/{}/1copies_X_train.csv', lap_data)
    y_train = _read_without_copy_id('data/RaceMultiOutputModelRandomized/position/{}/1copies_y_train.csv', lap_data)
    X_valid = _read_without_copy_id('data/RaceMultiOutputModelRandomized/position/{}/1copies_X_valid.csv', lap_data)
    y_valid = _read_without_copy_id('data/RaceMultiOutputModelRandomized/position/{}/1copies_y_valid.csv', lap_data)
    X_test = _read_without_copy_id('data/RaceMultiOutputModelRandomized/position/{}/1copies_X_test.csv', lap_data)
    y_test = _read_without_copy_id('data/RaceMultiOutputModelRandomized/position/{}/1copies_y_test.csv', lap_data)

    train_accuracy = calculate_fp_accuracy(X_train, y_train)
    valid_accuracy = calculate_fp_accuracy(X_valid, y_valid)
    test_accuracy = calculate_fp_accuracy(X_test, y_test)

    # Append this lap's results as the next row of the summary table.
    next_row = len(fp_accuracy)
    fp_accuracy.loc[next_row] = [lap_data, train_accuracy, valid_accuracy, test_accuracy]
fp_accuracy

Unnamed: 0,lap,train,valid,test
0,Lap1,0.563725,0.470588,0.509804
1,Lap2,0.552826,0.54902,0.54902
2,Lap3,0.555283,0.568627,0.568627
3,Lap4,0.55774,0.568627,0.588235
4,Lap5,0.577396,0.568627,0.607843
5,Lap6,0.577396,0.588235,0.607843
6,Lap7,0.570025,0.588235,0.607843
7,Lap8,0.567568,0.588235,0.607843
8,Lap9,0.574939,0.568627,0.607843
9,Lap10,0.58231,0.588235,0.588235


# Other

In [66]:
def top_n_accuracy(y_true, y_pred, n=3):
    """Calculate cumulative top-1 .. top-`n` accuracy of the predictions.

    For every race (row of `y_true`) the indices of the smallest, second
    smallest, ... `n`-th smallest values are found, and the prediction is
    credited to the first of those ranks it matches.

    Parameters
    ----------
    y_true : sequence of 1-D arrays (e.g. a 2-D ndarray)
        One row of values per race; smaller values rank first.
    y_pred : sequence of int
        Predicted index for each race, in the same order as `y_true`.
    n : int, default 3
        Number of ranks to evaluate.

    Returns
    -------
    tuple of float
        `n` values; element ``k`` is the fraction of races whose prediction
        is among the ``k + 1`` smallest entries. With the default ``n=3``
        this is ``(exact, within one rank, within two ranks)``.

    Raises
    ------
    ValueError
        If `n` is below 1, if `y_true` and `y_pred` differ in length, or if
        they are empty (accuracy would be undefined).
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have the same length")
    total = len(y_pred)
    if total == 0:
        raise ValueError("cannot compute accuracy of empty predictions")

    # hits_at_rank[k] counts predictions equal to the (k + 1)-th smallest entry.
    hits_at_rank = [0] * n
    for race, pred in zip(y_true, y_pred):
        for rank in range(n):
            # argpartition places the element of sorted position `rank` at
            # index `rank`, which is all that is needed here.
            if pred == np.argpartition(race, rank)[rank]:
                hits_at_rank[rank] += 1
                break  # credit each prediction to its best rank only

    print(*hits_at_rank)

    # Turn the per-rank hit counts into cumulative accuracies.
    accuracies = []
    running_hits = 0
    for hits in hits_at_rank:
        running_hits += hits
        accuracies.append(running_hits / total)
    return tuple(accuracies)

In [67]:
# NOTE(review): `preds` is not defined in any cell visible here, and `y_train`
# is presumably whatever an earlier cell last assigned — confirm both exist
# after Restart & Run All before relying on this result.
top_n_accuracy(y_train.values, preds, n=3)

235 67 20


(0.5773955773955773, 0.742014742014742, 0.7911547911547911)