In [1]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [2]:
# prompt: connect to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#  Check TensorFlow version
print(tf.__version__)

# Check available devices
print("Available devices:", tf.config.list_physical_devices())

# Check if GPU is available
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

# with tf.device('/gpu:0')

2.14.0
Available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
Please install GPU version of TF


## Read all the data

In [4]:
def load_kaggle_data(path):
    data = pd.read_csv(path)
    data.replace({'\\N':np.nan}, inplace=True)
    return data

data_dir ='/content/drive/MyDrive/f1db_csv'

circuits_raw_df = load_kaggle_data(f'{data_dir}/circuits.csv')
drivers_raw_df = load_kaggle_data(f'{data_dir}/drivers.csv')
laptimes_raw_df = load_kaggle_data(f'{data_dir}/lap_times.csv')
pitstops_raw_df = load_kaggle_data(f'{data_dir}/pit_stops.csv')
results_raw_df = load_kaggle_data(f'{data_dir}/results.csv')

print('circuits_raw_df', circuits_raw_df.columns)
print('drivers_raw_df', drivers_raw_df.columns)
print('laptimes_raw_df', laptimes_raw_df.columns)
print('pitstops_raw_df', pitstops_raw_df.columns)
print('results_raw_df', results_raw_df.columns)

circuits_raw_df Index(['circuitId', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng',
       'alt', 'url'],
      dtype='object')
drivers_raw_df Index(['driverId', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob',
       'nationality', 'url'],
      dtype='object')
laptimes_raw_df Index(['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds'], dtype='object')
pitstops_raw_df Index(['raceId', 'driverId', 'stop', 'lap', 'time', 'duration',
       'milliseconds'],
      dtype='object')
results_raw_df Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'position', 'positionText', 'positionOrder', 'points', 'laps', 'time',
       'milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId'],
      dtype='object')


In [5]:
import pandas as pd

# Assuming you have your DataFrames: laptimes_raw_df and pitstops_raw_df

# Merge DataFrames on 'raceId' and 'driverId' with suffixes
df_test = pd.merge(laptimes_raw_df, pitstops_raw_df, on=['raceId', 'driverId'], how='inner', suffixes=('_laptimes', '_pitstops'))

# Display the result
print(df_test)


        raceId  driverId  lap_laptimes  position time_laptimes  \
0          841        20             1         1      1:38.109   
1          841        20             1         1      1:38.109   
2          841        20             2         1      1:33.006   
3          841        20             2         1      1:33.006   
4          841        20             3         1      1:32.713   
...        ...       ...           ...       ...           ...   
593876    1114       822             5        20      2:02.755   
593877    1114       822             6        19      1:46.852   
593878    1114       822             6        19      1:46.852   
593879    1114       822             7        20      2:38.277   
593880    1114       822             7        20      2:38.277   

        milliseconds_laptimes  stop  lap_pitstops time_pitstops duration  \
0                       98109     1            14      17:25:17   22.603   
1                       98109     2            36      

In [6]:
df_test = pd.merge(df_test, drivers_raw_df, on='driverId', how='left')

In [7]:
df_test = df_test.drop(['url','driverRef','forename','surname','nationality','dob','time_laptimes','time_pitstops'],axis=1)

In [8]:
df_test.columns

Index(['raceId', 'driverId', 'lap_laptimes', 'position',
       'milliseconds_laptimes', 'stop', 'lap_pitstops', 'duration',
       'milliseconds_pitstops', 'number', 'code'],
      dtype='object')

In [9]:
df_test.shape

(593881, 11)

In [10]:
# Get driver name data
drivers = drivers_raw_df[['driverId','forename','surname','number']].reset_index(drop=True)
drivers['Name'] = drivers[['forename', 'surname']].agg(' '.join, axis=1)
drivers.drop(['forename','surname'], axis=1, inplace=True)

driver_name_id_map = dict(zip(drivers['Name'], drivers['driverId']))
def get_driverID(name):

    return drivers.loc[drivers['Name']==name, 'driverId'].squeeze()

driver_name_number_map = dict(zip(drivers['Name'], drivers['number']))
def get_driver_number(name):

    return drivers.loc[drivers['Name']==name, 'number'].squeeze()


#drivers[drivers['Name'].str.contains('Nyck')]

driver_names_of_interest = ['Max Verstappen',
                           'Sergio Pérez',
                           'Lewis Hamilton',
                           'Fernando Alonso',
                           'Charles Leclerc',
                           'Lando Norris',
                           'George Russell',
                           'Oscar Piastri',
                           'Lance Stroll',
                           'Pierre Gasly',
                           'Esteban Ocon',
                           'Alexander Albon',
                           'Nico Hülkenberg',
                           'Valtteri Bottas',
                           'Guanyu Zhou',
                           'Yuki Tsunoda',
                           'Kevin Magnussen',
                           'Logan Sargeant',
                           'Nyck de Vries',
                           'Daniel Ricciardo']
driver_ids_of_interest = [driver_name_id_map[name] for name in driver_names_of_interest]
driver_numbers_of_interest = [driver_name_number_map[name] for name in driver_names_of_interest]

In [11]:
driver_ids_of_interest

[830,
 815,
 1,
 4,
 844,
 846,
 847,
 857,
 840,
 842,
 839,
 848,
 807,
 822,
 855,
 852,
 825,
 858,
 856,
 817]

In [12]:
df_test = df_test[df_test['driverId'].isin(driver_ids_of_interest)]

In [13]:
df_test

Unnamed: 0,raceId,driverId,lap_laptimes,position,milliseconds_laptimes,stop,lap_pitstops,duration,milliseconds_pitstops,number,code
116,841,1,1,2,100573,1,16,23.227,23227,44,HAM
117,841,1,1,2,100573,2,36,23.199,23199,44,HAM
118,841,1,2,2,93774,1,16,23.227,23227,44,HAM
119,841,1,2,2,93774,2,36,23.199,23199,44,HAM
120,841,1,3,2,92900,1,16,23.227,23227,44,HAM
...,...,...,...,...,...,...,...,...,...,...,...
593876,1114,822,5,20,122755,2,6,53.512,53512,77,BOT
593877,1114,822,6,19,106852,1,1,55.802,55802,77,BOT
593878,1114,822,6,19,106852,2,6,53.512,53512,77,BOT
593879,1114,822,7,20,158277,1,1,55.802,55802,77,BOT


In [14]:
df_test['code'].unique()

array(['HAM', 'ALO', 'PER', 'RIC', 'HUL', 'BOT', 'MAG', 'VER', 'OCO',
       'STR', 'GAS', 'LEC', 'NOR', 'ALB', 'RUS', 'TSU', 'ZHO', 'DEV',
       'SAR', 'PIA'], dtype=object)

In [15]:
df_test.dtypes

raceId                    int64
driverId                  int64
lap_laptimes              int64
position                  int64
milliseconds_laptimes     int64
stop                      int64
lap_pitstops              int64
duration                 object
milliseconds_pitstops     int64
number                   object
code                     object
dtype: object

In [16]:
df_test

Unnamed: 0,raceId,driverId,lap_laptimes,position,milliseconds_laptimes,stop,lap_pitstops,duration,milliseconds_pitstops,number,code
116,841,1,1,2,100573,1,16,23.227,23227,44,HAM
117,841,1,1,2,100573,2,36,23.199,23199,44,HAM
118,841,1,2,2,93774,1,16,23.227,23227,44,HAM
119,841,1,2,2,93774,2,36,23.199,23199,44,HAM
120,841,1,3,2,92900,1,16,23.227,23227,44,HAM
...,...,...,...,...,...,...,...,...,...,...,...
593876,1114,822,5,20,122755,2,6,53.512,53512,77,BOT
593877,1114,822,6,19,106852,1,1,55.802,55802,77,BOT
593878,1114,822,6,19,106852,2,6,53.512,53512,77,BOT
593879,1114,822,7,20,158277,1,1,55.802,55802,77,BOT


In [17]:
# Assuming df is your DataFrame
# Find rows where 'duration' contains ':'
mask = df_test['duration'].str.contains(':')

# Invert the mask to get rows where ':' is NOT present
rows_to_keep = ~mask

# Keep only the rows where ':' is not present in the 'duration' column
df_test = df_test[rows_to_keep]

# Display the DataFrame without the rows containing ':'
# print(df_test)


In [18]:
# Assuming df is your DataFrame
# Rename the DataFrame to df_test
# df_test = df.copy()

# Convert the 'duration' column to numeric (float)
df_test['duration'] = pd.to_numeric(df_test['duration'], errors='coerce')

# Convert the 'duration' column to integers
df_test['duration'] = df_test['duration'].round().astype(int)

# Display the DataFrame after converting the 'duration' column to integers
# print(df_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['duration'] = pd.to_numeric(df_test['duration'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['duration'] = df_test['duration'].round().astype(int)


In [19]:
# Assuming df is your DataFrame
# Find the count of NaN values in each column
nan_count_per_column = df_test.isna().sum()

# Display the count of NaN values in each column
print("Count of NaN values in each column:")
print(nan_count_per_column)


Count of NaN values in each column:
raceId                   0
driverId                 0
lap_laptimes             0
position                 0
milliseconds_laptimes    0
stop                     0
lap_pitstops             0
duration                 0
milliseconds_pitstops    0
number                   0
code                     0
dtype: int64


In [20]:
pip install --upgrade tensorflow


Collecting tensorflow
  Downloading tensorflow-2.15.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow)
  Downloading tensorboard-2.15.1-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 kB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.16,>=2.15.0 (from tensorflow)
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m
Installing collected pac

In [21]:
best_params = {'name': 'sequential_13', 'layers': [{'module': 'keras.layers', 'class_name': 'InputLayer', 'config': {'batch_input_shape': (None, None, 1), 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'lstm_11_input'}, 'registered_name': None}, {'module': 'keras.layers', 'class_name': 'LSTM', 'config': {'name': 'lstm_11', 'trainable': True, 'dtype': 'float32', 'batch_input_shape': (None, None, 1), 'return_sequences': False, 'return_state': False, 'go_backwards': False, 'stateful': False, 'unroll': False, 'time_major': False, 'units': 50, 'activation': 'tanh', 'recurrent_activation': 'sigmoid', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'recurrent_initializer': {'module': 'keras.initializers', 'class_name': 'Orthogonal', 'config': {'gain': 1.0, 'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'unit_forget_bias': True, 'kernel_regularizer': None, 'recurrent_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'recurrent_constraint': None, 'bias_constraint': None, 'dropout': 0.0, 'recurrent_dropout': 0.0, 'implementation': 2}, 'registered_name': None, 'build_config': {'input_shape': (None, None, 1)}}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_11', 'trainable': True, 'dtype': 'float32', 'units': 1, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': (None, 50)}}]}

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Extract all relevant parameters
input_shape = best_params['layers'][0]['config']['batch_input_shape'][1:]
units = best_params['layers'][1]['config']['units']
activation = best_params['layers'][1]['config']['activation']
recurrent_activation = best_params['layers'][1]['config']['recurrent_activation']
use_bias = best_params['layers'][1]['config']['use_bias']
kernel_initializer = best_params['layers'][1]['config']['kernel_initializer']
recurrent_initializer = best_params['layers'][1]['config']['recurrent_initializer']
bias_initializer = best_params['layers'][1]['config']['bias_initializer']
unit_forget_bias = best_params['layers'][1]['config']['unit_forget_bias']
kernel_regularizer = best_params['layers'][1]['config']['kernel_regularizer']
recurrent_regularizer = best_params['layers'][1]['config']['recurrent_regularizer']
bias_regularizer = best_params['layers'][1]['config']['bias_regularizer']
activity_regularizer = best_params['layers'][1]['config']['activity_regularizer']
kernel_constraint = best_params['layers'][1]['config']['kernel_constraint']
recurrent_constraint = best_params['layers'][1]['config']['recurrent_constraint']
bias_constraint = best_params['layers'][1]['config']['bias_constraint']
dropout = best_params['layers'][1]['config']['dropout']
recurrent_dropout = best_params['layers'][1]['config']['recurrent_dropout']
implementation = best_params['layers'][1]['config']['implementation']
learning_rate = 0.01

# Create an LSTM model with all hyperparameters
def create_lstm_model(input_shape=input_shape, units=units, activation=activation,
                      recurrent_activation=recurrent_activation, use_bias=use_bias,
                      kernel_initializer=kernel_initializer, recurrent_initializer=recurrent_initializer,
                      bias_initializer=bias_initializer, unit_forget_bias=unit_forget_bias,
                      kernel_regularizer=kernel_regularizer, recurrent_regularizer=recurrent_regularizer,
                      bias_regularizer=bias_regularizer, activity_regularizer=activity_regularizer,
                      kernel_constraint=kernel_constraint, recurrent_constraint=recurrent_constraint,
                      bias_constraint=bias_constraint, dropout=dropout, recurrent_dropout=recurrent_dropout,
                      implementation=implementation, learning_rate=learning_rate):
    model = Sequential()
    model.add(LSTM(units, input_shape=input_shape, activation=activation,
                   recurrent_activation=recurrent_activation, use_bias=use_bias,
                   kernel_initializer=kernel_initializer, recurrent_initializer=recurrent_initializer,
                   bias_initializer=bias_initializer, unit_forget_bias=unit_forget_bias,
                   kernel_regularizer=kernel_regularizer, recurrent_regularizer=recurrent_regularizer,
                   bias_regularizer=bias_regularizer, activity_regularizer=activity_regularizer,
                   kernel_constraint=kernel_constraint, recurrent_constraint=recurrent_constraint,
                   bias_constraint=bias_constraint, dropout=dropout, recurrent_dropout=recurrent_dropout,
                   implementation=implementation))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=Adam(learning_rate), loss='mean_squared_error')
    return model

# Create the best LSTM model
best_model = create_lstm_model()

# Print the model summary to check its architecture
best_model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 50)                10400     
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 10451 (40.82 KB)
Trainable params: 10451 (40.82 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
# Your code using StandardScaler goes here

# Assuming df is your combined dataframe with relevant features
# Adjust features as needed

# Assuming df is your DataFrame
# Replace this with loading your actual DataFrame or creating it

# Define features and target variable
X = df_test.drop(['lap_pitstops','code','number'], axis=1)
y = df_test['lap_pitstops']



# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the data
scaler_pitstop = StandardScaler()
X_train_pitstop_scaled = scaler_pitstop.fit_transform(X_train)
X_test_pitstop_scaled = scaler_pitstop.transform(X_test)


In [None]:

# Train the best model on the entire dataset
best_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Print the Mean Squared Error on the test set
print("Mean Squared Error on Test Set:", mse)

Epoch 1/50


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
 817/6588 [==>...........................] - ETA: 37s - loss: 146.9957

In [None]:
# Evaluate the model
test_loss = best_model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")

Test Loss: 131.0720672607422


In [None]:
# Get predictions
y_pred = best_model.predict(X_test)

# y_pred is now an array of predictions corresponding to X_test


# Example: Print the first 10 actual vs predicted values
for i in range(10):
    print(f"Actual: {y_test.iloc[i]}, Predicted: {y_pred[i][0]}")


mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

# These metrics give you a numerical indication of the model's performance.

Actual: 36, Predicted: 39.24711990356445
Actual: 12, Predicted: 21.084184646606445
Actual: 11, Predicted: 20.19255256652832
Actual: 13, Predicted: 23.302003860473633
Actual: 50, Predicted: 31.778783798217773
Actual: 2, Predicted: 17.640230178833008
Actual: 40, Predicted: 41.97040557861328
Actual: 18, Predicted: 17.840721130371094
Actual: 26, Predicted: 35.07601547241211
Actual: 19, Predicted: 17.279935836791992
Mean Absolute Error: 9.018721703594126
Mean Squared Error: 131.07206400268183


##ROUGH WORK:

In [None]:
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
# Your code using StandardScaler goes here

# Assuming df is your combined dataframe with relevant features
# Adjust features as needed

# Assuming df is your DataFrame
# Replace this with loading your actual DataFrame or creating it

# Define features and target variable
X = df_test.drop(['lap_pitstops','code','number'], axis=1)
y = df_test['lap_pitstops']



# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the data
scaler_pitstop = StandardScaler()
X_train_pitstop_scaled = scaler_pitstop.fit_transform(X_train)
X_test_pitstop_scaled = scaler_pitstop.transform(X_test)


In [None]:
# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(8, 1)))  # 50 units, input_shape = (time_steps, features)
model.add(Dense(1))  # Output layer with 1 unit for regression

In [None]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')