In [1]:
import pandas as pd

train_df = pd.read_csv('train.csv')
print(train_df.head())
print(train_df.info())
print(train_df.describe(include='all'))


   Unique ID  Rider_ID category_x  Circuit_Length_km  Laps  Grid_Position  \
0    1894944      2659      Moto2              4.874    22             17   
1      23438      5205      Moto2              3.875    24              7   
2     939678      7392      Moto3              5.647    25              5   
3    1196312      7894      Moto3              4.810    19              3   
4    1033899      6163     MotoGP              5.809    25             21   

   Avg_Speed_kmh Track_Condition  Humidity_% Tire_Compound_Front  ... air  \
0         264.66             Wet          61                Hard  ...  23   
1         177.56             Wet          77                Soft  ...  12   
2         317.74             Dry          87                Soft  ...  22   
3         321.82             Wet          43                Soft  ...  23   
4         239.92             Wet          47                Hard  ...  22   

  ground  starts  finishes with_points  podiums  wins  min_year  max_year 

In [2]:
print(train_df.isna().sum())


Unique ID                               0
Rider_ID                                0
category_x                              0
Circuit_Length_km                       0
Laps                                    0
Grid_Position                           0
Avg_Speed_kmh                           0
Track_Condition                         0
Humidity_%                              0
Tire_Compound_Front                     0
Tire_Compound_Rear                      0
Penalty                            321292
Championship_Points                     0
Championship_Position                   0
Session                                 0
year_x                                  0
sequence                                0
rider                                   0
team                                    0
bike                                    0
position                                0
points                                  0
shortname                               0
circuit_name                      

In [3]:
print(train_df['Penalty'].dtype)
print(train_df['Penalty'].unique())


object
['+3s' '+5s' 'DNF' 'DNS' 'Ride Through' nan]


In [4]:
def map_penalty(x):
    if pd.isna(x):
        return 0
    if x == '+3s':
        return 3
    if x == '+5s':
        return 5
    if x == 'Ride Through':
        return 30  # example penalty seconds, adjust if you know
    if x == 'DNF':
        return 120  # big penalty, adjust as needed
    if x == 'DNS':
        return 120  # same as above
    return 0  # fallback

train_df['Penalty_seconds'] = train_df['Penalty'].apply(map_penalty)
print(train_df['Penalty_seconds'].value_counts())


Penalty_seconds
120    640053
0      321292
3      320314
30     316548
5      315849
Name: count, dtype: int64


In [5]:
train_df = train_df.drop(columns=['Penalty'])
cat_cols = train_df.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", cat_cols)

for col in cat_cols:
    print(f"{col}: {train_df[col].nunique()}")


Categorical columns: ['category_x', 'Track_Condition', 'Tire_Compound_Front', 'Tire_Compound_Rear', 'Session', 'shortname', 'circuit_name', 'rider_name', 'team_name', 'bike_name', 'weather', 'track']
category_x: 3
Track_Condition: 2
Tire_Compound_Front: 3
Tire_Compound_Rear: 3
Session: 7
shortname: 53
circuit_name: 70
rider_name: 2695
team_name: 967
bike_name: 301
weather: 5
track: 2


In [6]:
import pandas as pd

# One-hot encoding columns
one_hot_cols = ['category_x', 'Track_Condition', 'Tire_Compound_Front', 'Tire_Compound_Rear', 'Session', 'weather', 'track']

# Frequency encoding columns
freq_encode_cols = ['shortname', 'circuit_name', 'rider_name', 'team_name', 'bike_name']

for col in freq_encode_cols:
    freq = train_df[col].value_counts(normalize=True)
    train_df[col + '_freq_enc'] = train_df[col].map(freq)
    # If test dataset is available, apply same mapping there

# One-hot encoding
train_df = pd.get_dummies(train_df, columns=one_hot_cols, drop_first=True)

# Drop original categorical cols after encoding frequency columns
train_df = train_df.drop(columns=freq_encode_cols)

print(train_df.head())



   Unique ID  Rider_ID  Circuit_Length_km  Laps  Grid_Position  Avg_Speed_kmh  \
0    1894944      2659              4.874    22             17         264.66   
1      23438      5205              3.875    24              7         177.56   
2     939678      7392              5.647    25              5         317.74   
3    1196312      7894              4.810    19              3         321.82   
4    1033899      6163              5.809    25             21         239.92   

   Humidity_%  Championship_Points  Championship_Position  year_x  ...  \
0          61                  100                     20    2021  ...   
1          77                  137                      7    1977  ...   
2          87                   43                     17    1985  ...   
3          43                  234                     13    2007  ...   
4          47                  306                      8    2018  ...   

   Session_FP3  Session_FP4  Session_Qualifying  Session_Race  Sessi

In [7]:
print("Missing values per column:")
print(train_df.isnull().sum()[train_df.isnull().sum() > 0])

# Separate features and target
X = train_df.drop(columns=['Lap_Time_Seconds'])
y = train_df['Lap_Time_Seconds']

print(f"X shape: {X.shape}, y shape: {y.shape}")


Missing values per column:
Series([], dtype: int64)
X shape: (1914056, 55), y shape: (1914056,)


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")


Train shape: (1531244, 55), Validation shape: (382812, 55)


[0]	train-rmse:11.50726	eval-rmse:11.52086
[50]	train-rmse:10.75655	eval-rmse:10.79094
[100]	train-rmse:10.14289	eval-rmse:10.19488
[150]	train-rmse:9.55674	eval-rmse:9.62088
[200]	train-rmse:9.04597	eval-rmse:9.11835
[250]	train-rmse:8.54935	eval-rmse:8.63187
[300]	train-rmse:8.12052	eval-rmse:8.20803
[350]	train-rmse:7.72259	eval-rmse:7.81570
[400]	train-rmse:7.35204	eval-rmse:7.45254
[450]	train-rmse:7.03934	eval-rmse:7.13992
[500]	train-rmse:6.69205	eval-rmse:6.79669
[550]	train-rmse:6.37994	eval-rmse:6.48851
[600]	train-rmse:6.08161	eval-rmse:6.19169
[650]	train-rmse:5.80658	eval-rmse:5.92007
[700]	train-rmse:5.54479	eval-rmse:5.65944
[750]	train-rmse:5.30493	eval-rmse:5.41908
[800]	train-rmse:5.07654	eval-rmse:5.19239
[850]	train-rmse:4.85400	eval-rmse:4.97032
[900]	train-rmse:4.64205	eval-rmse:4.75756
[950]	train-rmse:4.46236	eval-rmse:4.57766
[999]	train-rmse:4.26923	eval-rmse:4.38383
Validation RMSE: 4.383828145927873


In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna

# Load data
train_df = pd.read_csv('train.csv')

# Map Penalty to seconds
def map_penalty(x):
    if pd.isna(x):
        return 0
    if x == '+3s':
        return 3
    if x == '+5s':
        return 5
    if x == 'Ride Through':
        return 30
    if x == 'DNF':
        return 120
    if x == 'DNS':
        return 120
    return 0

train_df['Penalty_seconds'] = train_df['Penalty'].apply(map_penalty)
train_df = train_df.drop(columns=['Penalty'])

# Frequency encode some categorical cols
freq_encode_cols = ['shortname', 'circuit_name', 'rider_name', 'team_name', 'bike_name']
for col in freq_encode_cols:
    freq = train_df[col].value_counts(normalize=True)
    train_df[col + '_freq_enc'] = train_df[col].map(freq)

# One-hot encode others
one_hot_cols = ['category_x', 'Track_Condition', 'Tire_Compound_Front', 'Tire_Compound_Rear', 'Session', 'weather', 'track']
train_df = pd.get_dummies(train_df, columns=one_hot_cols, drop_first=True)

# Drop original freq-encoded columns
train_df = train_df.drop(columns=freq_encode_cols)

# Features and target
X = train_df.drop(columns=['Lap_Time_Seconds'])
y = train_df['Lap_Time_Seconds']

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Objective function for Optuna
def objective(trial):
    param = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'seed': 42,
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'eta': trial.suggest_float('eta', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'verbosity': 0,
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        param,
        dtrain,
        num_boost_round=1000,
        early_stopping_rounds=20,
        evals=[(dtrain, 'train'), (dval, 'eval')],
        verbose_eval=False
    )

    preds = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

# Create study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("Best hyperparameters:", study.best_trial.params)
print(f"Best validation RMSE: {study.best_value}")


[I 2025-06-14 13:24:39,858] A new study created in memory with name: no-name-b72e5f25-aa94-470f-aa74-ff6d959fef88
[I 2025-06-14 13:25:49,300] Trial 0 finished with value: 0.28371409710662565 and parameters: {'max_depth': 9, 'eta': 0.2842813616941555, 'subsample': 0.9291285763905526, 'colsample_bytree': 0.5531404866256848, 'min_child_weight': 3, 'gamma': 0.345066281227967}. Best is trial 0 with value: 0.28371409710662565.
[I 2025-06-14 13:27:00,667] Trial 1 finished with value: 8.355050172879432 and parameters: {'max_depth': 8, 'eta': 0.02835053422276664, 'subsample': 0.7275319510621503, 'colsample_bytree': 0.5708619266465813, 'min_child_weight': 1, 'gamma': 4.182907434568077}. Best is trial 0 with value: 0.28371409710662565.
[I 2025-06-14 13:27:56,972] Trial 2 finished with value: 7.30903041413028 and parameters: {'max_depth': 6, 'eta': 0.13469766244426018, 'subsample': 0.5146142881369156, 'colsample_bytree': 0.6444322272645304, 'min_child_weight': 3, 'gamma': 0.4856035753993815}. Best

Best hyperparameters: {'max_depth': 9, 'eta': 0.2842813616941555, 'subsample': 0.9291285763905526, 'colsample_bytree': 0.5531404866256848, 'min_child_weight': 3, 'gamma': 0.345066281227967}
Best validation RMSE: 0.28371409710662565


In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb

# Reload train and test
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# --- 1. Preprocessing (same as training pipeline) ---

def map_penalty(x):
    if pd.isna(x):
        return 0
    if x == '+3s':
        return 3
    if x == '+5s':
        return 5
    if x == 'Ride Through':
        return 30
    if x == 'DNF':
        return 120
    if x == 'DNS':
        return 120
    return 0

# Penalty
train_df['Penalty_seconds'] = train_df['Penalty'].apply(map_penalty)
test_df['Penalty_seconds'] = test_df['Penalty'].apply(map_penalty)
train_df = train_df.drop(columns=['Penalty'])
test_df = test_df.drop(columns=['Penalty'])

# Frequency encode
freq_encode_cols = ['shortname', 'circuit_name', 'rider_name', 'team_name', 'bike_name']
for col in freq_encode_cols:
    freq = train_df[col].value_counts(normalize=True)
    train_df[col + '_freq_enc'] = train_df[col].map(freq)
    test_df[col + '_freq_enc'] = test_df[col].map(freq).fillna(0)

train_df = train_df.drop(columns=freq_encode_cols)
test_df = test_df.drop(columns=freq_encode_cols)

# One-hot encode
one_hot_cols = ['category_x', 'Track_Condition', 'Tire_Compound_Front', 'Tire_Compound_Rear', 'Session', 'weather', 'track']
train_df = pd.get_dummies(train_df, columns=one_hot_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=one_hot_cols, drop_first=True)

# Match test columns to train
missing_cols = set(train_df.columns) - set(test_df.columns) - {'Lap_Time_Seconds'}
for col in missing_cols:
    test_df[col] = 0
test_df = test_df[[c for c in train_df.columns if c != 'Lap_Time_Seconds']]

# --- 2. Final Train/Test Split ---

X_full = train_df.drop(columns=['Lap_Time_Seconds'])
y_full = train_df['Lap_Time_Seconds']
X_test = test_df

# --- 3. Train Final Model ---

final_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'seed': 42,
    'max_depth': 9,
    'eta': 0.2842813616941555,
    'subsample': 0.9291285763905526,
    'colsample_bytree': 0.5531404866256848,
    'min_child_weight': 3,
    'gamma': 0.345066281227967,
    'verbosity': 0
}

dfull = xgb.DMatrix(X_full, label=y_full)
dtest = xgb.DMatrix(X_test)

final_model = xgb.train(final_params, dfull, num_boost_round=1000)

# --- 4. Predict and Save ---
test_preds = final_model.predict(dtest)

submission = pd.DataFrame({
    "Unique ID": test_df["Unique ID"],
    "Lap_Time_Seconds": test_preds
})

submission.to_csv("solution4.csv", index=False)
print("✅ solution4.csv created successfully!")


✅ solution4.csv created successfully!
