In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

In [2]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
# Directory that holds the competition CSVs. Set the KAGGLE_DATA_DIR
# environment variable to point somewhere else; when it is unset the
# original local directory is used, so existing runs behave as before.
DATA_DIR = Path(
    os.environ.get(
        "KAGGLE_DATA_DIR",
        "/Users/shikhargoyal/Documents/Kaggle/playground-series-s5e5",
    )
)

# Labelled training rows and unlabelled test rows.
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [5]:
# Peek at the first five training rows.
train.head(5)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [6]:
# Percentage of missing values per column (null count / row count * 100).
train.isna().sum().div(train.shape[0]).mul(100)

id            0.0
Sex           0.0
Age           0.0
Height        0.0
Weight        0.0
Duration      0.0
Heart_Rate    0.0
Body_Temp     0.0
Calories      0.0
dtype: float64

In [13]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

# ------------------------
# Step 1: Separate features and target
# ------------------------
# `id` is a row identifier, not a measurement. Left in X it would be picked up
# as a numeric column, scaled and fed to the model, so it is dropped together
# with the target.
X = train.drop(columns=["id", "Calories"])
y = train["Calories"]

# ------------------------
# Step 2: Identify column types
# ------------------------
# Numeric columns are scaled; object/category columns are one-hot encoded.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# ------------------------
# Step 3: Preprocessing pipeline
# ------------------------
# Columns are selected by name, so only the columns listed above reach the model.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    # instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# ------------------------
# Step 4: Full pipeline
# ------------------------
# Preprocessing lives inside the pipeline so it is re-fit on each CV training
# fold rather than on the full data.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# ------------------------
# Step 5: Hyperparameter distributions
# ------------------------
# scipy's randint(low, high) samples integers from [low, high).
param_dist = {
    'regressor__n_estimators': randint(100, 300),
    'regressor__max_depth': randint(5, 20),
    'regressor__min_samples_split': randint(2, 10),
    'regressor__min_samples_leaf': randint(1, 5)
}

# ------------------------
# Step 6: RandomizedSearchCV
# ------------------------
# 20 sampled candidates x 5 folds = 100 fits. The recorded run shows single
# fits taking up to ~55 minutes, so expect this cell to be slow.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# ------------------------
# Step 7: Fit model
# ------------------------
random_search.fit(X, y)

# ------------------------
# Step 8: Results
# ------------------------
# The scorer is negated RMSE (higher is better), so flip the sign to report RMSE.
print("✅ Best Params:", random_search.best_params_)
print("🏆 Best RMSE (CV):", -random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END regressor__max_depth=11, regressor__min_samples_leaf=4, regressor__min_samples_split=6, regressor__n_estimators=114; total time= 5.0min
[CV] END regressor__max_depth=15, regressor__min_samples_leaf=4, regressor__min_samples_split=6, regressor__n_estimators=120; total time=40.9min
[CV] END regressor__max_depth=11, regressor__min_samples_leaf=4, regressor__min_samples_split=6, regressor__n_estimators=114; total time= 5.0min
[CV] END regressor__max_depth=15, regressor__min_samples_leaf=4, regressor__min_samples_split=6, regressor__n_estimators=120; total time=40.9min
[CV] END regressor__max_depth=11, regressor__min_samples_leaf=4, regressor__min_samples_split=6, regressor__n_estimators=114; total time= 5.1min
[CV] END regressor__max_depth=11, regressor__min_samples_leaf=2, regressor__min_samples_split=4, regressor__n_estimators=174; total time=54.8min
[CV] END regressor__max_depth=15, regressor__min_samples_leaf=4, reg

In [8]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform, randint

# ------------------------
# Step 1: Separate features and target
# ------------------------
# `id` is a row identifier, not a measurement. Left in X it would be picked up
# as a numeric column, scaled and fed to the model, so it is dropped together
# with the target.
X = train.drop(columns=["id", "Calories"])
y = train["Calories"]

# ------------------------
# Step 2: Identify column types
# ------------------------
# Numeric columns are scaled; object/category columns are one-hot encoded.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# ------------------------
# Step 3: Preprocessing pipeline
# ------------------------
# Columns are selected by name, so only the columns listed above reach the model.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    # instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# ------------------------
# Step 4: Full pipeline
# ------------------------
# Preprocessing lives inside the pipeline so it is re-fit on each CV training
# fold rather than on the full data.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# ------------------------
# Step 5: Hyperparameter distributions for RandomizedSearchCV
# ------------------------
# scipy's randint(low, high) samples integers from [low, high);
# uniform(loc, scale) samples floats from [loc, loc + scale].
param_dist = {
    'regressor__n_estimators': randint(100, 300),
    'regressor__learning_rate': uniform(0.01, 0.2),   # 0.01 .. 0.21
    'regressor__max_depth': randint(3, 8),
    'regressor__subsample': uniform(0.7, 0.3)         # 0.7 .. 1.0
}

# ------------------------
# Step 6: RandomizedSearchCV
# ------------------------
# 20 sampled candidates x 5 folds = 100 fits.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,  # You can reduce to 10 for even faster tuning
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# ------------------------
# Step 7: Fit model
# ------------------------
random_search.fit(X, y)

# ------------------------
# Step 8: Results
# ------------------------
# The scorer is negated RMSE (higher is better), so flip the sign to report RMSE.
print("✅ Best Params:", random_search.best_params_)
print("🏆 Best RMSE (CV):", -random_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END regressor__learning_rate=0.0849080237694725, regressor__max_depth=7, regressor__n_estimators=114, regressor__subsample=0.9195981825434215; total time= 6.2min
[CV] END regressor__learning_rate=0.15439975445336496, regressor__max_depth=4, regressor__n_estimators=291, regressor__subsample=0.9976634677873653; total time=10.0min
[CV] END regressor__learning_rate=0.13349630192554332, regressor__max_depth=4, regressor__n_estimators=121, regressor__subsample=0.7021198915659151; total time= 3.0min
[CV] END regressor__learning_rate=0.014612485008283152, regressor__max_depth=5, regressor__n_estimators=158, regressor__subsample=0.8199582915145766; total time= 5.5min
[CV] END regressor__learning_rate=0.16703519227860275, regressor__max_depth=5, regressor__n_estimators=207, regressor__subsample=0.8542703315240834; total time= 7.6min
[CV] END regressor__learning_rate=0.1284829137724085, regressor__max_depth=5, regressor__n_estimat

In [10]:
# Peek at the first five test rows (same columns as train, minus the target).
test.head(5)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5


In [14]:
# ------------------------
# Step 9: Predict on test data
# ------------------------

# Keep the ids so every prediction can be matched back to its test row.
test_ids = test["id"]

# NOTE(review): `random_search` is assigned by both the RandomForest and the
# GradientBoosting tuning cells, so this uses whichever of them ran last.
# Confirm the intended model before submitting.
#
# The full `test` frame is passed as-is: the fitted pipeline applies its own
# preprocessing, and its ColumnTransformer picks the columns it was fitted on
# by name.
predictions = random_search.best_estimator_.predict(test)

# ------------------------
# Step 10: Save predictions to CSV
# ------------------------
output_df = pd.DataFrame({
    "id": test_ids,
    "Calories": predictions
})

# Hold the destination in one variable so the confirmation message always
# reports the file that was actually written.
submission_path = "/Users/shikhargoyal/Documents/Kaggle/playground-series-s5e5/submissionrr.csv"
output_df.to_csv(submission_path, index=False)
print(f"✅ Predictions saved to {submission_path}")


✅ Predictions saved to submission.csv
