In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Set working directory to the project root (1 folder up).
# A bare os.chdir('..') is relative to wherever the kernel currently is, so
# re-running this cell would climb one more level each time and break the
# relative 'data/...' paths used below. Resolve the target once per kernel
# session and always chdir to that absolute path instead.
import os

if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = os.path.abspath('..')
os.chdir(_PROJECT_ROOT)

In [None]:
# Read the train and test CSV files from the data folder
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Report the (rows, columns) shape of each frame: train first, then test
print(df_train.shape, df_test.shape, sep='\n')

# Preview the first rows of the training frame (rendered as the cell output)
df_train.head()

In [None]:
# Count missing entries per column, for the training frame and the test frame
train_missing = df_train.isna().sum()
test_missing = df_test.isna().sum()

print(train_missing)

print(test_missing)

# Author's observation from the output above: no missing data


In [38]:
# Transform Sex to 0 and 1.
# Guarded so the cell can be re-run safely: mapping an already-encoded column
# a second time would turn every value into NaN, because 0 and 1 are not keys
# of the mapping.
sex_codes = {'male': 0, 'female': 1}
for frame in (df_train, df_test):
    if not pd.api.types.is_numeric_dtype(frame['Sex']):
        frame['Sex'] = frame['Sex'].map(sex_codes)

# Transform target (Calories) with log1p.
# The flag stored in DataFrame.attrs records that this particular frame has
# already been transformed, so a second run does not take the log of the log.
# A freshly loaded frame starts with empty attrs and is transformed as usual.
if not df_train.attrs.get('calories_log1p', False):
    df_train['Calories'] = np.log1p(df_train['Calories'])
    df_train.attrs['calories_log1p'] = True


In [None]:
# Show column statistics.
# Left as the cell's last expression (rather than wrapped in print) so the
# notebook renders the summary as a formatted table instead of plain text.
df_train.describe()


In [None]:
# Plot histograms for all numeric features
numeric_cols = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']

# Size the grid from the number of columns, so editing the list above can
# neither run past the last panel nor delete a panel that holds a plot.
n_cols = 3
n_rows = max(1, -(-len(numeric_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    sns.histplot(data=df_train, x=col, ax=ax, bins=50)
    ax.set_title(f'Distribution of {col}')

# Remove every panel that did not receive a histogram
for ax in axes[len(numeric_cols):]:
    ax.remove()

plt.tight_layout()
plt.show()



In [None]:
# Make a correlation matrix of all features.
# - Only numeric columns are used, so the cell also runs when Sex has not been
#   encoded yet (DataFrame.corr raises on string columns in recent pandas).
# - 'id' is left out: it is dropped from the model inputs further down and is
#   only used to label submission rows, so its correlations are not of interest.
corr_matrix = (
    df_train
    .select_dtypes(include='number')
    .drop(columns=['id'], errors='ignore')
    .corr()
)

# Plot the correlation matrix on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()



In [None]:
# Plot feature interactions using scatterplots

# Define pairs to plot as (x column, y column)
pairs = [
    ('Age', 'Duration'),
    ('Age', 'Heart_Rate'),
    ('Age', 'Body_Temp'),
    ('Age', 'Calories'),
    ('Duration', 'Heart_Rate'),
    ('Duration', 'Body_Temp'),
    ('Duration', 'Calories'),
    ('Heart_Rate', 'Body_Temp'),
    ('Heart_Rate', 'Calories'),
    ('Body_Temp', 'Calories'),
]

# Size the grid from the number of pairs, so editing the list above can
# neither run past the last panel nor delete a panel that holds a plot.
n_cols = 3
n_rows = max(1, -(-len(pairs) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

# Create scatterplots.
# The loop variables are deliberately not called x / y: in a notebook they stay
# in the global namespace, and the modelling cell uses `y` for the target.
for ax, (x_col, y_col) in zip(axes, pairs):
    sns.scatterplot(data=df_train, x=x_col, y=y_col, ax=ax)
    ax.set_title(f'{x_col} vs {y_col}')

# Remove every panel that did not receive a scatterplot
for ax in axes[len(pairs):]:
    ax.remove()

plt.tight_layout()
plt.show()





In [None]:
# Implement basic lightgbm model on the training data
from itertools import combinations

import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Reload the raw files so this cell always starts from untransformed data,
# regardless of what earlier cells did to df_train / df_test.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Transform Sex to 0 and 1
sex_codes = {'male': 0, 'female': 1}
df_train['Sex'] = df_train['Sex'].map(sex_codes)
df_test['Sex'] = df_test['Sex'].map(sex_codes)

# Transform target (Calories) with log1p; predictions must be inverted with
# expm1 before they are reported on the original scale.
df_train['Calories'] = np.log1p(df_train['Calories'])
# Record on the frame itself which scale the target is on.
df_train.attrs['calories_log1p'] = True

# Create column interactions between all numerical columns
numeric_cols = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

# One product feature per unordered pair of columns. combinations() yields the
# pairs in the same order as a nested i < j index loop, so the resulting column
# order is unchanged.
for col1, col2 in combinations(numeric_cols, 2):
    interaction_name = f'{col1}_{col2}_interaction'
    df_train[interaction_name] = df_train[col1] * df_train[col2]
    df_test[interaction_name] = df_test[col1] * df_test[col2]

# Prepare features and target
X = df_train.drop(columns=['Calories', 'id'])
y = df_train['Calories']

# Split data: 80% train / 20% validation, seeded for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 1024,
    'max_bin': 1024,
    'learning_rate': 0.02,
    'subsample': 0.8,
    # Row subsampling is only performed when subsample_freq is non-zero;
    # with the default of 0 the 'subsample' value above is silently ignored.
    'subsample_freq': 1,
    'n_estimators': 1000,
    # Fix the model's seed so the subsampled fit is repeatable across runs.
    'random_state': 42,
    'verbose': 1
}

# Fit with early stopping on the validation split: training halts once the
# validation RMSE has not improved for 100 rounds; progress is logged every
# 100 rounds.
model = LGBMRegressor(**params)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)

# Make predictions
val_preds = model.predict(X_val)

# Calculate RMSE.
# Both y_val and val_preds are on the log1p scale, so this number is the RMSE
# of log1p(Calories), not of Calories itself.
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f'Validation RMSE: {rmse}')


In [None]:
# Keep the ids for the submission file
test_ids = df_test['id']

# Build the test feature matrix WITHOUT modifying df_test, so this cell can be
# re-run (dropping 'id' from df_test in place makes a second run fail on
# df_test['id']). Selecting by X.columns leaves 'id' out and also guarantees
# the same feature order the model was trained on.
X_test = df_test[X.columns]

# Make predictions on test data (model output is on the log1p scale)
test_preds = model.predict(X_test)

# Transform test predictions back to the original scale with expm1
test_preds = np.expm1(test_preds)
# expm1 of a negative prediction would be a negative calorie count; clamp at 0
test_preds = np.clip(test_preds, 0, None)

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    'Calories': test_preds
})

# Save submission file, creating the output folder first if it is missing
os.makedirs('submissions', exist_ok=True)
submission.to_csv('submissions/submission_1.csv', index=False)