In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

data = pd.read_csv("calories.csv")

# Check column names
print("Columns in CSV:", data.columns.tolist())
print(data.head())

target_col = [col for col in data.columns if 'calories' in col.lower()][0]

X = data.drop(target_col, axis=1)
y = data[target_col]

# Detect categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ]
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Test RMSE: {rmse:.2f}")

# Prepare new input for prediction
new_data_dict = {}
for col in X.columns:
    if 'age' in col.lower():
        new_data_dict[col] = 25
    elif 'weight' in col.lower():
        new_data_dict[col] = 70
    elif 'height' in col.lower():
        new_data_dict[col] = 175
    elif 'duration' in col.lower():
        new_data_dict[col] = 60
    elif 'heart' in col.lower():
        new_data_dict[col] = 130
    elif 'activity' in col.lower():

        new_data_dict[col] = 'Brisk Walking' if 'Brisk Walking' in X[col].unique() else X[col].mode()[0]
    else:
        # For other columns, use mean for numeric or mode for categorical
        if X[col].dtype == 'object':
            new_data_dict[col] = X[col].mode()[0]
        else:
            new_data_dict[col] = X[col].mean()

new_data = pd.DataFrame([new_data_dict])

# Predict calories burned
predicted_calories = model.predict(new_data)
print(f"Predicted Calories Burned: {predicted_calories[0]:.2f}")


Columns in CSV: ['User_ID', 'Gender', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']
    User_ID  Gender  Age  Height  Weight  Duration  Heart_Rate  Body_Temp  \
0  14733363    male   68   190.0    94.0      29.0       105.0       40.8   
1  14861698  female   20   166.0    60.0      14.0        94.0       40.3   
2  11179863    male   69   179.0    79.0       5.0        88.0       38.7   
3  16180408  female   34   179.0    71.0      13.0       100.0       40.5   
4  17771927  female   27   154.0    58.0      10.0        81.0       39.8   

   Calories  
0     231.0  
1      66.0  
2      26.0  
3      71.0  
4      35.0  
Test RMSE: 11.49
Predicted Calories Burned: 443.12
