In [1]:
import pandas as pd
df=pd.read_csv("dailyActivity_merged.csv")

In [5]:
df.isnull().sum()

Id                          0
ActivityDate                0
TotalSteps                  0
TotalDistance               0
TrackerDistance             0
LoggedActivitiesDistance    0
VeryActiveDistance          0
ModeratelyActiveDistance    0
LightActiveDistance         0
SedentaryActiveDistance     0
VeryActiveMinutes           0
FairlyActiveMinutes         0
LightlyActiveMinutes        0
SedentaryMinutes            0
Calories                    0
dtype: int64

In [11]:
df.dtypes

Id                                   int64
ActivityDate                datetime64[ns]
TotalSteps                           int64
TotalDistance                      float64
TrackerDistance                    float64
LoggedActivitiesDistance           float64
VeryActiveDistance                 float64
ModeratelyActiveDistance           float64
LightActiveDistance                float64
SedentaryActiveDistance            float64
VeryActiveMinutes                    int64
FairlyActiveMinutes                  int64
LightlyActiveMinutes                 int64
SedentaryMinutes                     int64
Calories                             int64
dtype: object

In [9]:
df['ActivityDate']=pd.to_datetime(df['ActivityDate'])

In [10]:
df['ActivityDate'].head

<bound method NDFrame.head of 0     2016-04-12
1     2016-04-13
2     2016-04-14
3     2016-04-15
4     2016-04-16
         ...    
935   2016-05-08
936   2016-05-09
937   2016-05-10
938   2016-05-11
939   2016-05-12
Name: ActivityDate, Length: 940, dtype: datetime64[ns]>

In [12]:
# Extract weekday name
df['DayOfWeek'] = df['ActivityDate'].dt.day_name()

# Step frequency (Steps per active minute)
df['StepFrequency'] = df['TotalSteps'] / (df['VeryActiveMinutes'] + df['FairlyActiveMinutes'] + df['LightlyActiveMinutes'] + 1)

# Rest period ratio (Sedentary Minutes / Total Minutes in a day)
df['RestPeriodRatio'] = df['SedentaryMinutes'] / 1440  # 1440 = total minutes in a day

# Check updated data
print(df[['ActivityDate', 'DayOfWeek', 'StepFrequency', 'RestPeriodRatio']].head())


  ActivityDate  DayOfWeek  StepFrequency  RestPeriodRatio
0   2016-04-12    Tuesday      35.863760         0.505556
1   2016-04-13  Wednesday      41.608527         0.538889
2   2016-04-14   Thursday      46.905830         0.845833
3   2016-04-15     Friday      35.758242         0.504167
4   2016-04-16   Saturday      47.272388         0.536806


In [13]:
from sklearn.preprocessing import MinMaxScaler

# Select columns to normalize
cols_to_normalize = ['TotalSteps', 'TotalDistance', 'VeryActiveDistance', 
                      'ModeratelyActiveDistance', 'LightActiveDistance', 'Calories']

scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])

# Check normalized values
print(df.head())


           Id ActivityDate  TotalSteps  TotalDistance  TrackerDistance  \
0  1503960366   2016-04-12    0.365418       0.303247             8.50   
1  1503960366   2016-04-13    0.298037       0.248662             6.97   
2  1503960366   2016-04-14    0.290402       0.240457             6.74   
3  1503960366   2016-04-15    0.271024       0.224046             6.28   
4  1503960366   2016-04-16    0.351731       0.291117             8.16   

   LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0                       0.0            0.085766                  0.084877   
1                       0.0            0.071624                  0.106481   
2                       0.0            0.111314                  0.061728   
3                       0.0            0.097628                  0.194444   
4                       0.0            0.123631                  0.063272   

   LightActiveDistance  SedentaryActiveDistance  VeryActiveMinutes  \
0             0.565826

In [24]:
# Drop unnecessary columns
df_model = df
df_model.columns


Index(['TotalSteps', 'TotalDistance', 'TrackerDistance', 'VeryActiveDistance',
       'ModeratelyActiveDistance', 'LightActiveDistance',
       'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes',
       'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories', 'StepFrequency',
       'RestPeriodRatio'],
      dtype='object')

In [25]:
X = df_model.drop(columns=['Calories'])  
y = df_model['Calories']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Train Shape: {X_train.shape}, Test Shape: {X_test.shape}')


Train Shape: (752, 13), Test Shape: (188, 13)


In [26]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [27]:
# XGBoost Regressor model initialize karo
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Model ko train karo
xgb_model.fit(X_train, y_train)


In [33]:
# Predictions lo
y_pred = xgb_model.predict(X_test)

# Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📊 Mean Absolute Error: {mae}")
print(f"📊 Mean Squared Error: {mse}")
print(f"📊 R² Score: {r2}")


📊 Mean Absolute Error: 0.05788841656496149
📊 Mean Squared Error: 0.005993379181182654
📊 R² Score: 0.6955596338656815


LSTM IMPLEMENTATION

In [34]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Features aur Target define karein
X = df[['TotalSteps', 'TotalDistance', 'VeryActiveDistance', 'LightActiveDistance', 'VeryActiveMinutes', 'SedentaryMinutes']]
y = df['Calories']

# Normalize data (LSTM ko scale values chahiye)
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

# Convert data to 3D format (samples, time-steps, features)
X_reshaped = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_scaled, test_size=0.2, random_state=42)


In [39]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import joblib

# 📌 Rebuild the model
model = Sequential([
    LSTM(64, activation='relu', return_sequences=True, input_shape=(1, X_train.shape[2])),
    Dropout(0.2),
    LSTM(32, activation='relu', return_sequences=False),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)
])

# ✅ FIXED: Use proper loss function
model.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError(), metrics=[tf.keras.metrics.MeanAbsoluteError()])

# ✅ Train the model again
history = model.fit(X_train, y_train, epochs=100, batch_size=8, validation_data=(X_test, y_test), verbose=1)

# ✅ Save the model again
model.save("lstm_calories_model.h5")
print("✅ Model Re-trained & Saved Successfully!")


  super().__init__(**kwargs)


Epoch 1/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 0.1373 - mean_absolute_error: 0.3212 - val_loss: 0.0130 - val_mean_absolute_error: 0.0916
Epoch 2/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0158 - mean_absolute_error: 0.0985 - val_loss: 0.0121 - val_mean_absolute_error: 0.0871
Epoch 3/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0133 - mean_absolute_error: 0.0921 - val_loss: 0.0110 - val_mean_absolute_error: 0.0870
Epoch 4/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0115 - mean_absolute_error: 0.0870 - val_loss: 0.0107 - val_mean_absolute_error: 0.0844
Epoch 5/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0122 - mean_absolute_error: 0.0886 - val_loss: 0.0110 - val_mean_absolute_error: 0.0846
Epoch 6/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step



✅ Model Re-trained & Saved Successfully!
