In [1]:
import sys

# Put the project checkout on the import path so that later cells can
# import the Activity_pstructure package from it.
sys.path.extend(['d:\\ActivityP'])

In [8]:
import Activity_pstructure

In [9]:
import sys

# In a raw string every backslash is literal, so the path separator must be
# written once; r"d:\\ActivityP" would register a path containing two
# backslashes, which differs from the entry added with 'd:\\ActivityP'.
PROJECT_ROOT = r"d:\ActivityP"

# Only register the path once so that re-running this cell does not keep
# growing sys.path with duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import Activity_pstructure

In [3]:
import pandas as pd
import sys
from pymongo.mongo_client  import MongoClient
from logging import getLogger
from Activity_pstructure.logging.logger import logger
from Activity_pstructure.exception.exception import ActivityException
from Activity_pstructure.entity.config_entity import MongoDBConfig

# NOTE(review): this rebinds the name `logger`, replacing the object imported
# from Activity_pstructure.logging.logger above with the standard-library
# logger registered under "activity_logger" — confirm which one is intended.
logger = getLogger("activity_logger")

class DataIngestion:
    """Load every document of a MongoDB collection into a pandas DataFrame.

    The connection URL, database name and collection name are read from the
    ``mongo_url``, ``database_name`` and ``collection_name`` attributes of the
    config object passed to the constructor.
    """

    def __init__(self, config: MongoDBConfig):
        """Store the MongoDB connection settings used by later fetches."""
        self.config = config

    def fetch_data_from_mongodb(self) -> pd.DataFrame:
        """Fetch all documents of the configured collection.

        Returns:
            A DataFrame with one row per document and without the ``_id``
            field. An empty collection yields an empty DataFrame.

        Raises:
            ActivityException: wraps any error raised while connecting,
                querying or building the DataFrame; the original error is
                kept as ``__cause__``.
        """
        try:
            logger.info("Starting data retrieval from MongoDB Atlas...")

            # Use the client as a context manager so its connections are
            # released even when the query fails (the client used to be
            # left open).
            with MongoClient(self.config.mongo_url) as client:
                collection = client[self.config.database_name][self.config.collection_name]
                # Drop _id on the server side through the projection instead
                # of popping it from every document afterwards. The cursor is
                # fully consumed before the client is closed.
                data = list(collection.find({}, {"_id": 0}))

            df = pd.DataFrame(data)
            logger.info(f"Retrieved {len(df)} records from MongoDB.")
            return df

        except Exception as e:
            # Chain the original error so its traceback is not lost.
            raise ActivityException(e, sys) from e


In [4]:
import pandas as pd
import sys
import os
from pymongo.mongo_client  import MongoClient
from logging import getLogger
from Activity_pstructure.exception.exception import ActivityException
from Activity_pstructure.entity.config_entity import MongoDBConfig
from Activity_pstructure.constant.database import DATABASE_NAME, COLLECTION_NAME
from Activity_pstructure.components.ingestion import DataIngestion
MONGO_DB_URL = os.getenv("MONGO_DB_URL")
if not MONGO_DB_URL:
    # Fail fast with a clear message: os.getenv returns None when the
    # variable is unset, and passing None on would only surface later as a
    # confusing connection error (or a connection to an unintended default
    # host).
    raise RuntimeError(
        "Environment variable MONGO_DB_URL is not set; "
        "cannot connect to MongoDB."
    )

# Bundle the connection settings consumed by DataIngestion.
config = MongoDBConfig(
    mongo_url=MONGO_DB_URL,
    database_name=DATABASE_NAME,
    collection_name=COLLECTION_NAME
)

# Pull the whole collection into a DataFrame and preview the first rows.
ingestion = DataIngestion(config)
df = ingestion.fetch_data_from_mongodb()
print(df.head())

           Id ActivityDate  TotalSteps  TotalDistance  TrackerDistance  \
0  1503960366    4/20/2016       10544           6.68             6.68   
1  1503960366    4/27/2016       18134          12.21            12.21   
2  1503960366     5/7/2016       11992           7.71             7.71   
3  1624580081    4/17/2016        6175           4.06             4.06   
4  1624580081     5/5/2016        2470           1.61             1.61   

   LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0                       0.0                1.96                      0.48   
1                       0.0                6.40                      0.41   
2                       0.0                2.46                      2.12   
3                       0.0                1.03                      1.52   
4                       0.0                0.00                      0.00   

   LightActiveDistance  SedentaryActiveDistance  VeryActiveMinutes  \
0                 4.24

In [5]:
def save_dataframe_to_artifacts(df: pd.DataFrame, output_dir: str, output_filename: str) -> str:
    """Write ``df`` as a CSV file inside ``output_dir``.

    The directory is created when it does not exist yet. The index is not
    written to the file.

    Args:
        df: DataFrame to persist.
        output_dir: Directory that receives the file.
        output_filename: Name of the CSV file inside ``output_dir``.

    Returns:
        The path of the written file.

    Raises:
        ActivityException: wraps any error raised while creating the
            directory or writing the file; the original error is kept as
            ``__cause__``.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, output_filename)

        df.to_csv(file_path, index=False)
        logger.info(f"DataFrame saved to {file_path}")
        return file_path

    except Exception as e:
        logger.error(f"Failed to save DataFrame: {e}")
        # The raise must live inside the handler: outside of it the name `e`
        # is no longer bound, and the statement would also run after a
        # successful save.
        raise ActivityException(e, sys) from e

In [None]:
import pandas as pd
def load_preprocessed_data(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and print a short summary of it.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The DataFrame parsed from ``file_path``.
    """
    loaded = pd.read_csv(file_path)
    # Report where the data came from, its dimensions and its column names.
    summary = (
        (" Data loaded from:", file_path),
        (" Data shape:", loaded.shape),
        (" Columns:", loaded.columns.tolist()),
    )
    for label, value in summary:
        print(label, value)
    return loaded
    

In [9]:
import pandas as pd
import random
import numpy as np

# Seed the generator so that re-running the cell (or Restart & Run All)
# produces the same test row every time.
SEED = 42
random.seed(SEED)

# Expected schema of the test file, in the order the columns are written.
columns = [
    "TotalSteps", "TotalDistance", "TrackerDistance", "VeryActiveDistance", "ModeratelyActiveDistance",
    "LightActiveDistance", "SedentaryActiveDistance", "VeryActiveMinutes", "FairlyActiveMinutes",
    "LightlyActiveMinutes", "SedentaryMinutes", "Calories", "StepsPerMinute", "DistancePerStep",
    "VeryActiveRatio", "SedentaryRatio", "ActiveMinutesTotal", "ActivityScore", "EffectiveActiveTime",
    "ActivityLevel", "Gender", "Age", "VeryActiveDistance_norm", "ModeratelyActiveDistance_norm",
    "SedentaryActiveDistance_norm", "VeryActiveMinutes_norm", "FairlyActiveMinutes_norm", "Calories_norm",
    "StepsPerMinute_norm", "DistancePerStep_norm", "VeryActiveRatio_norm"
]

# One row of random values; each field is drawn independently from the range
# given in its call, so the fields are not guaranteed to be mutually consistent.
data = {
    "TotalSteps": random.randint(5000, 15000),
    "TotalDistance": round(random.uniform(3.0, 10.0), 2),
    "TrackerDistance": round(random.uniform(3.0, 10.0), 2),
    "VeryActiveDistance": round(random.uniform(0.5, 5.0), 2),
    "ModeratelyActiveDistance": round(random.uniform(0.5, 3.0), 2),
    "LightActiveDistance": round(random.uniform(1.0, 5.0), 2),
    "SedentaryActiveDistance": round(random.uniform(0.0, 1.0), 2),
    "VeryActiveMinutes": random.randint(10, 60),
    "FairlyActiveMinutes": random.randint(10, 50),
    "LightlyActiveMinutes": random.randint(100, 200),
    "SedentaryMinutes": random.randint(600, 800),
    "Calories": random.randint(1800, 2800),
    "StepsPerMinute": round(random.uniform(10, 20), 2),
    "DistancePerStep": round(random.uniform(0.0005, 0.001), 5),
    "VeryActiveRatio": round(random.uniform(0.1, 0.5), 2),
    "SedentaryRatio": round(random.uniform(0.4, 0.7), 2),
    "ActiveMinutesTotal": random.randint(100, 250),
    "ActivityScore": random.randint(50, 100),
    "EffectiveActiveTime": random.randint(50, 150),
    "ActivityLevel": random.choice(["Low", "Medium", "High"]),
    "Gender": "Male",
    "Age": 25,
    "VeryActiveDistance_norm": round(random.uniform(0.1, 1.0), 2),
    "ModeratelyActiveDistance_norm": round(random.uniform(0.1, 1.0), 2),
    "SedentaryActiveDistance_norm": round(random.uniform(0.0, 1.0), 2),
    "VeryActiveMinutes_norm": round(random.uniform(0.1, 1.0), 2),
    "FairlyActiveMinutes_norm": round(random.uniform(0.1, 1.0), 2),
    "Calories_norm": round(random.uniform(0.1, 1.0), 2),
    "StepsPerMinute_norm": round(random.uniform(0.1, 1.0), 2),
    "DistancePerStep_norm": round(random.uniform(0.1, 1.0), 2),
    "VeryActiveRatio_norm": round(random.uniform(0.1, 1.0), 2)
}

# Guard against the schema list and the generated row drifting apart:
# a missing key would otherwise silently become an all-NaN column.
assert set(data) == set(columns), "generated row does not match the declared columns"

# Create DataFrame with the column order fixed by the declared schema
df = pd.DataFrame([data], columns=columns)

# Save to CSV (relative to the notebook's working directory)
df.to_csv("testdata2.csv", index=False)

# Show the DataFrame
df


Unnamed: 0,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,...,Age,VeryActiveDistance_norm,ModeratelyActiveDistance_norm,SedentaryActiveDistance_norm,VeryActiveMinutes_norm,FairlyActiveMinutes_norm,Calories_norm,StepsPerMinute_norm,DistancePerStep_norm,VeryActiveRatio_norm
0,11084,4.49,5.21,1.64,1.63,4.61,0.37,36,13,196,...,25,0.85,0.54,0.97,0.69,0.13,0.66,0.25,0.42,0.98


In [10]:
from Activity_pstructure.utils.inference_utils import (
    load_csv, load_model, 
    load_transformer, transform_data,
    make_predictions,calculate_custom_metrics
)

# Step 1: Load the raw test data
# NOTE(review): absolute Windows path — this cell only runs on a machine with
# this exact directory layout.
input_path = r"D:\ActivityP\Activity_pstructure\notebook\testdata2.csv"
df = load_csv(input_path)

# Step 2: Load the persisted transformer and apply it to the raw frame
transformer = load_transformer(r"D:\ActivityP\model\transformer.pkl")
X = transform_data(df, transformer)

# Step 3: Load the persisted model and make predictions on the transformed data
model = load_model(r"D:\ActivityP\model\model.pkl")
preds = make_predictions(X, model)

# Step 4: Add predictions to DataFrame (adds a "prediction" column to df in place)
df["prediction"] = preds

# Step 5: Compute custom metrics; df is rebound to the helper's return value.
# Presumably the helper returns the frame with the metric columns added —
# verify against Activity_pstructure.utils.inference_utils.
df = calculate_custom_metrics(df)

# Step 6: Preview the metric columns; this raises KeyError if any of the three
# names is missing from df.
print(df[["ActivityScore", "diet_completion", "sleep_quality_rating"]].head())


   ActivityScore  diet_completion  sleep_quality_rating
0             94             35.8                    10


In [36]:
# List every column name currently present on df.
print(df.columns.to_list())

['TotalSteps', 'TotalDistance', 'TrackerDistance', 'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance', 'SedentaryActiveDistance', 'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories', 'StepsPerMinute', 'DistancePerStep', 'VeryActiveRatio', 'SedentaryRatio', 'ActiveMinutesTotal', 'ActivityScore', 'EffectiveActiveTime', 'ActivityLevel', 'Gender', 'Age', 'VeryActiveDistance_norm', 'ModeratelyActiveDistance_norm', 'SedentaryActiveDistance_norm', 'VeryActiveMinutes_norm', 'FairlyActiveMinutes_norm', 'Calories_norm', 'StepsPerMinute_norm', 'DistancePerStep_norm', 'VeryActiveRatio_norm', 'prediction', 'activity_percentage', 'diet_met', 'sleep_quality']


In [19]:
# Scale ActiveMinutesTotal so that 150 minutes corresponds to 100 (percent).
df["activity_percentage"] = df["ActiveMinutesTotal"].div(150).mul(100)
print("Before clip max:", df["activity_percentage"].max())

# Bound the percentage to the closed interval [1, 100].
df["activity_percentage"] = df["activity_percentage"].clip(1, 100)
print("After clip max:", df["activity_percentage"].max())
print(df["activity_percentage"].head())

Before clip max: 156.0
After clip max: 100.0
0    100.0
Name: activity_percentage, dtype: float64


In [3]:
def calculate_custom_metrics(df):
    """Add activity, diet and sleep metric columns to ``df`` and return it.

    The columns are written onto the passed frame (it is modified in place)
    and the same frame is returned, so both ``calculate_custom_metrics(df)``
    and ``df = calculate_custom_metrics(df)`` work.

    Columns written:
        ActivityScore: integer 1-10; ``ActiveMinutesTotal / 150`` capped at 1
            and mapped linearly onto 1..10.
        diet_completion: 0-100, rounded to 2 decimals; 0 at or below 2000
            ``Calories``, 100 at or above 2500, linear in between.
        sleep_quality_rating: integer 1-10; ``SedentaryMinutes / 600`` capped
            at 1 and mapped linearly onto 1..10.

    Args:
        df: DataFrame containing the numeric columns ``ActiveMinutesTotal``,
            ``Calories`` and ``SedentaryMinutes``.

    Returns:
        ``df`` itself, with the three metric columns added or overwritten.

    Raises:
        ValueError: if one of the required columns is missing.
    """
    required_cols = ["ActiveMinutesTotal", "Calories", "SedentaryMinutes"]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column: '{col}'")

    activity_ratio = (df["ActiveMinutesTotal"] / 150).clip(upper=1)
    df["ActivityScore"] = (activity_ratio * 9 + 1).round().astype(int)

    # Vectorised form of the piecewise rule: the linear ramp between 2000 and
    # 2500 calories, clamped to the 0..100 range on both sides.
    df["diet_completion"] = (
        ((df["Calories"] - 2000) / 500 * 100).clip(lower=0, upper=100).round(2)
    )

    sleep_score = (df["SedentaryMinutes"] / 600).clip(upper=1)
    df["sleep_quality_rating"] = (sleep_score * 9 + 1).round().astype(int)

    print("🧪 Columns in df after metrics:", df.columns)
    # Return the frame itself: returning the result of print() handed callers
    # None and wiped out their DataFrame on reassignment.
    return df

