In [1]:
import os

In [2]:
%pwd

'd:\\SAMITH\\Github\\Heart_Rate_Anomaly_Detector\\research'

In [3]:
# Step up to the project root only while the kernel is still inside research/,
# so re-running this cell cannot climb further up the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\SAMITH\\Github\\Heart_Rate_Anomaly_Detector'

In [5]:
from dataclasses import dataclass
from pathlib import Path


In [7]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable (frozen) settings consumed by the data-transformation stage."""
    # Output directory; the stage writes train.csv and test.csv here.
    root_dir: Path
    # CSV file loaded as the readings table.
    data_path_reading: Path
    # CSV file loaded as the users table (merged with readings on user_id).
    data_path_users: Path

In [8]:
from Heart_Rate_Anomaly_Detector.constants import *
from Heart_Rate_Anomaly_Detector.utils.common import read_yaml, create_directories

In [10]:
class ConfigurationManager:
    """Load the project YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema files and prepare the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # read_yaml results are accessed by attribute below
        # (e.g. ``self.config.artifacts_root``).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Presumably creates the directory if it is missing -- confirm in utils.common.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Returns:
            DataTransformationConfig: populated from the ``data_transformation``
            section of the main config, with every location wrapped in ``Path``.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The dataclass declares Path fields, but the YAML values are not Path
        # objects; wrap them so the declared types hold. os.path.join,
        # os.makedirs and pandas.read_csv all accept Path.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path_reading=Path(section.data_path.readings),
            data_path_users=Path(section.data_path.users),
        )

In [11]:
import pandas as pd
import os
import numpy as np
from pathlib import Path
from Heart_Rate_Anomaly_Detector import logger
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
import joblib
import warnings

In [None]:
class DataTransformation:
    """Turn the readings and users CSVs into chronological train/test CSVs.

    ``train_test_splitting`` is the entry point: it loads and merges the two
    tables, imputes missing values, derives calendar features from ``date``
    and writes ``train.csv`` / ``test.csv`` under ``config.root_dir``.
    """

    def __init__(self, config):
        """Keep the configuration and initialise empty preprocessing state.

        Args:
            config: Object exposing ``root_dir``, ``data_path_reading`` and
                ``data_path_users``. ``train_split_ratio`` and
                ``target_column`` are optional and read only when present.
        """
        self.config = config
        # Not assigned anywhere in this class; it is persisted by
        # train_test_splitting only if something else sets it.
        self.scaler = None
        self.imputers = {}

    def load_and_preprocess_data(self):
        """Read both CSVs, inner-join them on ``user_id`` and order by date.

        Returns:
            pandas.DataFrame: The merged rows. When a ``date`` column exists it
            is parsed to datetime and the frame is sorted chronologically.
        """
        logger.info("Loading and preprocessing data...")

        readings = pd.read_csv(self.config.data_path_reading)
        users = pd.read_csv(self.config.data_path_users)

        df = pd.merge(readings, users, on='user_id')
        logger.info(f"Loaded data shape: {df.shape}")

        if 'date' in df.columns:
            # Parse before sorting: raw strings sort lexicographically, and the
            # time-based interpolation downstream needs real datetimes.
            df['date'] = pd.to_datetime(df['date'])
            df = df.sort_values('date')

        df = df.infer_objects()

        return df

    def handle_missing_values(self, df):
        """Return a copy of ``df`` with missing values imputed.

        Numeric (``int64`` / ``float64``) columns are interpolated along
        ``date`` (time-weighted) when a complete date column is available, and
        any remaining gaps receive the column mean. Object columns other than
        ``date`` receive their most frequent value, or ``'Unknown'`` when the
        column holds no value at all.

        Args:
            df: Frame to impute; it is not modified.

        Returns:
            pandas.DataFrame: The imputed copy. If time interpolation ran,
            ``date`` is the first column and the index is a fresh RangeIndex.
        """
        df = df.copy()
        logger.info("Handling missing values...")

        missing_before = df.isnull().sum().sum()
        logger.info(f"Missing values before processing: {missing_before}")

        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
        categorical_cols = [
            col for col in df.select_dtypes(include=['object']).columns
            if col != 'date'
        ]

        if numeric_cols:
            if 'date' in df.columns:
                # interpolate(method='time') only works on a DatetimeIndex; an
                # unparsed (string) date column makes pandas raise ValueError.
                df['date'] = pd.to_datetime(df['date'])
                if df['date'].notna().all():
                    by_date = df.set_index('date')
                    by_date[numeric_cols] = by_date[numeric_cols].interpolate(method='time')
                    df = by_date.reset_index()
                else:
                    # Missing timestamps cannot serve as interpolation
                    # coordinates, so fall through to the mean fill.
                    logger.warning("Missing dates found; skipping time-based interpolation.")

            # Mean fill covers gaps interpolation cannot reach (e.g. leading NaNs).
            df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

        for col in categorical_cols:
            modes = df[col].mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 'Unknown'
            df[col] = df[col].fillna(fill_value)

        missing_after = df.isnull().sum().sum()
        logger.info(f"Missing values after processing: {missing_after}")

        return df

    def create_advanced_features(self, df):
        """Return a copy of ``df`` extended with calendar features from ``date``.

        Existing ``month``, ``day``, ``year``, ``day_of_week``, ``hour``,
        ``is_weekend`` and ``day_of_year`` columns are kept as they are; the
        remaining derived columns are always (re)computed. Without a ``date``
        column the copy is returned unchanged.

        Args:
            df: Input frame; it is not modified.

        Returns:
            pandas.DataFrame: The copy with the additional feature columns.
        """
        df = df.copy()
        logger.info("Creating advanced features...")

        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])
            stamps = df['date'].dt

            # Derive a calendar part only when the input does not provide it.
            if 'month' not in df.columns:
                df['month'] = stamps.month
            if 'day' not in df.columns:
                df['day'] = stamps.day
            if 'year' not in df.columns:
                df['year'] = stamps.year
            if 'day_of_week' not in df.columns:
                df['day_of_week'] = stamps.dayofweek
            # An hour feature is only informative when timestamps differ in hour.
            if 'hour' not in df.columns and stamps.hour.nunique() > 1:
                df['hour'] = stamps.hour
            if 'is_weekend' not in df.columns:
                df['is_weekend'] = (stamps.dayofweek >= 5).astype(int)
            if 'day_of_year' not in df.columns:
                df['day_of_year'] = stamps.dayofyear

            df['quarter'] = stamps.quarter
            df['week_of_year'] = stamps.isocalendar().week

            # Sine/cosine pairs keep the ends of each cycle adjacent.
            df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
            df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
            df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
            df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

            if 'hour' in df.columns:
                df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
                df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

            # Season flags by calendar month.
            df['is_winter'] = df['month'].isin([12, 1, 2]).astype(int)
            df['is_summer'] = df['month'].isin([6, 7, 8]).astype(int)
            df['is_spring'] = df['month'].isin([3, 4, 5]).astype(int)
            df['is_autumn'] = df['month'].isin([9, 10, 11]).astype(int)

        logger.info(f"Features created. New shape: {df.shape}")
        return df

    def train_test_splitting(self):
        """Run the full pipeline and write a chronological train/test split.

        The earliest ``train_split_ratio`` share of rows (0.75 when the config
        does not define it) becomes the training set and the rest the test
        set. Both are written to ``config.root_dir`` as ``train.csv`` and
        ``test.csv``. ``self.scaler`` is saved as ``scaler.joblib`` when set.

        Returns:
            tuple[pandas.DataFrame, pandas.DataFrame]: ``(train, test)``, each
            with a fresh RangeIndex.
        """
        logger.info("Starting comprehensive data transformation pipeline...")

        df = self.load_and_preprocess_data()
        df = self.handle_missing_values(df)
        df = self.create_advanced_features(df)

        logger.info("Performing train-test split...")

        # Re-sort: imputation and feature creation rebuild the frame.
        if 'date' in df.columns:
            df = df.sort_values('date')

        split_ratio = getattr(self.config, 'train_split_ratio', 0.75)
        split_index = int(len(df) * split_ratio)

        train = df.iloc[:split_index].reset_index(drop=True)
        test = df.iloc[split_index:].reset_index(drop=True)

        os.makedirs(self.config.root_dir, exist_ok=True)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        if self.scaler is not None:
            scaler_path = os.path.join(self.config.root_dir, "scaler.joblib")
            joblib.dump(self.scaler, scaler_path)
            logger.info(f"Scaler saved to: {scaler_path}")

        total = len(df)
        if total == 0:
            logger.warning("No rows available after preprocessing; train and test sets are empty.")

        def share(part):
            # Guard the percentage against an empty frame (ZeroDivisionError).
            return len(part) / total * 100 if total else 0.0

        logger.info("=== Train-Test Split Summary ===")
        logger.info(f"Total samples: {total:,}")
        logger.info(f"Training samples: {len(train):,} ({share(train):.1f}%)")
        logger.info(f"Test samples: {len(test):,} ({share(test):.1f}%)")
        logger.info(f"Features: {train.shape[1]}")
        if 'date' in train.columns:
            logger.info(f"Train date range: {train['date'].min()} to {train['date'].max()}")
            logger.info(f"Test date range: {test['date'].min()} to {test['date'].max()}")
        else:
            logger.info("No date column")
            logger.info("No date column")

        target = getattr(self.config, 'target_column', None)
        if target is not None and target in train.columns:
            numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
            numeric_features = [col for col in numeric_cols if col != target]

            # Correlation is only defined for a numeric target.
            if target in numeric_cols and numeric_features:
                # Drop the target by label instead of slicing off position 0:
                # ties or NaNs can move the self-correlation away from first place.
                correlations = (
                    train[numeric_features + [target]]
                    .corr()[target]
                    .drop(labels=target)
                    .abs()
                    .dropna()
                    .sort_values(ascending=False)
                    .head(10)
                )
                logger.info("Top 10 features by correlation with target:")
                for feature, corr in correlations.items():
                    logger.info(f"  {feature}: {corr:.3f}")

        logger.info("Data transformation pipeline completed successfully!")

        return train, test

In [None]:
# Run the data-transformation stage end to end with the project configuration.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_splitting()
except Exception:
    # Record the failure with its traceback in the project log, then re-raise
    # the original exception unchanged (bare ``raise`` keeps the traceback clean).
    logger.exception("Data transformation stage failed")
    raise