# In [None]:
import pandas as pd
from tsfresh import extract_features, extract_relevant_features, feature_selection
from tsfresh.utilities.dataframe_functions import impute, roll_time_series
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from loguru import logger
from src.helper import check_df_index

class NightAnalyser:
    """
    Night-level feature extraction, clustering and HMM data preparation.

    The pipeline is: tsfresh features per night -> scaling / PCA -> K-Means
    clusters of nights -> tsfresh features on rolling windows inside each
    night -> per-cluster lists of scaled window sequences for HMM training.

    NOTE(review): a "night" is keyed on ``<id>_<calendar date of timestamp>``.
    If the recorded nights cross midnight, a single night would be split over
    two keys - confirm against the upstream preprocessing.
    """

    # Name of the synthetic per-night identifier column handed to tsfresh.
    _NIGHT_ID_COL = 'night_id'
    # Column that cluster_nights appends to night_features_df.
    _CLUSTER_COL = 'cluster_label'

    def __init__(self, df, id_column_name='individual_id', time_column_name='timestamp', feature_settings='comprehensive'):
        """
        Initializes the NightAnalyser with the preprocessed time series data. It is assumed the DataFrame has a
        MultiIndex whose first level identifies the individual and whose second level is the datetime, of night
        periods with consistent and complete intervals between a consistent start and end time.

        :param df: Pandas DataFrame containing the time series data.
        :param id_column_name: str, name given to the individual-id column once the index is flattened.
        :param time_column_name: str, name given to the datetime column once the index is flattened.
        :param feature_settings: str, 'comprehensive', 'efficient', 'minimal', or 'custom'. Defines tsfresh feature extraction settings.
        :raises ValueError: if ``feature_settings`` is not one of the supported names.
        """
        # Project helper; presumably normalises the MultiIndex (e.g. to 'id' / 'datetime') - verify in src.helper.
        df = check_df_index(df)

        self.df = df.copy()
        # Both names are needed later; the id name was previously dropped, which broke the rolling extraction.
        self.id_col = id_column_name
        self.time_col = time_column_name
        self.feature_settings = self._get_feature_settings(feature_settings)
        self.night_features_df = None
        self.scaled_night_features = None
        self.night_pca_components = None
        self.night_clusters = None
        self.rolling_features_df = None

        # Store scalers and PCA models
        self.scaler = None
        self.pca_model = None

    def _get_feature_settings(self, setting_name):
        """
        Helper to get tsfresh feature extraction settings.

        :param setting_name: str, one of 'comprehensive', 'efficient', 'minimal', 'custom'.
        :return: tsfresh settings object, or an (empty, to be filled in) dict for 'custom'.
        :raises ValueError: for any other name.
        """
        if setting_name == 'comprehensive':
            return ComprehensiveFCParameters()
        elif setting_name == 'efficient':
            return EfficientFCParameters()
        elif setting_name == 'minimal':
            return MinimalFCParameters()
        elif setting_name == 'custom':
            # Placeholder: add {calculator_name: parameters} entries here.
            return {
            }
        else:
            raise ValueError("Invalid feature_settings. Choose 'comprehensive', 'efficient', 'minimal', or 'custom'.")

    def _night_frame(self):
        """
        Flatten ``self.df`` into the long-ish frame tsfresh expects.

        The first two index levels are renamed to ``self.id_col`` and ``self.time_col`` so that every method
        addresses them by the same names regardless of how the index levels are called. A ``night_id`` column
        (``<id>_<date>``) is added and the raw id column is dropped, otherwise tsfresh would treat the id as
        one more signal to extract features from.

        :return: DataFrame with the time column, the value columns and ``night_id``.
        :raises ValueError: if the index has fewer than two levels.
        """
        if self.df.index.nlevels < 2:
            raise ValueError("Expected a MultiIndex with an id level followed by a datetime level.")

        flat = self.df.reset_index()
        # reset_index puts the former index levels first, in level order.
        id_level, time_level = flat.columns[0], flat.columns[1]
        flat = flat.rename(columns={id_level: self.id_col, time_level: self.time_col})

        night_date = flat[self.time_col].dt.date.astype(str)
        flat[self._NIGHT_ID_COL] = flat[self.id_col].astype(str) + '_' + night_date
        return flat.drop(columns=[self.id_col])

    def _window_geometry(self, flat, window_size, overlap):
        """
        Translate a time-based window into the row counts tsfresh's rolling works with.

        The sampling interval is taken as the median gap between consecutive rows of the same night.

        :param flat: frame produced by ``_night_frame``.
        :param window_size: str or Timedelta-like, window length (e.g. '30min', '1H').
        :param overlap: float in [0, 1], fraction shared by consecutive windows.
        :return: tuple ``(window_rows, step_rows)``.
        :raises ValueError: on an out-of-range overlap, an undeterminable interval, or a window shorter than 2 rows.
        """
        if not 0.0 <= overlap <= 1.0:
            raise ValueError(f"overlap must be between 0.0 and 1.0, got {overlap}.")

        ordered = flat.sort_values([self._NIGHT_ID_COL, self.time_col])
        gaps = ordered.groupby(self._NIGHT_ID_COL)[self.time_col].diff().dropna()
        if gaps.empty:
            raise ValueError("Cannot infer the sampling interval: every night has fewer than two rows.")
        interval = gaps.median()
        if interval <= pd.Timedelta(0):
            raise ValueError("Cannot infer the sampling interval: timestamps are not strictly increasing.")

        window_rows = int(round(pd.Timedelta(window_size) / interval))
        if window_rows < 2:
            raise ValueError(
                f"window_size={window_size} covers fewer than 2 samples at a sampling interval of {interval}."
            )
        # A step of at least one row keeps the roll moving even at overlap == 1.0.
        step_rows = max(1, int(round(window_rows * (1 - overlap))))
        return window_rows, step_rows

    def extract_night_level_features(self):
        """
        Extracts aggregated tsfresh features for each complete night period.
        Each night is identified by a ``night_id`` of the form ``<individual id>_<date>``.

        :return: DataFrame of features, one row per night, indexed by ``night_id``. Also stored on
            ``self.night_features_df``.
        """
        print(f"Extracting night-level features using {self.feature_settings.__class__.__name__} settings...")

        night_frame = self._night_frame()

        # Set the night_id as the primary id for tsfresh extraction
        self.night_features_df = extract_features(
            night_frame,
            column_id=self._NIGHT_ID_COL,
            column_sort=self.time_col,
            default_fc_parameters=self.feature_settings,
            impute_function=impute,  # Replace NaN/inf in the *extracted features*
            show_warnings=True
        )
        print(f"Extracted {self.night_features_df.shape[1]} features for {self.night_features_df.shape[0]} nights.")
        return self.night_features_df

    def preprocess_night_features(self, n_components=0.95):
        """
        Scales features and applies PCA for dimensionality reduction.

        :param n_components: float, int or None. Number of PCA components or variance explained (0-1.0).
            ``None`` skips PCA.
        :return: PCA-transformed array when PCA is run, otherwise the scaled feature DataFrame.
        :raises ValueError: if night-level features have not been extracted yet.
        """
        if self.night_features_df is None:
            raise ValueError("Night features not extracted yet. Run extract_night_level_features first.")

        print("Preprocessing night-level features (scaling and PCA)...")

        # cluster_nights appends its labels to night_features_df; they are an outcome, not an input feature,
        # so they must not leak into scaling/PCA when this method is re-run.
        features = self.night_features_df.drop(columns=[self._CLUSTER_COL], errors='ignore')

        # Using impute again to catch any remaining NaNs from feature extraction.
        X_imputed = impute(features.copy())

        # Drop columns with zero variance after imputation (can cause issues with StandardScaler)
        X_imputed = X_imputed.loc[:, X_imputed.var() != 0]

        self.scaler = StandardScaler()
        self.scaled_night_features = pd.DataFrame(
            self.scaler.fit_transform(X_imputed),
            columns=X_imputed.columns,
            index=X_imputed.index
        )

        if n_components is None:
            # Forget any PCA from an earlier call so clustering uses the freshly scaled features.
            self.pca_model = None
            self.night_pca_components = None
            return self.scaled_night_features

        self.pca_model = PCA(n_components=n_components)
        self.night_pca_components = self.pca_model.fit_transform(self.scaled_night_features)
        print(f"PCA reduced dimensions from {self.scaled_night_features.shape[1]} to {self.night_pca_components.shape[1]}.")
        return self.night_pca_components

    def cluster_nights(self, n_clusters, plot_2d=True):
        """
        Clusters the nights using K-Means.

        PCA components are used when available, otherwise the scaled features. The labels are stored on
        ``self.night_clusters`` and appended to ``self.night_features_df`` as ``cluster_label``.

        :param n_clusters: int, number of clusters for K-Means.
        :param plot_2d: bool, whether to scatter-plot the first two PCA components coloured by cluster.
        :return: array of cluster labels, one per night.
        :raises ValueError: if features are not preprocessed or there are fewer nights than clusters.
        """
        if self.night_pca_components is None and self.scaled_night_features is None:
            raise ValueError("Features not preprocessed yet. Run preprocess_night_features first.")

        if self.night_pca_components is not None:
            data_for_clustering = self.night_pca_components
        else:
            data_for_clustering = self.scaled_night_features.values
        if data_for_clustering.shape[0] < n_clusters:
            raise ValueError(f"Number of nights ({data_for_clustering.shape[0]}) is less than n_clusters ({n_clusters}). Cannot cluster.")

        print(f"Clustering nights into {n_clusters} clusters...")
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)  # n_init for robustness
        self.night_clusters = kmeans.fit_predict(data_for_clustering)

        self.night_features_df[self._CLUSTER_COL] = self.night_clusters
        print("Night cluster distribution:")
        print(self.night_features_df[self._CLUSTER_COL].value_counts())

        can_plot = self.night_pca_components is not None and self.night_pca_components.shape[1] >= 2
        if plot_2d and can_plot:
            plt.figure(figsize=(10, 8))
            sns.scatterplot(
                x=self.night_pca_components[:, 0],
                y=self.night_pca_components[:, 1],
                hue=self.night_clusters,
                palette='viridis',
                alpha=0.7
            )
            plt.title(f'Nights Clustered (KMeans, K={n_clusters})')
            plt.xlabel('Principal Component 1')
            plt.ylabel('Principal Component 2')
            plt.show()
        elif plot_2d:
            print("Cannot plot 2D PCA: PCA not performed or less than 2 components.")

        return self.night_clusters

    def get_cluster_centroids(self):
        """
        Returns the mean feature values for each cluster (in original feature space).

        :return: DataFrame indexed by ``cluster_label`` with one column per retained feature.
        :raises ValueError: if nights have not been clustered yet.
        """
        if self.night_clusters is None:
            raise ValueError("Nights not clustered yet. Run cluster_nights first.")

        # Inverse transform scaled features before averaging for interpretability
        original_features_df = pd.DataFrame(
            self.scaler.inverse_transform(self.scaled_night_features),
            columns=self.scaled_night_features.columns,
            index=self.scaled_night_features.index
        )
        original_features_df[self._CLUSTER_COL] = self.night_clusters
        return original_features_df.groupby(self._CLUSTER_COL).mean()

    def extract_rolling_window_features(self, window_size='1H', overlap=0.5):
        """
        Extracts tsfresh features from rolling windows within each original night. These features are suitable for HMM observations.

        tsfresh rolls by row count, not by wall-clock time, so ``window_size`` and ``overlap`` are converted
        to a number of rows using the sampling interval inferred from the data.

        :param window_size: (str), Rolling window size (e.g., '30min', '1H').
        :param overlap: (float), Overlap between consecutive windows (0.0 to 1.0).
        :return: DataFrame of features indexed by ``(original_night_id, window_end_time)``. Also stored on
            ``self.rolling_features_df``.
        :raises ValueError: if the window cannot be mapped onto at least two rows or ``overlap`` is out of range.
        """
        print(f"Extracting rolling window features (window={window_size}, overlap={overlap})...")

        night_frame = self._night_frame()
        window_rows, step_rows = self._window_geometry(night_frame, window_size, overlap)

        rolled_df = roll_time_series(
            night_frame,
            column_id=self._NIGHT_ID_COL,  # Each night is its own ID for rolling
            column_sort=self.time_col,
            # Positive sign rolls forward; the magnitude is the step between windows, in rows.
            rolling_direction=step_rows,
            # Equal max/min shifts keep only full windows of exactly `window_rows` rows.
            max_timeshift=window_rows - 1,
            min_timeshift=window_rows - 1
        )

        # roll_time_series writes the per-window identifier, a (night_id, timestamp) tuple, to an 'id' column.
        print(f"Rolled into {rolled_df['id'].nunique()} unique night-windows.")

        # Now extract features from the rolled segments
        self.rolling_features_df = extract_features(
            # The night id is already encoded in 'id'; drop it so it is not treated as a value column.
            rolled_df.drop(columns=[self._NIGHT_ID_COL], errors='ignore'),
            column_id='id',
            column_sort=self.time_col,
            default_fc_parameters=self.feature_settings,
            impute_function=impute,
            show_warnings=True
        )

        # The window ids are (night_id, window timestamp) tuples. Depending on the tsfresh/pandas versions the
        # resulting index may be a flat Index of tuples, so build a MultiIndex explicitly before naming levels.
        window_index = self.rolling_features_df.index
        if not isinstance(window_index, pd.MultiIndex):
            window_index = pd.MultiIndex.from_tuples(list(window_index))
        self.rolling_features_df.index = window_index.set_names(['original_night_id', 'window_end_time'])

        print(f"Extracted {self.rolling_features_df.shape[1]} features for {self.rolling_features_df.shape[0]} rolling windows.")
        return self.rolling_features_df

    def get_hmm_ready_data(self):
        """
        Prepares the rolling window features for HMM training.

        Windows are grouped per night, ordered by window time, assigned to their night's cluster and finally
        standardised with a single scaler fitted on all windows of all clusters.

        :return: Dictionary where keys are cluster labels and values are lists of arrays (sequences of feature vectors) for HMM training.
        :raises ValueError: if rolling features or clusters are missing, or no window belongs to a clustered night.
        """
        if self.rolling_features_df is None or self.night_clusters is None:
            raise ValueError("Rolling features not extracted or nights not clustered. Run respective methods first.")

        hmm_data_by_cluster = {cluster_id: [] for cluster_id in np.unique(self.night_clusters)}

        # Get original night IDs and their assigned clusters
        night_to_cluster_map = self.night_features_df[self._CLUSTER_COL].to_dict()

        # Iterate through the rolling features, group by original night, and assign to cluster
        for original_night_id, group_df in self.rolling_features_df.groupby(level='original_night_id'):
            cluster_id = night_to_cluster_map.get(original_night_id)
            if cluster_id is None:
                print(f"Warning: Original night ID {original_night_id} not found in clustered nights. Skipping.")
                continue
            # Ensure the sequence is sorted by time for HMM
            sequence_data = group_df.sort_index(level='window_end_time').values
            hmm_data_by_cluster[cluster_id].append(sequence_data)

        all_sequences = [seq for sequences in hmm_data_by_cluster.values() for seq in sequences]
        if not all_sequences:
            raise ValueError("No rolling-window sequences matched a clustered night; nothing to prepare for HMM training.")

        # Fit one scaler on every window so all clusters share the same feature scale.
        scaler_rolling = StandardScaler()
        scaler_rolling.fit(np.vstack(all_sequences))

        for cluster_id, sequences in hmm_data_by_cluster.items():
            hmm_data_by_cluster[cluster_id] = [scaler_rolling.transform(seq) for seq in sequences]

        return hmm_data_by_cluster


#    # If your multiindex isn't named, you'd need to set them for the class constructor
#    # df.index.names = ['individual_id', 'timestamp']

# Demo: build a dummy DataFrame in the expected shape and run the whole NightAnalyser pipeline on it.

ROWS_PER_NIGHT = 100

ids = [221634]*300 + [99908129]*100 + [12345]*100 # 3 nights for ind 221634, 1 for 99908129, 1 for 12345

# One start per night, in the same order as `ids`. Nights start at midnight and are sampled every
# 5 minutes so that each one stays inside a single calendar date, which is how NightAnalyser keys nights.
night_starts = ['2018-03-16', '2018-03-17', '2018-03-18', '2018-03-16', '2018-03-16']
dates = [
    stamp
    for start in night_starts
    for stamp in pd.date_range(start, periods=ROWS_PER_NIGHT, freq='5min')
]

data = np.random.rand(500, 15).astype(np.float32)
# Introduce some NaNs for demonstration (mimics your non-null counts)
data[:, [0, 3, 6, 9]] = np.where(np.random.rand(500, 4) < 0.05, np.nan, data[:, [0, 3, 6, 9]])
data[:, [1, 4, 7, 10]] = np.where(np.random.rand(500, 4) < 0.1, np.nan, data[:, [1, 4, 7, 10]])
# Ensure counts are integers
data[:, 12:15] = np.random.randint(1, 10, (500, 3))

df_dummy = pd.DataFrame(data, columns=[
    'iob mean', 'cob mean', 'bg mean', 'iob min', 'cob min', 'bg min',
    'iob max', 'cob max', 'bg max', 'iob std', 'cob std', 'bg std',
    'iob count', 'cob count', 'bg count'
])
df_dummy['individual_id'] = ids
df_dummy['timestamp'] = dates

df_dummy = df_dummy.set_index(['individual_id', 'timestamp'])
df_dummy = df_dummy.sort_index() # Good practice for MultiIndex

# tsfresh's `impute` only cleans the extracted features; the input value columns themselves must be
# NaN-free, so close the gaps before handing the data over (crude fill, good enough for dummy data).
df_dummy = df_dummy.ffill().bfill()

# Now use the class
analyzer = NightAnalyser(df_dummy, id_column_name='individual_id', time_column_name='timestamp')

# 2. Extract Night-Level Features
night_features = analyzer.extract_night_level_features()
print("\nNight-level features extracted:")
print(night_features.head())

# 3. Preprocess Night-Level Features (Scale and PCA)
#    You might want to iterate on n_components for PCA by checking explained variance ratio
#    plt.plot(np.cumsum(analyzer.pca_model.explained_variance_ratio_))
#    plt.xlabel('number of components')
#    plt.ylabel('cumulative explained variance')
#    plt.show()
pca_features = analyzer.preprocess_night_features(n_components=0.95)
print("\nPreprocessed PCA features for nights:")
print(pca_features[:5])

# 4. Cluster Nights
#    You'll typically use the Elbow Method/Silhouette Score to determine the optimal
#    n_clusters before running this.
n_clusters_chosen = 3 # Example
analyzer.cluster_nights(n_clusters=n_clusters_chosen)
print("\nNight clusters assigned.")

# Get characteristics of clusters
cluster_centroids = analyzer.get_cluster_centroids()
print("\nCluster Centroids (mean feature values in original scale):")
print(cluster_centroids)

# 5. Extract Rolling Window Features for HMM
#    Window size and overlap are critical here. Choose based on biological/clinical relevance
#    and desired temporal resolution for your HMM states.
#    e.g., if states change every 15-30 minutes, a 30min or 1H window with overlap is good.
rolling_features = analyzer.extract_rolling_window_features(window_size='1H', overlap=0.5)
print("\nRolling window features extracted:")
print(rolling_features.head())

# 6. Prepare data for HMM
hmm_data = analyzer.get_hmm_ready_data()
print("\nData prepared for HMM training by cluster:")
for cluster_id, sequences in hmm_data.items():
    # A cluster can end up without any sequence; do not index into an empty list.
    if sequences:
        print(f"  Cluster {cluster_id}: {len(sequences)} sequences, e.g., first sequence shape: {sequences[0].shape}")
    else:
        print(f"  Cluster {cluster_id}: 0 sequences")

# Now you would typically iterate through hmm_data.items() and train an HMM for each cluster_id
# using a library like `hmmlearn`.