# Dependencies

In [1]:
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn import metrics
import pandas as pd
import numpy as np
import warnings
import random

# Suppress every warning category (not only FutureWarning) so that the notebook output stays clean
warnings.filterwarnings('ignore')

# Sample Specifications

In [2]:
# Notebook-level specifications consumed by sample_allocations
total_patients            = 100   # number of patients, i.e. Patient_<n> columns of the generated table
total_days                = 365   # study days are numbered from 1 to total_days
required_sample_count     = 50    # number of distinct study days drawn as rows of the table
maximum_surveys_attempted = 7     # upper limit used when drawing how many surveys a patient attempts
seed                      = 2023  # seed applied to both numpy.random and random inside sample_allocations

# PHQ-9 Score Generation

In [3]:
def generate_phq9_samples(time_index):
    """
    Draw a single random PHQ-9 score for the given study day

    The study is split into consecutive phases (ranges of days). Every phase has its own
    range of admissible scores and one relative weight per admissible score; the weights
    are normalised to sum to 1 before they are handed to the sampler.

    Arguments:
    ----------
        time_index {int} : Study day; must fall inside one of the phases, i.e. lie
                           within 1 and 365 (both inclusive)

    Errors:
    -------
        ValueError       : Raised if time_index does not belong to any phase

    Returns:
    --------
        {np.ndarray}     : Array of length 1 holding the sampled PHQ-9 score
    """
    # Weight lists that are shared by two consecutive phases
    weights_score_0_to_14 = [0.06667, 0.03333, 0.01667, 0.00833, 0.004167, 0.00208, 0.00104, 0.00052, 0.00026, 0.00013, 6.51041e-5, 3.25521e-5, 1.62760e-5, 8.13802e-6, 4.06901e-6]
    weights_score_0_to_9  = [0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.05, 0.05, 0.05, 0.025]

    # (first_day, last_day, lowest_score, highest_score, relative weight of every score in that range)
    phases = (
        (1,   5,   10, 27, [0.056, 0.056, 0.056, 0.056, 0.028, 0.028, 0.028, 0.028, 0.014, 0.014, 0.014, 0.014, 0.007, 0.007, 0.007, 0.0035, 0.0035, 0.00175]),
        (6,   10,  10, 19, [0.1] * 10),
        (11,  20,  5,  19, [0.06, 0.06, 0.06, 0.03, 0.03, 0.03, 0.015, 0.015, 0.015, 0.0075, 0.0075, 0.0075, 0.00375, 0.00375, 0.00375]),
        (21,  30,  0,  19, [0.05, 0.025, 0.0125, 0.00625, 0.003125, 0.00160625, 0.000803125, 0.0004015625, 0.00020078125, 0.000100390625] + [5.01953125e-5] * 10),
        (31,  40,  0,  14, weights_score_0_to_14),
        (41,  100, 0,  14, weights_score_0_to_14),
        (101, 220, 0,  9,  weights_score_0_to_9),
        (221, 300, 0,  9,  weights_score_0_to_9),
        (301, 365, 0,  4,  [0.5, 0.25, 0.15, 0.1, 0.05]),
    )

    for first_day, last_day, lower_bound, upper_bound, probabilities in phases:
        if (first_day <= time_index <= last_day):
            break
    else:
        # Fail loudly: callers index the result, so handing back an exception object only
        # moves the failure to a less informative place
        raise ValueError(f'time_index must lie within 1 and 365 (both inclusive), got : {time_index!r}')

    # All admissible PHQ-9 scores of the phase, i.e. the population to sample from
    population      = np.arange(start = lower_bound,
                                stop  = (upper_bound+1),
                                step  = 1,
                                dtype = int)

    # The weights are relative, so rescale them to a proper probability distribution
    total_weight    = sum(probabilities)

    # Simple random sampling with replacement, as more than one person may have the same score
    required_sample = np.random.choice(a       = population,
                                       replace = True,
                                       size    = 1,
                                       p       = [prob/total_weight for prob in probabilities])

    return required_sample


# Allotment of patients, survey days and surveys

In [4]:
def sample_allocations(total_patients, total_days, required_sample_count, maximum_surveys_attempted):
    """
    Build a sparse Day x Patient table of synthetic PHQ-9 scores

    Days are drawn without replacement using fixed per-day weights, a shrinking share of
    the patients is selected on every drawn day, a score is generated for each selected
    patient and, finally, every patient column is thinned out to a handful of scores.
    Both numpy.random and random are re-seeded with the notebook-level `seed`.

    Arguments:
    ----------
        total_patients            {int} : Number of patients; the result has the columns
                                          Patient_1 ... Patient_<total_patients>

        total_days                {int} : Number of study days. The per-day weights below
                                          hold exactly 365 entries, so any other value makes
                                          the day sampling fail

        required_sample_count     {int} : Number of distinct days to draw (rows of the result)

        maximum_surveys_attempted {int} : Upper limit for the number of surveys drawn per
                                          patient and day, and for the number of scores
                                          kept per patient

    Errors:
    -------
        No exception is propagated: failures are reported through the returned string

    Returns:
    --------
        {DataFrame} : Scores indexed by 'Day_<n>' labels with one column per patient;
                      cells without a score are NaN
        {str}       : Error description if validation fails or any exception occurs
    """
    # NOTE(review): this compares the number of sampled days against the number of patients,
    # which does not match the wording of the message - confirm the intended check
    if (required_sample_count > total_patients):
        return repr(ValueError('Total number of patients should be greater than required sample count'))
    
    try:
        # Set random seed for reproducibility
        np.random.seed(seed)
        random.seed(seed)

        # Make a list of all possible time stamps
        days_list           = list(np.arange(start = 1,
                                             stop  = (total_days+1),
                                             step  = 1,
                                             dtype = int))
        
        # Relative weight of each day of getting selected (one entry per day, 365 in total)
        probabilities       = [0.99]*1 + [0.85]*4 + [0.85]*10 + [0.7]*5 + [0.5]*20 + [0.3]*20 + [0.25]*60 + [0.125]*60 + [0.0625]*60 + [0.03125]*60 + [0.015625]*62 + [0.95]*3
        total_weight        = sum(probabilities)
        
        # Select desired number of days randomly, without replacement, from all possible days
        desired_sample_days = sorted(np.random.choice(a       = days_list,
                                                      size    = required_sample_count,
                                                      replace = False,
                                                      p       = [probability / total_weight for probability in probabilities]))
        
        # Create an empty dataframe with one row per sampled day
        output_dataframe    = pd.DataFrame(index=[f"Day_{day}" for day in desired_sample_days])
        
        for day in desired_sample_days:
            # The number of patients attempting surveys halves over 365 days
            patients_attempting_survey = int(total_patients * (2 ** (-day / 365)))

            # Randomly select the patients attempting surveys
            random_patient_attempting  = sorted(random.sample(range(1, total_patients + 1), patients_attempting_survey))

            for patient in random_patient_attempting:
                # Randomly select the number of surveys attempted by the patient
                number_of_surveys_attempted = random.randint(1, maximum_surveys_attempted)

                # Every attempt targets the same (day, patient) cell, so only the last score
                # survives; all draws are still made to keep the seeded random sequence
                for _ in range(number_of_surveys_attempted):
                    score = generate_phq9_samples(time_index=day)

                output_dataframe.loc[f"Day_{day}", f"Patient_{patient}"] = score[0]
        
        # For each patient keep only a small random subset of the recorded scores
        for column in output_dataframe.columns:
            # Row positions of the cells that hold a score
            filled_positions = np.flatnonzero(output_dataframe[column].notna())

            if (len(filled_positions) > 0):
                # Keep (maximum_surveys_attempted-2) or (maximum_surveys_attempted-1) scores,
                # but never more than the patient actually has, otherwise sampling without
                # replacement would fail
                num_cells        = np.random.randint((maximum_surveys_attempted-2), maximum_surveys_attempted)
                num_cells        = min(num_cells, len(filled_positions))
        
                selected_indices = np.random.choice(a       = len(filled_positions), 
                                                    size    = num_cells, 
                                                    replace = False)
        
                # Mark the rows to keep and blank out every other cell of the column
                mask = np.zeros(len(output_dataframe), dtype=bool)
                mask[filled_positions[selected_indices]] = True
                output_dataframe.loc[~mask, column] = np.nan
                
        # Set the index name
        output_dataframe.index.name = 'Day'
        column_order                = [f'Patient_{i}' for i in range(1, (total_patients+1))]

        # reindex (instead of plain column selection) also yields an all-NaN column for a
        # patient who was never selected on any sampled day
        final_output_dataframe      = output_dataframe.reindex(columns=column_order)
        
        return final_output_dataframe
    
    except Exception as SampleAllocationError:
        return repr(f'SampleAllocationError : {SampleAllocationError}').replace('  ', '')
    

# Plotting the PHQ-9 Scores' Scatter Plot for different Query Days 

In [5]:
def scatterDiagram_phq9Scores_all_queryDays(input_data : pd.DataFrame) -> None:
    """
    Draw all recorded PHQ-9 scores as a scatter plot with one column of points per row
    label (day) of the input

    Arguments:
    ----------
        input_data {DataFrame} : Scores with one row per day and one column per patient;
                                 missing scores are NaN and are left out of the plot

    Errors:
    -------
        No exception is propagated: failures are reported through the returned string

    Returns:
    --------
        {None} : When the plot has been shown
        {str}  : Error description if input_data is not a DataFrame or plotting fails
    """
    if (not isinstance(input_data, pd.DataFrame)):
        return repr(TypeError(f'Expected a pandas Dataframe for the argument : input_data,\
                                got : {type(input_data)} instead')).replace('  ', '')
    try:
        # Fix the size of the plot
        fig, ax   = plt.subplots(figsize=(25, 10))

        # Map every row label (day) to the list of its non-missing scores
        data_dict = input_data.T.to_dict()
        keys      = list(data_dict.keys())
        values    = [[x for x in data_dict[key].values() if not np.isnan(x)] for key in keys]

        # Plot the scores of each day; the colour runs from 0 to 1 along the position of a
        # score within its day
        for key, value in zip(keys, values):
            size  = len(value)
            color = np.linspace(start = 0, 
                                stop  = 1, 
                                num   = size)
            
            ax.scatter(x    = [key]*size, 
                       y    = value,
                       c    = color,
                       cmap = 'viridis',
                       s    = 25)

        # Colour scale: the mappable is not attached to any Axes, so the Axes to take the
        # space from has to be named explicitly
        sm   = plt.cm.ScalarMappable(cmap = 'viridis')
        sm.set_array([])
        cbar = fig.colorbar(mappable = sm, ax = ax)
        cbar.set_label('Weightages')
        
        # Set the x-axis labels and ticks
        plt.xticks(ticks    = range(len(keys)),
                   labels   = keys,
                   fontsize = 10,
                   rotation = 90)
        
        # Set the y-axis labels and ticks
        plt.yticks(ticks    = np.arange(0, 31, 1),
                   fontsize = 10,
                   rotation = 0)
        
        # Set Axis Labels and Title of the plot 
        plt.xlabel(xlabel   = 'Time',
                   fontsize = 15)
        
        plt.ylabel(ylabel   = 'PHQ-9 Scores',
                   fontsize = 15)
        
        plt.title(label    = 'Scatter Plot of PHQ-9 Scores for Given Sample\n',
                  fontsize = 20)
        
        plt.tight_layout(pad   = 2.0,
                         h_pad = 1.0,
                         w_pad = 1.0)
        plt.show()
        
    except Exception as PHQ9ScoresAllPlottingError:
        return repr(f'PHQ9ScoresAllPlottingError : {PHQ9ScoresAllPlottingError} while Plotting Scatter diagram\
                      for PHQ-9 scores of all study individuals for all study days').replace('  ', '')
                    

# Plotting Daily Average for all study patients' PHQ-9 Scores for Discrete Days

In [6]:
def daily_average_phq9_line_plot(input_data : pd.DataFrame) -> None:
    """
    Plot the mean PHQ-9 score of every row (day) of the input as a line diagram

    Arguments:
    ----------
        input_data {DataFrame} : Scores with one row per day and one column per patient;
                                 missing scores (NaN) are skipped while averaging

    Errors:
    -------
        No exception is propagated: failures are reported through the returned string

    Returns:
    --------
        {None} : When the plot has been shown
        {str}  : Error description if input_data is not a DataFrame or plotting fails
    """
    if (not isinstance(input_data, pd.DataFrame)):
        return repr(TypeError(f'Expected a pandas DataFrame for the argument : input_data, \
                                got : {type(input_data)} instead')).replace('  ', '')
    
    try:
        # Row-wise mean over all patient columns, ignoring missing scores
        per_day_mean = input_data.mean(axis = 1, skipna = True)
        day_labels   = per_day_mean.index

        # Appearance of the trend line
        line_style   = dict(scalex     = True,
                            scaley     = True,
                            marker     = 'o',
                            markersize = 5.0,
                            linestyle  = '-',
                            linewidth  = 2.0)

        plt.figure(figsize = (25, 10))
        plt.plot(day_labels, per_day_mean.values, **line_style)

        # Title and axis labels
        plt.title(label = 'Trend Line: Average PHQ-9 Score Over 365 Days\n', fontsize = 20)
        plt.xlabel(xlabel = 'Survey Day (Time)', fontsize = 15)
        plt.ylabel(ylabel = 'Average PHQ-9 Score', fontsize = 15)

        # One tick per day on the x-axis, one tick per score point on the y-axis
        plt.xticks(ticks = day_labels, fontsize = 10, rotation = 90)
        plt.yticks(ticks = np.arange(0, 31, 1), fontsize = 10)

        plt.grid(True)
        plt.tight_layout(pad = 2.0, h_pad = 1.0, w_pad = 1.0)
        plt.show()
        
    except Exception as DailyAverageLinePlotError:
        return repr(f'DailyAverageLinePlotError : {DailyAverageLinePlotError}')


# Temporal Clustering

## a. Obtain the optimal number of clusters 

In [7]:
def _plot_cluster_metric(cluster_counts, metric_values, y_label, title):
    """
    Draw one line plot of a clustering quality metric against the number of clusters
    """
    plt.subplots(figsize=(25, 10))

    plt.plot(cluster_counts,
             metric_values,
             scalex     = True,
             scaley     = True,
             marker     = 'o',
             markersize = 5.0,
             linestyle  = '-',
             linewidth  = 2.0)

    # One tick per cluster count, with one spare tick on either side
    plt.xticks(ticks    = np.arange(start = (min(cluster_counts)-1), 
                                    stop  = (max(cluster_counts)+2),
                                    step  = 1,
                                    dtype = int),
               fontsize = 10,
               rotation = 0)
    
    plt.yticks(fontsize = 10,
               rotation = 0)
                   
    plt.xlabel(xlabel   = 'Number of Clusters',
               fontsize = 12)
    
    plt.ylabel(ylabel   = y_label,
               fontsize = 12)
    
    plt.title(label    = title,
              fontsize = 20)
    
    plt.show()


def kmeans_temporal_clustering(input_data : pd.DataFrame, max_clusters:int) -> None:
    """
    Fit K-Means for a series of cluster counts, plot inertia and Silhouette score for each
    of them and suggest a number of clusters by both criteria
    
    Arguments:
    ----------
        input_data  {DataFrame} : Input data for clustering. It is transposed before the
                                  fit, so every column of input_data is one observation;
                                  missing values are replaced by -1
        
        max_clusters   {int}    : Number of candidate cluster counts to evaluate; the
                                  candidates run from 2 to (max_clusters + 1), both inclusive
        
    Errors:
    -------
        No exception is propagated: failures are reported through the returned string
    
    Returns:
    --------
        {tuple} : (elbow_point, optimal_clusters), i.e. the cluster count picked from the
                  largest relative drop of inertia and the cluster count with the highest
                  Silhouette score
        {str}   : Error description if an argument has a wrong type or clustering fails
    """
    if (not isinstance(input_data, pd.DataFrame)):
        return repr(TypeError(f'Expected a pandas DataFrame for the argument : input_data,\
                              got : {type(input_data)} instead')).replace('  ', '')
    
    if (not isinstance(max_clusters, int)):
        return repr(TypeError(f'Expected an integer for the argument : max_clusters, got :\
                                {type(max_clusters)} instead')).replace('  ', '')
    
    try:
        # Transpose so that every column of input_data becomes one row (observation)
        # NOTE(review): fit_clustering_model fits K-Means on the untransposed orientation -
        # confirm which orientation is intended
        required_dataframe = input_data.T
        # Fill the nan values with -1 for computations
        required_dataframe = required_dataframe.fillna(value = -1,
                                                       axis  = 0)
        
        # Cluster counts that are evaluated; also used as x-coordinates of both plots so
        # that every point sits at the cluster count it was computed for
        cluster_counts    = list(range(2, (max_clusters+2)))

        # Initialize lists to save results from each iteration
        inertia_list      = list()
        silhouette_scores = list()
        
        for n_clusters in cluster_counts:
            # Instantiate the K-Means model and fit it on the dataset
            kmeans = cluster.KMeans(n_clusters   = n_clusters,
                                    random_state = 1234)
            kmeans.fit(required_dataframe)
            
            # Inertia of the fitted model and Silhouette score as goodness of fit
            inertia_list.append(kmeans.inertia_)
            silhouette_scores.append(metrics.silhouette_score(X      = required_dataframe,
                                                              labels = kmeans.labels_))
        
        # Elbow Method - Percentage change in inertia between consecutive cluster counts
        inertia_percent_change = [100 * (inertia_list[i] - inertia_list[i-1]) / inertia_list[i-1] for i in range(1, len(inertia_list))]
        elbow_point            = inertia_percent_change.index(min(inertia_percent_change)) + 2  # Add 2 to account for starting from 2 clusters

        # Silhouette Analysis - Optimal number of clusters
        optimal_clusters       = silhouette_scores.index(max(silhouette_scores)) + 2  # Add 2 to account for starting from 2 clusters

        # Plot the inertia values to identify the elbow point
        _plot_cluster_metric(cluster_counts = cluster_counts,
                             metric_values  = inertia_list,
                             y_label        = 'Inertia',
                             title          = 'Optimal Cluster Finding: Elbow Method\n')

        # Plot the Silhouette scores to identify the optimal number of clusters
        _plot_cluster_metric(cluster_counts = cluster_counts,
                             metric_values  = silhouette_scores,
                             y_label        = 'Silhouette Score',
                             title          = 'Optimal Cluster Finding: Silhouette Score Method\n')
        
        return elbow_point, optimal_clusters
    except Exception as KMeansClusteringError:
        return repr(f'KMeansClusteringError : Got : {KMeansClusteringError} while performing\
                      Temporal clustering by K-Means method').replace('  ', '')
     

## b. Fit the Clustering Models with the optimal number of Clusters obtained 

In [8]:
def fit_clustering_model(input_data, optimal_clusters):
    """
    Fit K-Means with the given number of clusters and hand back one label per row of
    the input
    
    Arguments:
    ----------
        input_data       {DataFrame} : Input data; every row is one observation to be
                                       clustered, missing values are replaced by -1
        
        optimal_clusters    {int}    : Number of clusters to fit
        
    Errors:
    -------
        No exception is propagated: a wrong argument type or a failure while fitting is
        reported through the returned string

    Returns:
    --------
        {list} : Cluster label of every row of input_data
        {str}  : Error description if an argument has a wrong type or fitting fails
    """
    if (not isinstance(input_data, pd.DataFrame)):
        return repr(TypeError(f'Expected a pandas DataFrame for the argument : input_data, got\
                                : {type(input_data)} instead')).replace('  ', '')
    
    if (not isinstance(optimal_clusters, int)):
        return repr(TypeError(f'Expected an integer for the argument : optimal_clusters, got : \
                                {type(optimal_clusters)} instead')).replace('  ', '')
    
    try:
        # Missing value treatment is done on the transposed frame ...
        transposed_filled = input_data.T.fillna(value = -1, axis = 0)

        # ... which is flipped back, so the model sees the original orientation
        observations      = transposed_filled.T

        fitted_model      = cluster.KMeans(n_clusters   = optimal_clusters,
                                           random_state = 1234).fit(observations)

        return fitted_model.labels_.tolist()
    
    except Exception as ClusterFittingError:
        return repr(f'ClusterFittingError : Got : {ClusterFittingError} while Fitting clusters\
                      on the original dataset').replace('  ', '')
    

## c. Plot the Clusters along with the original Dataset

In [9]:
def plot_clusters(input_data: pd.DataFrame, n_clusters: int, cluster_labels: list) -> None:
    """
    Plot the input data and mark every change of the cluster label with a dotted
    vertical line

    Arguments:
    ----------
        input_data      {DataFrame} : Input data with Patients as columns and PHQ-9 scores 
                                      of different days as rows
        
        n_clusters         {int}    : Number of clusters; only used in the title
        
        cluster_labels     {list}   : Cluster label of every row (day) of input_data,
                                      in row order
        
    Errors:
    -------
        No exception is propagated: a wrong argument type or a failure while plotting is
        reported through the returned string
    
    Returns:
    --------
        {None} : When the plot has been shown
        {str}  : Error description if an argument has a wrong type or plotting fails
    """
    if not isinstance(input_data, pd.DataFrame):
        return repr(TypeError(f'Expected a pandas DataFrame for the argument: input_data, got: \
                                {type(input_data)} instead')).replace('  ', '')
    
    if not isinstance(n_clusters, int):
        return repr(TypeError(f'Expected an integer for the argument: n_clusters, got: \
                                {type(n_clusters)} instead')).replace('  ', '')
    
    if not isinstance(cluster_labels, list):
        return repr(TypeError(f'Expected a Python list for the argument: cluster_labels, got: \
                                {type(cluster_labels)} instead')).replace('  ', '')
    
    try:
        required_data = input_data.T
        fig, ax = plt.subplots(figsize=(10, 6))
        
        # One scatter series per row of the transposed data; every point is coloured by
        # the relative frequency of its score within that series
        for i in range(required_data.shape[0]):
            scores = required_data.iloc[i]
            score_frequencies = scores.value_counts(normalize=True)
            point_colors = [score_frequencies.get(score, 0) for score in scores]
            plt.scatter(required_data.columns, scores, c=point_colors, cmap='hot', marker='o', label=f'Score {i+1}')
        
        # Positions after which the label changes. The labels arrive as a Python list, and
        # comparing two lists yields a single bool, so compare element-wise on an array
        label_array = np.asarray(cluster_labels)
        cluster_boundaries = np.flatnonzero(label_array[:-1] != label_array[1:])

        # One colour per distinct label; label_positions maps every label to its colour
        unique_labels, label_positions = np.unique(label_array, return_inverse=True)
        palette = plt.cm.nipy_spectral(np.linspace(0, 1, len(unique_labels)))

        # The line is drawn at the last position of a cluster, in that cluster's colour
        for boundary in cluster_boundaries:
            plt.axvline(x=boundary, linestyle='dotted', color=palette[label_positions[boundary]])

        plt.xlabel('Days')
        plt.ylabel('PHQ-9 Score')
        plt.title(f'Clustering Results: {n_clusters} Clusters')
        plt.xticks(rotation=90)
        plt.colorbar(label='Score Frequency')
        plt.show()
        
    except Exception as ClusterPlottingError:
        return repr(f'ClusterPlottingError: Got {ClusterPlottingError} while plotting clusters \
                      on the original dataset').replace('  ', '')


# Drivers

### Driver for generating samples

In [None]:
# Generate the synthetic PHQ-9 table from the notebook-level sample specifications.
# NOTE: on failure sample_allocations returns an error string instead of a DataFrame
sample_data = sample_allocations(total_patients            = total_patients,
                                 total_days                = total_days,
                                 required_sample_count     = required_sample_count,
                                 maximum_surveys_attempted = maximum_surveys_attempted)

# Display the generated table as the cell output
sample_data

In [None]:
# Persist the generated table as CSV, relative to the notebook's working directory
sample_data.to_csv('../data/PHQ_9_sample_dataset.csv')

### Summary Statistics

In [None]:
# Descriptive statistics per row of sample_data: transpose so that describe() works on the
# rows, then transpose back to get one row of statistics per original row
summary_statistics = sample_data.T.describe().T

In [None]:
# Display the first 50 rows of the summary statistics
summary_statistics[:50]

### Scatter Diagram : Day to Day Distribution of PHQ-9 Scores with Patient level granularity

In [None]:
# Plot every recorded score of the sample on a scatter diagram, grouped by row (day) label
scatterDiagram_phq9Scores_all_queryDays(input_data = sample_data)

### Trend Line :  Daily Average of PHQ-9 Sample over a time-period of 365 days

In [None]:
# Plot the row-wise (daily) mean score of the sample as a trend line
daily_average_phq9_line_plot(input_data = sample_data)

### Temporal Clustering : Initialization

In [None]:
# Search for a suitable number of clusters by the Elbow and the Silhouette criteria.
# NOTE: the unpacking assumes a 2-tuple; on failure the function returns an error string
elbow_point, optimal_clusters = kmeans_temporal_clustering(input_data   = sample_data,
                                                           max_clusters = 20)

In [None]:
# Report the cluster counts suggested by the two criteria
print (f'Elbow Point         : {elbow_point}')
print (f'Silhouette Clusters : {optimal_clusters}')


In [None]:
# Fit the K-Means model with the number of clusters suggested by the Silhouette criterion;
# the result is one cluster label per row of sample_data
cluster_labels = fit_clustering_model(input_data       = sample_data,
                                      optimal_clusters = optimal_clusters)

In [None]:
# Plot the sample together with the boundaries between the fitted clusters
plot_clusters(input_data     = sample_data,
              n_clusters     = optimal_clusters,
              cluster_labels = cluster_labels)

In [None]:
from scipy.stats import entropy
from sklearn.cluster import OPTICS
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

def fit_clustering_model_1(input_data, min_samples=2):
    """
    Cluster the columns of the input with OPTICS on a precomputed matrix of KL divergences
    
    The columns are sorted by label, missing values are imputed with the column mean and
    every column is rescaled to the range 0 to 1 before the divergences are computed.
    
    Arguments:
    ----------
        input_data      {DataFrame} : Input data; every column is one item to be clustered
        min_samples     {int}       : min_samples handed to OPTICS (default: 2)
        
    Errors:
    -------
        No exception is propagated: a wrong argument type or a failure while fitting is
        reported through the returned string
        
    Returns:
    --------
        cluster_labels   {np.array} : One cluster label per column of input_data
                         {str}      : Error description if input_data is not a DataFrame
                                      or any exception occurs
    """
    if not isinstance(input_data, pd.DataFrame):
        return repr(TypeError(f"Expected a pandas DataFrame for the argument: input_data, got: {type(input_data)} instead")).replace('  ', '')

    try:
        # Sort the columns by their labels
        input_data = input_data.sort_index(axis=1)
        
        # Impute missing values with the column mean
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        input_data_imputed = pd.DataFrame(imputer.fit_transform(input_data), columns=input_data.columns, index=input_data.index)
        
        # Rescale every column to the range 0 to 1
        scaler = MinMaxScaler()
        input_data_normalized = pd.DataFrame(scaler.fit_transform(input_data_imputed), columns=input_data_imputed.columns, index=input_data_imputed.index)
        
        # Square matrix with one row and one column per item, so that OPTICS returns a
        # label for every item; the diagonal stays 0
        n_items = input_data_normalized.shape[1]
        pairwise_dists = np.zeros((n_items, n_items))

        # NOTE(review): KL divergence is not symmetric and can be infinite; only the
        # (i, j) direction is computed and mirrored - confirm this is acceptable
        for i in range(n_items - 1):
            for j in range(i + 1, n_items):
                dist = entropy(input_data_normalized.iloc[:, i], input_data_normalized.iloc[:, j])
                pairwise_dists[i, j] = dist
                pairwise_dists[j, i] = dist

        # Fit OPTICS clustering model on the precomputed distances
        optics = OPTICS(min_samples=min_samples, metric="precomputed")
        optics.fit(pairwise_dists)

        # Obtain the cluster labels
        cluster_labels = optics.labels_

        # NOTE(review): plot_clusters_with_boundaries is not defined in the visible source;
        # unless it is defined elsewhere this call fails and the error string is returned
        plot_clusters_with_boundaries(input_data_normalized, cluster_labels)

        return cluster_labels

    except Exception as ClusterFittingError:
        return repr(f"ClusterFittingError: {ClusterFittingError} while fitting clusters on the original dataset").replace('  ', '')


In [None]:
# Replace every missing score by -1 on a copy, then run the OPTICS based clustering on it
no_miss_data = sample_data.copy().fillna(-1)
fit_clustering_model_1(input_data  = no_miss_data, 
                       min_samples = 1)

In [None]:
# Display the sample table again as the cell output
sample_data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Hard-coded list of 120 integer observations whose distribution is plotted below
values = [23, 24, 24, 21, 20, 22, 21, 24, 21, 23, 24, 24, 23, 24, 25, 24, 23, 25, 23, 25, 22, 22, 23, 26, 25, 24, 25, 22, 21, 21, 25, 26, 22, 22, 22, 22, 23, 21, 23, 24, 21, 23, 22, 22, 22, 26, 21, 23, 21, 22, 22, 21, 25, 24, 23, 26, 21, 27, 22, 22, 25, 20, 25, 23, 23, 21, 25, 23, 23, 23, 27, 24, 22, 24, 26, 24, 24, 20, 19, 27, 23, 22, 21, 24, 21, 21, 24, 23, 25, 22, 22, 23, 21, 25, 24, 23, 24, 21, 23, 25, 24, 24, 21, 23, 25, 23, 23, 24, 23, 26, 23, 19, 24, 24, 21, 22, 20, 24, 23, 23]

# NOTE(review): seaborn has deprecated distplot in favour of histplot / displot - confirm
# against the installed seaborn version before relying on this cell
sns.distplot(values)


In [None]:
# Second hard-coded list of 120 integer observations whose distribution is plotted below
values_2 = [22, 25, 20, 18, 19, 21, 17, 16, 20, 17, 19, 20, 20, 19, 22, 21, 17, 19, 19, 21, 23, 20, 20, 20, 18, 19, 19, 19, 21, 19, 22, 20, 18, 24, 22, 21, 22, 22, 19, 19, 16, 19, 19, 19, 19, 19, 17, 20, 16, 20, 24, 17, 19, 20, 19, 19, 17, 22, 21, 17, 16, 21, 22, 17, 23, 21, 19, 20, 22, 22, 19, 21, 21, 18, 17, 21, 19, 19, 22, 23, 20, 21, 20, 18, 21, 21, 16, 17, 17, 19, 18, 21, 19, 25, 22, 23, 18, 19, 21, 22, 18, 19, 18, 18, 20, 18, 21, 20, 19, 19, 21, 22, 19, 23, 20, 20, 18, 19, 23, 18]

# NOTE(review): seaborn has deprecated distplot in favour of histplot / displot - confirm
# against the installed seaborn version before relying on this cell
sns.distplot(values_2)


In [None]:
import numpy as np
import random

def generateStepsCountData(total_minutes, desired_sum, desired_range, percentage_zeros):
    """
    Generate a reproducible per-minute step count series with a fixed total

    Random step counts are drawn until the next one would reach desired_sum; all remaining
    minutes are 0. After shuffling, the missing amount is written into the last zero
    minute so that the series adds up to desired_sum exactly. Both random generators are
    seeded with 42, so equal arguments give equal series.

    Arguments:
    ----------
        total_minutes    {int}   : Length of the generated series

        desired_sum      {int}   : Total number of steps the series has to add up to

        desired_range    {tuple} : (low, high); step counts are drawn from low to (high - 1)

        percentage_zeros {float} : Fraction (0 to 1) of the minutes reserved as zero minutes

    Errors:
    -------
        IndexError               : Raised by random.choice if desired_range is empty

    Returns:
    --------
        {np.ndarray}             : Integer array of length total_minutes; it sums to
                                   desired_sum whenever desired_sum is positive and
                                   total_minutes is at least 1
    """
    np.random.seed(42)
    random.seed(42)

    # Calculate the number of zero values and non-zero values
    num_zeros       = int(total_minutes*percentage_zeros)
    num_non_zeros   = (total_minutes - num_zeros)

    # Draw non-zero values until the next one would reach the desired sum; a running total
    # avoids re-summing the whole list on every draw
    candidates      = np.arange(desired_range[0], desired_range[1], 1)
    non_zero_values = list()
    running_sum     = 0
    for _ in range(num_non_zeros):
        non_zero_value = int(random.choice(candidates))
        if ((running_sum + non_zero_value) >= desired_sum):
            break
        non_zero_values.append(non_zero_value)
        running_sum += non_zero_value

    # Non-zero slots that were not used are padded with zeros, so the series always has
    # total_minutes entries
    unused_slots    = (num_non_zeros - len(non_zero_values))
    sample          = np.concatenate((np.asarray(non_zero_values, dtype = int),
                                      np.zeros(shape = (unused_slots + num_zeros), dtype = int)),
                                     axis = 0)

    # Shuffle the sample
    np.random.shuffle(sample)

    # Top up once so that the total matches: prefer the last zero minute, fall back to the
    # last minute if the series holds no zero at all
    shortfall       = (desired_sum - running_sum)
    if ((shortfall > 0) and (sample.size > 0)):
        zero_positions = np.flatnonzero(sample == 0)
        target         = zero_positions[-1] if (zero_positions.size > 0) else (sample.size - 1)
        sample[target] += shortfall

    return sample


In [None]:
# Specifications of the per-minute step count series generated below
total_minutes    = 24 * 60 # 24 hours * 60 minutes
desired_sum      = 3500    # total the generated values are meant to add up to
desired_range    = (1, 150) # values are drawn from np.arange(1, 150), i.e. 1 to 149
percentage_zeros = 0.45    # fraction of the minutes reserved as zero values


In [None]:
# Generate the per-minute step count series from the specifications above
step_count_data = generateStepsCountData(total_minutes    = total_minutes,
                                         desired_sum      = desired_sum,
                                         desired_range    = desired_range,
                                         percentage_zeros = percentage_zeros) 

In [None]:
# Display the generated step count series as the cell output
step_count_data

In [16]:
# Total of the generated series, for comparison against desired_sum
sum(step_count_data)

NameError: name 'step_count_data' is not defined