In [22]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist

In [23]:
def one_hot_encode(sequence, alphabet_size):
    """
    Convert a sequence into a one-hot encoded matrix.

    Args:
        sequence: List of integers (e.g., [0, 1, 2, 3])
        alphabet_size: Total number of unique symbols (e.g., 4 for [0, 1, 2, 3])

    Returns:
        A numpy array of shape (len(sequence), alphabet_size)
    """
    one_hot_matrix = np.zeros((len(sequence), alphabet_size), dtype=int)
    for idx, value in enumerate(sequence):
        one_hot_matrix[idx, value] = 1
    return one_hot_matrix

In [3]:
df = pd.read_csv("../../dataset/df_cleaned_1atc.tsv", sep="\t")
df["drug_era_start_date"] = pd.to_datetime(df["drug_era_start_date"])
df["drug_era_end_date"] = pd.to_datetime(df["drug_era_end_date"])
df_n06a = (
    df[df["atc_level3"] == "N06A"]
    .sort_values(by=["eid", "drug_era_start_date"])
    .reset_index(drop=True)
)
df_n06a.head()

Unnamed: 0,eid,drug_era_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days,concept_name,atc_code,duration,atc_level3
0,1000014,128849065239,710062,2010-02-15,2010-02-26,2,0,amitriptyline,N06AA09,12,N06A
1,1000014,1245540543307,710062,2010-04-23,2010-04-30,1,0,amitriptyline,N06AA09,8,N06A
2,1000014,670014960364,710062,2010-06-25,2010-07-02,1,0,amitriptyline,N06AA09,8,N06A
3,1000014,1228360666726,710062,2014-03-10,2014-03-17,1,0,amitriptyline,N06AA09,8,N06A
4,1000041,661424994579,797617,2014-02-14,2014-02-27,1,0,citalopram,N06AB04,14,N06A


In [4]:
# create a mapping for each atc_code to a number
atc_code_to_num = {
    atc_code: i + 1 for i, atc_code in enumerate(df_n06a["atc_code"].unique())
}
df_n06a["atc_code_num"] = df_n06a["atc_code"].map(atc_code_to_num)
df_n06a.head()

Unnamed: 0,eid,drug_era_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days,concept_name,atc_code,duration,atc_level3,atc_code_num
0,1000014,128849065239,710062,2010-02-15,2010-02-26,2,0,amitriptyline,N06AA09,12,N06A,1
1,1000014,1245540543307,710062,2010-04-23,2010-04-30,1,0,amitriptyline,N06AA09,8,N06A,1
2,1000014,670014960364,710062,2010-06-25,2010-07-02,1,0,amitriptyline,N06AA09,8,N06A,1
3,1000014,1228360666726,710062,2014-03-10,2014-03-17,1,0,amitriptyline,N06AA09,8,N06A,1
4,1000041,661424994579,797617,2014-02-14,2014-02-27,1,0,citalopram,N06AB04,14,N06A,2


In [21]:
alphabet_size = len(atc_code_to_num) + 1
print(f"alphabet size: {alphabet_size}")

alphabet size: 10


In [5]:
# create sequences for each eid
def eid_to_sequence(eid_df: pd.DataFrame):
    # Sort by start date
    eid_df = eid_df.sort_values("drug_era_start_date")

    # Initialize sequence
    sequence = []

    # Get the first and last date to establish timeline
    start_date = eid_df["drug_era_start_date"].min()
    end_date = eid_df["drug_era_end_date"].max()

    # Create a dictionary of dates and their corresponding ATC codes
    date_dict = {}
    for _, row in eid_df.iterrows():
        dates = pd.date_range(row["drug_era_start_date"], row["drug_era_end_date"])
        for date in dates:
            if date in date_dict:
                if isinstance(date_dict[date], list):
                    date_dict[date].append(row["atc_code_num"])
                else:
                    date_dict[date] = [date_dict[date], row["atc_code_num"]]
            else:
                date_dict[date] = row["atc_code_num"]

    # Create sequence for all dates
    all_dates = pd.date_range(start_date, end_date)
    for date in all_dates:
        if date in date_dict:
            # If multiple drugs on same day, use the first one
            # (you could modify this to handle multiple drugs differently)
            value = date_dict[date]
            if isinstance(value, list):
                sequence.append(value[0])
            else:
                sequence.append(value)
        else:
            pass  # ignore dates with no drug
            # sequence.append(0)

    return sequence


eids = np.random.choice(df_n06a["eid"].unique(), 1000)
sequences = [eid_to_sequence(df_n06a[df_n06a["eid"] == eid]) for eid in eids]
sequence_lengths = [len(sequence) for sequence in sequences]

print(f"average sequence length: {np.mean(sequence_lengths)}")
print(f"std sequence length: {np.std(sequence_lengths)}")
print(f"max sequence length: {np.max(sequence_lengths)}")
print(f"min sequence length: {np.min(sequence_lengths)}")

average sequence length: 825.983
std sequence length: 1299.2654819978095
max sequence length: 7382
min sequence length: 1


In [39]:
import numpy as np


def one_hot_encode(sequence, alphabet_size):
    """
    Convert a sequence into a one-hot encoded matrix based on unique values.

    Args:
        sequence: List of integers (e.g., [0, 1, 2, 3]).
        unique_values: List or set of all unique values across both sequences.

    Returns:
        A numpy array of shape (len(sequence), len(unique_values)).
    """
    unique_values = list(range(alphabet_size))
    value_to_index = {value: idx for idx, value in enumerate(unique_values)}
    one_hot_matrix = np.zeros((len(sequence), len(unique_values)), dtype=int)
    for idx, value in enumerate(sequence):
        one_hot_matrix[idx, value_to_index[value]] = 1
    return one_hot_matrix


def cosine_similarity(vec1, vec2):
    """
    Compute the cosine similarity between two vectors.

    Args:
        vec1, vec2: Input vectors (1D arrays or matrices).

    Returns:
        Cosine similarity between the vectors.
    """
    dot_product = np.dot(vec1.ravel(), vec2.ravel())
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0  # Handle zero-magnitude vectors
    return dot_product / (norm_vec1 * norm_vec2)


def sliding_window_cosine_similarity(sequence1, sequence2, alphabet_size):
    """
    Compute the cosine similarity between the shorter sequence and every possible
    subsequence of the longer sequence using a sliding window approach.

    Args:
        sequence1: List or array-like, first sequence.
        sequence2: List or array-like, second sequence.

    Returns:
        A list of cosine similarity values for each subsequence of the longer sequence.
    """
    # Determine the shorter and longer sequence
    if len(sequence1) <= len(sequence2):
        shorter = sequence1
        longer = sequence2
    else:
        shorter = sequence2
        longer = sequence1

    # One-hot encode the sequences
    one_hot_shorter = one_hot_encode(shorter, alphabet_size)
    one_hot_longer = one_hot_encode(longer, alphabet_size)
    # Length of the sliding window (same as the shorter sequence)
    window_size = len(shorter)

    # Collect cosine similarities for each sliding window
    similarities = []
    for i in range(len(longer) - window_size + 1):
        # Extract the current subsequence (one-hot encoded)
        one_hot_subsequence = one_hot_longer[i : i + window_size, :]

        # Compute cosine similarity between the shorter sequence and the subsequence
        similarity = cosine_similarity(one_hot_shorter, one_hot_subsequence)
        similarities.append(similarity)

    return max(similarities)


# Example Usage
sequence1 = [0, 1, 2]  # First sequence
sequence2 = [2, 0, 1, 2, 3]  # Second sequence

# Compute cosine similarities
similarities = sliding_window_cosine_similarity(sequence1, sequence2, 4)

print("Cosine similarities for each subsequence of the longer sequence:", similarities)


Cosine similarities for each subsequence of the longer sequence: 1.0000000000000002


In [44]:
# Example Usage
sequence1 = sequences[10]
sequence2 = sequences[12]

# Compute cosine similarities
similarities = sliding_window_cosine_similarity(sequence1, sequence2, alphabet_size)

print("Cosine similarities for each subsequence of the longer sequence:", similarities)


Cosine similarities for each subsequence of the longer sequence: 1.0000000000000002
