In [154]:
import csv
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

# Silence pandas' SettingWithCopyWarning for the whole notebook. Later cells
# assign into filtered DataFrame slices, which would otherwise emit the warning.
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Directory prefix (with trailing slash) that file names are appended to.
# NOTE(review): machine-specific absolute path, and the same assignment is
# repeated in the data-loading cell below - consider one configurable constant.
dataset_path = "/home/samuel/NYU/BDS/project/meddra/"

def read_tsv_file(file_path, headers = True):
    """Load a tab-separated text file into a pandas DataFrame.

    Every cell is kept as the string produced by ``csv.reader``; no type
    inference is performed.

    Parameters
    ----------
    file_path : str or path-like
        Location of the UTF-8 encoded TSV file.
    headers : bool, default True
        When truthy, the first row is consumed, printed, and used as the
        column names. When falsy, all rows are data and the columns get
        pandas' default integer labels.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table. ``None`` is returned (after printing a message)
        when the file is missing or any other error occurs while reading.
    """
    try:
        # Keep the header row in its own variable. Re-using the `headers`
        # flag for it made the later "were headers requested?" check always
        # fail, so the column names were silently dropped.
        column_names = None
        with open(file_path, 'r', newline='', encoding='utf-8') as file:
            # Using the CSV reader with the tab delimiter
            reader = csv.reader(file, delimiter='\t')

            if headers:
                # A completely empty file yields None instead of StopIteration.
                column_names = next(reader, None)
                print("Headers:", column_names)

            # Remaining rows are the data.
            data = list(reader)

        if column_names is not None:
            return pd.DataFrame(data, columns=column_names)
        return pd.DataFrame(data)

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
    except Exception as e:
        # Best-effort loader: report the problem and fall through to None.
        print(f"An error occurred: {e}")
    return None

In [46]:
# Load the input tables used by the notebook.
# CID_df and meddra_df are loaded here but not referenced by the later cells
# visible in this notebook export.
CID_df = pd.read_csv("../sider_extract/drug_names_with_CID.csv")
dataset_path = "/home/samuel/NYU/BDS/project/meddra/"
meddra_file_path = dataset_path + 'meddra_all_se.tsv'
# headers = False: every row of the TSV, including the first, is treated as data.
meddra_df = read_tsv_file(meddra_file_path, headers = False)
# Review table; later cells read its "drugName", "condition", "rating" and "review" columns.
review_df = pd.read_csv("../review_dataset/drugsComTrain_processed.csv")


In [67]:
# Side-effect table: drop the "Unnamed: 0" column first so that drop_duplicates()
# compares only the remaining columns when removing repeated rows.
side_effects_df = pd.read_csv("../sider_extract/side_effects.csv").drop(columns="Unnamed: 0").drop_duplicates()

In [68]:
# Display the side-effect rows whose drug_name is exactly "Levonorgestrel".
side_effects_df[side_effects_df["drug_name"] == "Levonorgestrel"]

Unnamed: 0,drug_name,side_effect
0,Levonorgestrel,Abdominal distension
2,Levonorgestrel,Abdominal pain
3,Levonorgestrel,Gastrointestinal pain
5,Levonorgestrel,Acute tonsillitis
7,Levonorgestrel,Alopecia
...,...,...
395,Levonorgestrel,Breast neoplasm
397,Levonorgestrel,Somnolence
399,Levonorgestrel,Vaginal haemorrhage
401,Levonorgestrel,Abnormal vision


In [69]:
# Number of rows in the review table.
len(review_df)

161297

In [240]:
def extract_age(review):
    """Pull a self-reported age out of free-text review.

    The review is lower-cased and searched for the first occurrence of each
    age phrase, in priority order: "years old", "year-old", "years-old",
    "year old", "y/o", "years of age". For each phrase found, a small window
    (3 characters before the phrase through 10 characters after its start)
    is scanned for a standalone run of digits.

    Parameters
    ----------
    review : str
        Review text.

    Returns
    -------
    str or int
        The matched digits as a string (e.g. ``"25"``), or the integer ``-1``
        when no age phrase with a nearby number is found.
    """
    review = review.lower()
    age_markers = ("years old", "year-old", "years-old", "year old", "y/o", "years of age")

    for marker in age_markers:
        age_index = review.find(marker)
        if age_index == -1:
            continue
        # Clamp the window start at 0: a negative start would make the slice
        # count from the END of the string, so an age at the very beginning
        # of the review ("5 years old ...") was missed.
        window_start = max(age_index - 3, 0)
        match = re.search(r"\b\d+\b", review[window_start:age_index + 10])
        if match:
            return match.group()

    return -1

def extract_married(review):
    """Return True when the review contains any marriage-related keyword.

    Matching is a plain, case-sensitive substring test, so the keyword may
    also be found inside a longer word.
    """
    marital_terms = ("husband", "wife", "married", "marriage", "spouse")
    return any(term in review for term in marital_terms)

def extract_children(review):
    """Return True when the review mentions children.

    Uses a case-sensitive substring test. "child" and "kid" already match
    their plurals "children" and "kids", so two stems cover all four words.
    """
    child_stems = ("child", "kid")
    return any(stem in review for stem in child_stems)

def extract_male(review):
    """Return True when the review contains " male" (with a leading space).

    The leading space keeps "female" from matching; as a consequence a
    review that starts with "male" is not matched either.
    """
    return " male" in review

def extract_female(review):
    """Return True when the substring "female" occurs in the review (case-sensitive)."""
    return "female" in review
    


In [248]:
def create_patient_matrix(drug_name, review_df, side_effect_df, min_mentions = 5):
    """Build a per-review feature matrix for a single drug.

    Each review of ``drug_name`` becomes one row containing the drug,
    condition and rating, one boolean column per frequently mentioned side
    effect, and demographic hints extracted from the review text.

    Parameters
    ----------
    drug_name : str
        Exact value to match in ``review_df["drugName"]`` and
        ``side_effect_df["drug_name"]``.
    review_df : pandas.DataFrame
        Must contain the columns "drugName", "condition", "rating", "review".
    side_effect_df : pandas.DataFrame
        Must contain the columns "drug_name" and "side_effect".
    min_mentions : int, default 5
        A side effect is kept only when it is mentioned in MORE than this
        many reviews.

    Returns
    -------
    patient_matrix : pandas.DataFrame
        Columns: "drug_name", "condition", "rating", one boolean column per
        relevant side effect, then "children", "married", "female", "male"
        (booleans) and "age" (digit string from ``extract_age``, NaN when no
        age was found). The "age" column is always present.
    drug_name_review_df : pandas.DataFrame
        The reviews for ``drug_name`` (a copy; ``review_df`` is not modified)
        with the boolean side-effect columns added.
    relevant_side_effects : list of str
        Names of the side effects that passed the ``min_mentions`` threshold.

    Notes
    -----
    Side effects are matched as case-insensitive substrings of the review,
    so a short name also matches inside longer words.
    """
    # Explicit copy: columns are added below and must not touch review_df.
    drug_name_review_df = review_df[review_df["drugName"] == drug_name].copy()
    drug_name_side_effect_df = side_effect_df[side_effect_df["drug_name"] == drug_name]
    print(f"Size of review dataset: {len(drug_name_review_df)}")
    print(f"Number of side effects: {len(drug_name_side_effect_df)}")

    # Lower-case once; missing reviews become empty strings instead of crashing.
    reviews_lower = drug_name_review_df["review"].fillna("").astype(str).str.lower()

    # One vectorised substring scan per side effect. dict.fromkeys keeps the
    # original order while making sure each name is evaluated only once.
    relevant_side_effects = []
    for side_effect_name in dict.fromkeys(drug_name_side_effect_df["side_effect"].dropna()):
        mentions = reviews_lower.str.contains(side_effect_name.lower(), regex=False)
        if mentions.sum() > min_mentions:
            relevant_side_effects.append(side_effect_name)
            drug_name_review_df[side_effect_name] = mentions

    patient_matrix = drug_name_review_df[["drugName", "condition", "rating"] + relevant_side_effects].rename(columns={"drugName": "drug_name"})

    # Demographic flags from keyword matching on the lower-cased text.
    patient_matrix["children"] = reviews_lower.map(extract_children).astype(bool)
    patient_matrix["married"] = reviews_lower.map(extract_married).astype(bool)
    patient_matrix["female"] = reviews_lower.map(extract_female).astype(bool)
    patient_matrix["male"] = reviews_lower.map(extract_male).astype(bool)

    # extract_age signals "not found" with -1; store that as NaN. Assigning the
    # whole column guarantees "age" exists even when no review states an age.
    ages = reviews_lower.map(extract_age)
    patient_matrix["age"] = ages.where(ages != -1)

    return patient_matrix, drug_name_review_df, relevant_side_effects

# Build the patient matrix for "Levonorgestrel"; the three results are reused by the cells below.
patient_matrix, drug_name_review_df, relevant_side_effects = create_patient_matrix("Levonorgestrel", review_df=review_df, side_effect_df=side_effects_df)

Size of review dataset: 3657
Number of side effects: 220
male
male


In [262]:
def collaborative_filtering(patient_matrix, drug_name_review_df, relevant_side_effects, query_patient = None, top_n = 20):
    """Rank patients by demographic similarity and tally their side effects.

    The demographic columns of ``patient_matrix`` (everything except
    "drug_name", "condition", "rating" and the side-effect columns) are
    z-score normalised, compared to ``query_patient`` with cosine similarity,
    and the side effects of the ``top_n`` most similar patients are summed
    and printed.

    Parameters
    ----------
    patient_matrix : pandas.DataFrame
        Output of ``create_patient_matrix``; must contain an "age" column.
    drug_name_review_df : pandas.DataFrame
        Unused; kept so existing callers keep working.
    relevant_side_effects : list of str
        Side-effect column names present in ``patient_matrix``.
    query_patient : array-like, optional
        Normalised feature vector, in the same column order as the
        demographic columns. Defaults to the first patient with a usable age.
    top_n : int, default 20
        Number of most similar patients whose side effects are summed.

    Returns
    -------
    patient_matrix : pandas.DataFrame
        The input, unchanged.
    collaborative_filtering_df : pandas.DataFrame
        Demographic features of the patients with a usable age, plus a
        "similarity" column.

    Raises
    ------
    ValueError
        If no patient has a usable (numeric) age.
    """
    collaborative_filtering_df = patient_matrix.drop(columns=relevant_side_effects + ["drug_name", "condition", "rating"])

    # Keep only rows whose age parses as a number. This covers every
    # "missing" spelling (NaN, "na", other text); NaN rows must not reach
    # cosine_similarity, which rejects non-finite input.
    has_age = pd.to_numeric(collaborative_filtering_df["age"], errors="coerce").notna()
    collaborative_filtering_df = collaborative_filtering_df[has_age].copy()
    if collaborative_filtering_df.empty:
        raise ValueError("No patients with a usable age to compare against.")

    for col in collaborative_filtering_df.columns:
        if collaborative_filtering_df[col].dtype == 'object':
            collaborative_filtering_df[col] = pd.to_numeric(collaborative_filtering_df[col])

    # Normalise on a float copy so the returned frame keeps its original dtypes.
    features = collaborative_filtering_df.astype(float)
    collaborative_filtering_df_norm = (features - features.mean()) / features.std()

    # Constant columns have zero std and normalise to all-NaN; treat them as 0.
    all_nan_columns = collaborative_filtering_df_norm.columns[collaborative_filtering_df_norm.isna().all()]
    collaborative_filtering_df_norm[all_nan_columns] = collaborative_filtering_df_norm[all_nan_columns].fillna(0)

    # `is None`, not `== None`: an array/Series query would compare
    # element-wise and make the `if` ambiguous.
    if query_patient is None:
        query_patient = collaborative_filtering_df_norm.iloc[0]
    query_vector = np.asarray(query_patient, dtype=float).reshape(1, -1)

    cosine_similarities = cosine_similarity(collaborative_filtering_df_norm, query_vector)
    collaborative_filtering_df["similarity"] = cosine_similarities.ravel()

    # Join only the similarity score; bringing the demographic columns along
    # would duplicate them in patient_matrix with _x/_y suffixes.
    similar_patients = pd.merge(
        patient_matrix,
        collaborative_filtering_df[["similarity"]],
        left_index=True,
        right_index=True,
    ).sort_values(by="similarity", ascending=False)
    likely_side_effects = similar_patients.head(top_n)[relevant_side_effects].sum().sort_values(ascending=False)

    print(likely_side_effects)
    return patient_matrix, collaborative_filtering_df
    
# Run with the default query (no query_patient given); the function prints the
# summed side effects of the most similar patients.
new_patient_matrix, collaborative_filtering_df = collaborative_filtering(patient_matrix, drug_name_review_df, relevant_side_effects)


Pain                 13
Nausea                2
Discomfort            1
Pregnancy             1
Migraine              1
Depression            1
Tension               1
Redness               1
Dizziness             1
Tenderness            0
Breast tenderness     0
Pelvic pain           0
Mood swings           0
Vomiting              0
Shock                 0
Ectopic pregnancy     0
Crying                0
Acne                  0
Polyp                 0
Abdominal pain        0
Anxiety               0
Breast pain           0
Leg pain              0
Headache              0
Fatigue               0
Rash                  0
Endometriosis         0
Cyst                  0
Cough                 0
Ovarian cyst          0
Constipation          0
Infection             0
Back pain             0
Insomnia              0
dtype: int64


In [251]:
# Show the demographic feature table ordered from most to least similar.
print(collaborative_filtering_df.sort_values(by="similarity", ascending = False))

        children  married  female   male  age  similarity
465        False    False   False  False   20    1.000000
156167     False    False   False  False   20    1.000000
27696      False    False   False  False   20    1.000000
38769      False    False   False  False   20    1.000000
19001      False    False   False  False   20    1.000000
...          ...      ...     ...    ...  ...         ...
15486       True    False   False  False   28   -0.889120
134424      True    False   False  False   30   -0.889308
39763       True    False   False  False   29   -0.891541
11439       True    False   False  False   29   -0.891541
149185      True    False   False  False   29   -0.891541

[259 rows x 6 columns]


In [252]:
# Repeats, at notebook level, the index-on-index merge that collaborative_filtering
# performs internally, so that similar_patients can be inspected here.
# "female" and "male" are not dropped from the right-hand frame, so they exist on
# both sides of the merge.
similar_patients = pd.merge(patient_matrix, collaborative_filtering_df.drop(columns=["age", "married", "children"]), left_index = True, right_index = True).sort_values(by="similarity", ascending = False)

In [253]:
# Sum each side-effect flag over the 20 most similar patients, largest total first.
similar_patients.head(20)[relevant_side_effects].sum().sort_values(ascending=False)

Pain                 13
Nausea                2
Discomfort            1
Pregnancy             1
Migraine              1
Depression            1
Tension               1
Redness               1
Dizziness             1
Tenderness            0
Breast tenderness     0
Pelvic pain           0
Mood swings           0
Vomiting              0
Shock                 0
Ectopic pregnancy     0
Crying                0
Acne                  0
Polyp                 0
Abdominal pain        0
Anxiety               0
Breast pain           0
Leg pain              0
Headache              0
Fatigue               0
Rash                  0
Endometriosis         0
Cyst                  0
Cough                 0
Ovarian cyst          0
Constipation          0
Infection             0
Back pain             0
Insomnia              0
dtype: int64

In [214]:
# Look up the row with index label 21161 and show its review text.
drug_name_review_df.loc[21161]["review"]

