In [1]:
!pip install tqdm pandas statsmodels openpyxl



In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
ROOT_FOLDER = "data/"

In [4]:
DATABASE_C_PAIRS = "output_data_c.xlsx"
DATABASE_OTHER_PAIRS = "output_data_c_others_all.xlsx"


In [5]:
from statsmodels.stats import inter_rater as irr

In [6]:
NUMBER_OF_CATEGORIES = 5

In [7]:
NUMBER_OF_RATERS = 2

In [8]:
def calculate_fleiss_kappa(drug_list):
    drug_list_len = len(drug_list)
    data_matrix = np.zeros(shape=(drug_list_len,NUMBER_OF_RATERS), dtype=int)
    for i in range(drug_list_len):
        entry = drug_list[i]
        for j in range(0, NUMBER_OF_RATERS):
            data_matrix[i,j] = entry[j+2]
    agg,cat = irr.aggregate_raters(data_matrix) # returns a tuple (data, categories)
    fleiss_score = irr.fleiss_kappa(agg, method = 'fleiss')
    #print(agg)
    #print(fleiss_score)
    if np.isnan(fleiss_score):
        return np.finfo(np.float64).max
    return fleiss_score

### Only C category interactions

In [9]:
output_df_c = pd.read_excel(ROOT_FOLDER+DATABASE_C_PAIRS, sheet_name = "output_data_c")

ValueError: Worksheet named '2 baze de date_ATC_all' not found

In [None]:
output_df_c

In [10]:
# Extract only category names
category_names_a = []
category_names_b = []
for index, row in tqdm(output_df_c.iterrows(), total = output_df_c.shape[0]):
    cat_a = row["ClassA"]
    cat_b = row["ClassB"]
    if cat_a not in category_names_a:
        category_names_a.append(cat_a)
    if cat_b not in category_names_b:
        category_names_b.append(cat_b);

NameError: name 'output_df_c' is not defined

In [11]:
category_names_a

[]

In [12]:
category_names_b

[]

In [13]:
category_pairs = []
for i in category_names_a:
    for j in category_names_b:
        if i == j:
            continue
        category_pairs.append((i,j))

In [14]:
category_pairs

[]

In [15]:
def search_dataframe(df, column, value):
    return df.loc[df[column].str.lower() == value.lower()]


In [16]:
def search_multiple_columns(df, col1, val1, col2, val2):
    res_col_1 = search_dataframe(df, col1, val1)
    return search_dataframe(res_col_1, col2,val2)

In [17]:
for item in category_pairs:
    drug_list = search_multiple_columns(output_df_c, "ClassA", item[0], "ClassB", item[1])
    drug_list_concatenated = []
    for index, row in drug_list.iterrows():
        drug_1 = row["DrugA"]
        drug_2 = row["DrugB"]

        drug_com = int(row["DrugsScore"])

        webmd = int(row["WebmdScore"])

        if drug_com == -1 or webmd == -1:
            continue

        if drug_com == 5:
            drug_com = 4    
        drug_list_concatenated.append((drug_1, drug_2,drug_com,webmd))
    try:    
        fleiss_score = calculate_fleiss_kappa(drug_list_concatenated)
        print(f"{item[0]}->{item[1]}: {fleiss_score}")
    except:
        print(f"{item[0]}->{item[1]}: NAN")

### C-> Others

In [18]:
output_df_c_others = pd.read_excel(ROOT_FOLDER+DATABASE_OTHER_PAIRS, sheet_name = "output_data_c_others_all")

In [19]:
output_df_c_others

Unnamed: 0.1,Unnamed: 0,ClassA,DrugA,ClassB,DrugB,DrugsScore,WebmdScore
0,0,C10,atorvastatin,B01,warfarin,1,0
1,1,C10,atorvastatin,B01,dalteparin,0,0
2,2,C10,atorvastatin,B01,enoxaparin,0,0
3,3,C10,atorvastatin,B01,heparin,0,0
4,4,C10,atorvastatin,B01,tinzaparin,0,0
...,...,...,...,...,...,...,...
19647,6495,C01,regadenoson,G04,Silodosin,0,0
19648,6496,C01,regadenoson,G04,Tamsulosin,0,0
19649,6497,C01,regadenoson,G04,Terazosin,0,0
19650,6498,C01,regadenoson,G04,Dutasteride,0,0


In [20]:
# Extract only category names
category_names_a = []
category_names_b = []
for index, row in tqdm(output_df_c_others.iterrows(), total = output_df_c_others.shape[0]):
    cat_a = row["ClassA"]
    cat_b = row["ClassB"]
    if cat_a not in category_names_a:
        category_names_a.append(cat_a)
    if cat_b not in category_names_b:
        category_names_b.append(cat_b);

100%|██████████████████████████████████████████████████████████████████████████| 19652/19652 [00:00<00:00, 28122.56it/s]


In [21]:
category_names_a

['C10', 'C09', 'C08', 'C07', 'C04', 'C03', 'C02', 'C01']

In [22]:
category_names_b

['B01',
 'M01',
 'M04',
 'A02',
 'A10',
 'A12',
 'N05BC',
 'N05A',
 'N06A',
 'N06BD',
 'G04']

In [23]:
category_pairs = []
for i in category_names_a:
    for j in category_names_b:
        if i == j:
            continue
        category_pairs.append((i,j))

In [24]:
category_pairs

[('C10', 'B01'),
 ('C10', 'M01'),
 ('C10', 'M04'),
 ('C10', 'A02'),
 ('C10', 'A10'),
 ('C10', 'A12'),
 ('C10', 'N05BC'),
 ('C10', 'N05A'),
 ('C10', 'N06A'),
 ('C10', 'N06BD'),
 ('C10', 'G04'),
 ('C09', 'B01'),
 ('C09', 'M01'),
 ('C09', 'M04'),
 ('C09', 'A02'),
 ('C09', 'A10'),
 ('C09', 'A12'),
 ('C09', 'N05BC'),
 ('C09', 'N05A'),
 ('C09', 'N06A'),
 ('C09', 'N06BD'),
 ('C09', 'G04'),
 ('C08', 'B01'),
 ('C08', 'M01'),
 ('C08', 'M04'),
 ('C08', 'A02'),
 ('C08', 'A10'),
 ('C08', 'A12'),
 ('C08', 'N05BC'),
 ('C08', 'N05A'),
 ('C08', 'N06A'),
 ('C08', 'N06BD'),
 ('C08', 'G04'),
 ('C07', 'B01'),
 ('C07', 'M01'),
 ('C07', 'M04'),
 ('C07', 'A02'),
 ('C07', 'A10'),
 ('C07', 'A12'),
 ('C07', 'N05BC'),
 ('C07', 'N05A'),
 ('C07', 'N06A'),
 ('C07', 'N06BD'),
 ('C07', 'G04'),
 ('C04', 'B01'),
 ('C04', 'M01'),
 ('C04', 'M04'),
 ('C04', 'A02'),
 ('C04', 'A10'),
 ('C04', 'A12'),
 ('C04', 'N05BC'),
 ('C04', 'N05A'),
 ('C04', 'N06A'),
 ('C04', 'N06BD'),
 ('C04', 'G04'),
 ('C03', 'B01'),
 ('C03', 'M01'),
 

In [25]:
for item in category_pairs:
    drug_list = search_multiple_columns(output_df_c_others, "ClassA", item[0], "ClassB", item[1])
    drug_list_concatenated = []
    for index, row in drug_list.iterrows():
        drug_1 = row["DrugA"]
        drug_2 = row["DrugB"]

        drug_com = int(row["DrugsScore"])

        webmd = int(row["WebmdScore"])

        if drug_com == -1 or webmd == -1:
            continue

        if drug_com == 5:
            drug_com = 4    
        drug_list_concatenated.append((drug_1, drug_2,drug_com,webmd))
    try:    
        fleiss_score = calculate_fleiss_kappa(drug_list_concatenated)
        print(f"{item[0]}->{item[1]}: {fleiss_score}")
    except:
        print(f"{item[0]}->{item[1]}: NAN")

C10->B01: 0.4742113169754617
C10->M01: 0.21214004936465808
C10->M04: 0.6144922363019812
C10->A02: 0.06831648658069107
C10->A10: 0.5117348743955653
C10->A12: 1.7976931348623157e+308
C10->N05BC: 0.25333333333333313
C10->N05A: -0.010132501948551883
C10->N06A: 0.08603667136812532
C10->N06BD: 0.6622889305816152
C10->G04: -0.020242914979752082
C09->B01: 0.440156562658047
C09->M01: 0.23675658744763312


  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


C09->M04: 0.801047120418848
C09->A02: -0.007836045810726314
C09->A10: 0.7958377574856657
C09->A12: -0.5704187058183795
C09->N05BC: -0.27611818828226065
C09->N05A: -0.38088829071332414
C09->N06A: -0.21005586592178724
C09->N06BD: 1.7976931348623157e+308
C09->G04: 0.9108997119313993
C08->B01: 0.10750234867803046
C08->M01: -0.31143921622740445
C08->M04: 0.5319148936170212
C08->A02: 0.4618235730170497
C08->A10: 0.4549577640455813
C08->A12: 1.7976931348623157e+308
C08->N05BC: -0.29351946010908747
C08->N05A: -0.39025134832604014
C08->N06A: -0.006901561932437382
C08->N06BD: -0.23066798288910828
C08->G04: 0.11812827225130877
C07->B01: 0.07087144616485792
C07->M01: 1.0
C07->M04: 1.0
C07->A02: 0.12194522903107051
C07->A10: 0.16147765768672373
C07->A12: -0.3000000000000003
C07->N05BC: -0.16374187772993168
C07->N05A: -0.44866997754949306
C07->N06A: -0.18948932509580071
C07->N06BD: -0.18264520643806847
C07->G04: 0.9781171095346881
C04->B01: -0.12784118154790436
C04->M01: -0.09848794442174093
C04->M0