In [1]:
!pip install tqdm pandas statsmodels openpyxl

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/de/ce/b5d9c7ce1aaf9023b823c81932a50cd5e8f407198a696b0d1c6025a40b03/pandas-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading pandas-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting statsmodels
  Downloading statsmodels-0.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
ROOT_FOLDER = "data/"

In [6]:
DATABASE_C_PAIRS = "output_data_c_new.xlsx"
DATABASE_OTHER_PAIRS = "output_data_c_others_all.xlsx"
SHEET_NAME = "2 baze de date_ATC_all"

In [7]:
from statsmodels.stats import inter_rater as irr

## Set the number of categories

1. 4=X=major--contraindicated combination of drugs
2. 3=D=major--avoid, use alternative drug
3. 2=C=major/moderate--monitor patient's therapy
4. 1=B=minor--no action needed
5. 0=no interaction found
6. -1 = no record

## Mapping in parsers:
    
**Drugs.com**
1. Not Found = 0
2. Minor = 1
3. Moderate = 2
4. Major = 3
5. Avoid = 4
6. Contraindicated = 5

**Webmd**
1. Not Found = 0
2. Minor = 1
3. Monitor Closely = 2
4. Serious = 3
5. Don't use = 4


*NOTE*: For drugs.com, category 5 is merged into 4!



In [8]:
# Size of the rating scale: severities 0-4 (Drugs.com category 5 is merged into 4;
# -1 "no record" rows are dropped before scoring).
# NOTE(review): no code visible in this notebook reads this constant — confirm it is still needed.
NUMBER_OF_CATEGORIES = 5

In [9]:
# Number of rating sources compared per drug pair. calculate_fleiss_kappa reads this many
# consecutive scores from each entry, starting at index 2.
NUMBER_OF_RATERS = 2

In [10]:
def calculate_fleiss_kappa(drug_list):
    """Compute Fleiss' kappa for the ratings stored in ``drug_list``.

    Parameters
    ----------
    drug_list : sequence
        One entry per rated drug pair. Positions ``2`` through
        ``2 + NUMBER_OF_RATERS - 1`` of every entry hold the integer category
        assigned by each rater; the leading positions are not read here.

    Returns
    -------
    float
        The value of ``irr.fleiss_kappa(..., method='fleiss')``. When that
        value is NaN, ``np.finfo(np.float64).max`` is returned instead as a
        sentinel (presumably the degenerate case where agreement cannot be
        defined, e.g. all ratings in one category — verify against statsmodels).

    Raises
    ------
    ValueError
        If ``drug_list`` is empty.
    """
    # Fail early with a readable message instead of letting an empty matrix
    # reach statsmodels.
    if len(drug_list) == 0:
        raise ValueError(
            "drug_list is empty: Fleiss' kappa needs at least one rated pair"
        )

    # Rows are subjects (drug pairs), columns are raters; values are cast to int.
    data_matrix = np.array(
        [[entry[2 + j] for j in range(NUMBER_OF_RATERS)] for entry in drug_list],
        dtype=int,
    )

    # aggregate_raters returns (count table, categories); only the table is needed.
    agg, _categories = irr.aggregate_raters(data_matrix)
    fleiss_score = irr.fleiss_kappa(agg, method='fleiss')

    if np.isnan(fleiss_score):
        return np.finfo(np.float64).max
    return fleiss_score

### Only C category interactions

In [32]:
output_df_c = pd.read_excel(ROOT_FOLDER+DATABASE_C_PAIRS, sheet_name = SHEET_NAME)

In [33]:
output_df_c

Unnamed: 0.1,Unnamed: 0,ClassA,DrugA,ClassB,DrugB,DrugsScore,WebmdScore
0,0,C10,atorvastatin,C09,benazepril,0,0
1,1,C10,atorvastatin,C09,captopril,0,0
2,2,C10,atorvastatin,C09,enalapril,0,0
3,3,C10,atorvastatin,C09,fosinopril,0,0
4,4,C10,atorvastatin,C09,lisinopril,0,0
...,...,...,...,...,...,...,...
7654,8108,C02,doxazosin,C01,indomethacin,0,2
7655,8109,C02,doxazosin,C01,ivabradine,0,0
7656,8110,C02,doxazosin,C01,mavacamten,2,0
7657,8111,C02,doxazosin,C01,ranolazine,0,0


In [34]:
drug_list_concatenated = []
for index, row in tqdm(output_df_c.iterrows(), total=output_df_c.shape[0]):
    drug_1, drug_2 = row["DrugA"], row["DrugB"]
    drug_com, webmd = int(row["DrugsScore"]), int(row["WebmdScore"])

    # A score of -1 means "no record"; keep the pair only when both sources have one.
    if -1 not in (drug_com, webmd):
        # Drugs.com category 5 is merged into category 4.
        drug_com = 4 if drug_com == 5 else drug_com
        drug_list_concatenated.append((drug_1, drug_2, drug_com, webmd))
print(f"Length: {len(drug_list_concatenated)}")

100%|████████████████████████████████████████████████████████████████████████████| 7659/7659 [00:00<00:00, 22907.66it/s]

Length: 7659





In [35]:
fleiss_score = calculate_fleiss_kappa(drug_list_concatenated)
print(f"{fleiss_score}")

0.35715110391259725


### C-Others categories interactions

In [11]:
output_df_others = pd.read_excel(ROOT_FOLDER+DATABASE_OTHER_PAIRS, sheet_name = SHEET_NAME)

In [12]:
output_df_others

Unnamed: 0.1,Unnamed: 0,ClassA,DrugA,ClassB,DrugB,DrugsScore,WebmdScore
0,0,C10,atorvastatin,B01,warfarin,1,0
1,1,C10,atorvastatin,B01,dalteparin,0,0
2,2,C10,atorvastatin,B01,enoxaparin,0,0
3,3,C10,atorvastatin,B01,heparin,0,0
4,4,C10,atorvastatin,B01,tinzaparin,0,0
...,...,...,...,...,...,...,...
19647,6495,C01,regadenoson,G04,Silodosin,0,0
19648,6496,C01,regadenoson,G04,Tamsulosin,0,0
19649,6497,C01,regadenoson,G04,Terazosin,0,0
19650,6498,C01,regadenoson,G04,Dutasteride,0,0


In [13]:
drug_list_concatenated_others = []
for index, row in tqdm(output_df_others.iterrows(), total=output_df_others.shape[0]):
    drug_1, drug_2 = row["DrugA"], row["DrugB"]
    drug_com, webmd = int(row["DrugsScore"]), int(row["WebmdScore"])

    # A score of -1 means "no record"; keep the pair only when both sources have one.
    if -1 not in (drug_com, webmd):
        # Drugs.com category 5 is merged into category 4.
        drug_com = 4 if drug_com == 5 else drug_com
        drug_list_concatenated_others.append((drug_1, drug_2, drug_com, webmd))
print(f"Length: {len(drug_list_concatenated_others)}")

100%|██████████████████████████████████████████████████████████████████████████| 19652/19652 [00:00<00:00, 22469.76it/s]

Length: 19652





In [14]:
fleiss_score = calculate_fleiss_kappa(drug_list_concatenated_others)
print(f"{fleiss_score}")

0.33628733011357304
