In [48]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

gad_dataset = pd.read_csv('./GAD_merged_samples_mesh.csv')#,encoding = "ISO-8859-1",sep="\t")

In [49]:
gad_dataset.head()

Unnamed: 0,GAD_ID,associationType,geneSymbol,GAD_GENE_NAME,geneId,gene_mention,GENE_ENTITY_OFFSET,diseaseName,disease_mention,DISEASE_ENTITY_OFFSET,raw_sentence,diseaseId
0,116326,Y,AGTR1,"Angiotensin II receptor, type 1",185,AT1R,80#84,"atherosclerosis, coronary",CAD,159#162,This study indicates a synergistic contributio...,MESH:D003324
1,588219,F,PALB2,partner and localizer of BRCA2,79728,PALB2,4#9,breast cancer,mutation,19#27,The PALB2 1592delT mutation has a strong effec...,
2,127842,Y,IL1A,"Interleukin 1, alpha",3552,IL-1,30#34,osteoarthritis,OA,113#115,Our findings suggest that the IL-1 gene cluste...,MESH:D010003
3,154807,F,TPMT,Thiopurine S-methyltransferase,7172,TPMT,26#30,"azathioprine toxicity hepatitis, autoimmune",fibrosis,9#17,Advanced fibrosis but not TPMT genotype or act...,MESH:D005355
4,121481,F,DRD2,Dopamine receptor D2,1813,light,7#12,schizophrenia; schizoaffective disorder; affec...,finding,21#28,"In the light of this finding, A2A2 DRD2 genoty...",


In [50]:
gad_dataset.associationType.value_counts()

associationType
F    2529
Y    1833
N     967
P       1
Name: count, dtype: int64

In [51]:
gad_dataset[gad_dataset['raw_sentence'].str.contains("These results suggest that the C1772T")]

Unnamed: 0,GAD_ID,associationType,geneSymbol,GAD_GENE_NAME,geneId,gene_mention,GENE_ENTITY_OFFSET,diseaseName,disease_mention,DISEASE_ENTITY_OFFSET,raw_sentence,diseaseId
1468,125111,F,HIF1A,"Hypoxia-inducible factor 1, alpha subunit (bas...",3091,HIF-1alpha,54#64,colorectal cancer,progression,84#95,These results suggest that the C1772T polymorp...,MESH:D018450
3609,125111,N,HIF1A,"Hypoxia-inducible factor 1, alpha subunit (bas...",3091,HIF-1alpha,54#64,colorectal cancer,colorectal carcinoma,113#133,These results suggest that the C1772T polymorp...,MESH:D015179


In [52]:
df =gad_dataset.copy()

In [84]:
def process_predicted_output(value):
    """Normalize one raw model prediction to ``True``, ``False`` or ``NaN``.

    Parameters
    ----------
    value : object
        Raw cell taken from the ``predicted_output`` column.

    Returns
    -------
    bool or float
        ``True`` / ``False`` when the text is exactly "true" / "false" after
        ignoring case, surrounding whitespace and trailing periods;
        ``np.nan`` for missing values and for any other text.
    """
    if pd.isnull(value):
        return np.nan
    # Drop whitespace and trailing periods before comparing so that "True."
    # and "False." are treated symmetrically. Previously only "True." was
    # accepted, which silently turned a "False." answer into NaN.
    token = str(value).strip().rstrip('.').strip().lower()
    if token == 'true':
        return True
    if token == 'false':
        return False
    return np.nan  # Anything else is treated as "no usable answer"


In [85]:
df_2 = pd.read_csv("/home/016651544/llama2/test_df_10000_25_FT_model_full_test.csv")
df_2['predicted_output'] = df_2['predicted_output'].apply(process_predicted_output)


In [86]:
df_2.columns

Index(['sentence', 'actual_output', 'predicted_output'], dtype='object')

In [87]:
df_2.predicted_output.value_counts()

predicted_output
True     2796
False    2533
Name: count, dtype: int64

In [127]:
df_2.head()

Unnamed: 0,sentence,actual_output,predicted_output,matched_sentence
0,These results suggest that the C1772T polymorp...,True,True,These results suggest that the C1772T polymorp...
1,"In our setting, @DISEASE$ among alcoholic indi...",True,True,"In our setting, iron overload among alcoholic ..."
2,MPO genotype GG is associated with @DISEASE$ i...,True,True,MPO genotype GG is associated with cirrhosis i...
3,These three studies do not provide consistent ...,True,True,These three studies do not provide consistent ...
4,Our prospective findings suggest that individu...,True,True,Our prospective findings suggest that individu...


In [132]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Score the normalized predictions in df_2 (columns 'actual_output' and
# 'predicted_output') against the gold labels.
# Both columns are cast to str so the labels share one dtype. The cast also
# turns a missing prediction (NaN from process_predicted_output) into the
# literal label 'nan', which the metrics then see as a separate label.
actual = df_2['actual_output'].astype(str)
predicted = df_2['predicted_output'].astype(str)

# Macro-averaged precision, recall and F1 score
precision = precision_score(actual, predicted, average='macro')
recall = recall_score(actual, predicted, average='macro')
f1 = f1_score(actual, predicted, average='macro')

precision, recall, f1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9979375738150748, 0.9977485928705441, 0.9978423692049855)

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Match every sentence of df_2 to its most similar raw sentence in df using
# TF-IDF cosine similarity.

# Fit a single vectorizer on both corpora so they share one vocabulary.
vectorizer = TfidfVectorizer()
combined_sentences = df['raw_sentence'].tolist() + df_2['sentence'].tolist()
tfidf_matrix = vectorizer.fit_transform(combined_sentences)

# Rows correspond to df sentences, columns to df_2 sentences.
cosine_similarities = cosine_similarity(tfidf_matrix[:len(df)], tfidf_matrix[len(df):])

# Minimum similarity for a match to be accepted
threshold = 0.65

# Best df row (and its score) for every df_2 column, computed in one pass.
# Working per column position instead of using the iterrows() index label
# keeps this correct even when df_2 does not have a default 0..n-1 index.
best_match_idx = cosine_similarities.argmax(axis=0)
best_scores = cosine_similarities.max(axis=0)
raw_sentences = df['raw_sentence'].to_numpy()

# None marks df_2 sentences whose best candidate is not above the threshold.
matched_sentences = [
    raw_sentences[position] if score > threshold else None
    for position, score in zip(best_match_idx, best_scores)
]

# Assign the matched sentences to the new column in df_2
df_2['matched_sentence'] = matched_sentences


In [90]:
# Function to mask entities in a sentence
def mask_entities(sentence, gene_mention, gene_offsets, disease_mention, disease_offsets):
    """Replace the gene and disease spans of ``sentence`` with placeholders.

    ``gene_offsets`` and ``disease_offsets`` are ``"start#end"`` strings
    (character offsets, end exclusive). The gene span becomes ``$GENE$`` and
    the disease span becomes ``$DISEASE$``. The mention texts are accepted
    for interface compatibility but are not used; only the offsets decide
    what gets replaced.
    """
    gene_lo, gene_hi = (int(part) for part in gene_offsets.split('#'))
    disease_lo, disease_hi = (int(part) for part in disease_offsets.split('#'))

    spans = [
        (gene_lo, gene_hi, '$GENE$'),
        (disease_lo, disease_hi, '$DISEASE$'),
    ]
    # Right-most span first: editing the tail of the string leaves the
    # offsets of the span that starts earlier untouched.
    spans.sort(key=lambda span: span[0], reverse=True)

    masked = sentence
    for lo, hi, placeholder in spans:
        masked = masked[:lo] + placeholder + masked[hi:]
    return masked

# Apply the masking function to each row
df['SENTENCE_MASKED'] = df.apply(lambda x: mask_entities(
    x['raw_sentence'], 
    x['gene_mention'], x['GENE_ENTITY_OFFSET'], 
    x['disease_mention'], x['DISEASE_ENTITY_OFFSET']
), axis=1)

# Assuming you want to view the specific columns mentioned
df[['raw_sentence', 'SENTENCE_MASKED', 'associationType']]

Unnamed: 0,raw_sentence,SENTENCE_MASKED,associationType
0,This study indicates a synergistic contributio...,This study indicates a synergistic contributio...,Y
1,The PALB2 1592delT mutation has a strong effec...,The $GENE$ 1592delT $DISEASE$ has a strong eff...,F
2,Our findings suggest that the IL-1 gene cluste...,Our findings suggest that the $GENE$ gene clus...,Y
3,Advanced fibrosis but not TPMT genotype or act...,Advanced $DISEASE$ but not $GENE$ genotype or ...,F
4,"In the light of this finding, A2A2 DRD2 genoty...","In the $GENE$ of this $DISEASE$, A2A2 DRD2 gen...",F
...,...,...,...
5325,Significant differences in the iNOS promoter p...,Significant differences in the $GENE$ promoter...,Y
5326,We could not replicate that DLG5 is a relevant...,We could not replicate that $GENE$ is a releva...,F
5327,The results suggest that eNOS polymorphisms (e...,The results suggest that $GENE$ polymorphisms ...,F
5328,Despite the strikingly similar pathologies of ...,Despite the strikingly similar pathologies of ...,N


In [91]:
# Put the GAD annotation columns from df next to the label columns from df_2.
# pd.concat(axis=1) aligns the two frames on their index labels, not on the
# sentence text.
# NOTE(review): the displayed heads suggest df and df_2 are in different row
# orders (df row 0 starts "This study indicates ...", df_2 row 0 starts
# "These results suggest ..."), so a row here may pair one sentence's
# annotation with another sentence's prediction -- confirm, and consider
# joining through df_2['matched_sentence'] instead.
euadr_unmasked = pd.concat([df[['raw_sentence', 'SENTENCE_MASKED',"associationType","gene_mention","disease_mention"]], df_2[["actual_output","predicted_output"]]], axis = 1)#.rename(columns = {"sentence":"SENTENCE"})],axis = 0 )
# pd.merge(df[['SENTENCE', 'SENTENCE_MASKED',"ASSOCIATION_TYPE"]], df_2.rename(columns = {"sentence":"SENTENCE"}), on =['SENTENCE'], how='left' )

In [92]:
euadr_unmasked.head()

Unnamed: 0,raw_sentence,SENTENCE_MASKED,associationType,gene_mention,disease_mention,actual_output,predicted_output
0,This study indicates a synergistic contributio...,This study indicates a synergistic contributio...,Y,AT1R,CAD,True,True
1,The PALB2 1592delT mutation has a strong effec...,The $GENE$ 1592delT $DISEASE$ has a strong eff...,F,PALB2,mutation,True,True
2,Our findings suggest that the IL-1 gene cluste...,Our findings suggest that the $GENE$ gene clus...,Y,IL-1,OA,True,True
3,Advanced fibrosis but not TPMT genotype or act...,Advanced $DISEASE$ but not $GENE$ genotype or ...,F,TPMT,fibrosis,True,True
4,"In the light of this finding, A2A2 DRD2 genoty...","In the $GENE$ of this $DISEASE$, A2A2 DRD2 gen...",F,light,finding,True,True


In [93]:
euadr_unmasked.to_csv("./gad_unmasked.csv",index=False)

In [94]:
euadr_unmasked.groupby(['associationType', 'predicted_output']).size().unstack(fill_value=0)
# unmasked-> entities -> mask -> llama2/gemma -> output


predicted_output,False,True
associationType,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1181,1348
N,467,500
P,1,0
Y,884,948


In [44]:
# test_df = pd.DataFrame({
#     'actual_output': ['True', 'False', 'TRUE', 'FALSE', 'true', 'false'],
#     'predicted_output': ['True', 'False', 'TRUE', 'FALSE', 'true', 'false']
# })

# # Normalize case to lower and then convert to integer
# test_df['actual'] = (test_df['actual_output'].str.lower() == 'true').astype(int)
# test_df['predicted'] = (test_df['predicted_output'].str.replace(r"[^TrueFalse]", "", regex=True).str.lower() == 'true').astype(int)


In [142]:
# test_df


In [287]:
import pandas as pd

# Assuming 'df' is your DataFrame and 'seed' is defined somewhere as your random seed
seed = 0

# Function to sample the data based on a fixed number of samples
def sample_data_fixed(df, n_samples, random_state=None):
    """Draw a shuffled sample with roughly 70% True and 30% False predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``predicted_output`` column.
    n_samples : int
        Total number of rows requested.
    random_state : int, optional
        Seed used for every draw and for the final shuffle. When omitted the
        notebook-level ``seed`` variable is used, as before.

    Returns
    -------
    pandas.DataFrame
        Sampled rows in shuffled order, keeping their original index labels.
        Holds fewer than ``n_samples`` rows only when ``df`` does not contain
        enough rows with a True/False prediction.
    """
    if random_state is None:
        random_state = seed  # notebook-level default

    n_true = int(n_samples * 0.7)
    n_false = n_samples - n_true

    true_pool = df[df['predicted_output'] == True]
    false_pool = df[df['predicted_output'] == False]

    true_samples = true_pool.sample(n=min(n_true, len(true_pool)), random_state=random_state)
    false_samples = false_pool.sample(n=min(n_false, len(false_pool)), random_state=random_state)

    # If one class runs short, top up from the rows of the other class that
    # have NOT been drawn yet. Re-sampling the whole pool with the same seed
    # would return rows that are already in the sample (duplicates), and
    # asking for more rows than remain would raise.
    # Assumes df has unique index labels.
    if len(true_samples) < n_true:
        spare = false_pool.loc[~false_pool.index.isin(false_samples.index)]
        shortfall = n_true - len(true_samples)
        additional_samples = spare.sample(n=min(shortfall, len(spare)), random_state=random_state)
        true_samples = pd.concat([true_samples, additional_samples])
    elif len(false_samples) < n_false:
        spare = true_pool.loc[~true_pool.index.isin(true_samples.index)]
        shortfall = n_false - len(false_samples)
        additional_samples = spare.sample(n=min(shortfall, len(spare)), random_state=random_state)
        false_samples = pd.concat([false_samples, additional_samples])

    print("# of True and False samples :",len(true_samples), len(false_samples), round(len(true_samples)/n_samples,2)*100, round(len(false_samples)/n_samples,2)*100 )
    return pd.concat([true_samples, false_samples]).sample(frac=1, random_state=random_state)  # Shuffle the dataset

# Your DataFrame

# NOTE: this rebinds the notebook-level name `df` (previously the GAD
# annotation frame) to a copy of the merged frame.
df = euadr_unmasked.copy()

# Sample sizes
sample_sizes = [12, 51, 89, 178, 258, 349, 450, 582, 706, 798]

# Dictionary to hold the sample data, keyed as 'sample_<size>'
samples = {}

# Make sure the per-seed output folder exists; to_csv does not create it.
output_dir = f"./datasets/gad_samples/{seed}"
os.makedirs(output_dir, exist_ok=True)

# Draw, report and persist one sample per requested size
for size in sample_sizes:
    sample = sample_data_fixed(df, size)
    samples[f'sample_{size}'] = sample
    print(f"Sampled {size}:", len(sample))
    sample.to_csv(f"{output_dir}/{size}_sample_val_gad.csv", index=True)

# Summary of the sampled data sizes
for size in sample_sizes:
    print(f"Sampled {size}:", len(samples[f'sample_{size}']))

# of True and False samples : 8 4 67.0 33.0
Sampled 12: 12
# of True and False samples : 35 16 69.0 31.0
Sampled 51: 51
# of True and False samples : 62 27 70.0 30.0
Sampled 89: 89
# of True and False samples : 124 54 70.0 30.0
Sampled 178: 178
# of True and False samples : 180 78 70.0 30.0
Sampled 258: 258
# of True and False samples : 244 105 70.0 30.0
Sampled 349: 349
# of True and False samples : 315 135 70.0 30.0
Sampled 450: 450
# of True and False samples : 407 175 70.0 30.0
Sampled 582: 582
# of True and False samples : 494 212 70.0 30.0
Sampled 706: 706
# of True and False samples : 558 240 70.0 30.0
Sampled 798: 798
Sampled 12: 12
Sampled 51: 51
Sampled 89: 89
Sampled 178: 178
Sampled 258: 258
Sampled 349: 349
Sampled 450: 450
Sampled 582: 582
Sampled 706: 706
Sampled 798: 798


In [255]:
# sample_10.to_csv(f"./datasets/{seed}/GAD/10_sample_val_gad.csv",index=True)
# sample_20.to_csv(f"./datasets/{seed}/GAD/20_sample_val_gad.csv",index=True)
# sample_30.to_csv(f"./datasets/{seed}/GAD/30_sample_val_gad.csv",index=True)
# sample_40.to_csv(f"./datasets/{seed}/GAD/40_sample_val_gad.csv",index=True)
# sample_50.to_csv(f"./datasets/{seed}/GAD/50_sample_val_gad.csv",index=True)
# sample_60.to_csv(f"./datasets/{seed}/GAD/60_sample_val_gad.csv",index=True)
# sample_70.to_csv(f"./datasets/{seed}/GAD/70_sample_val_gad.csv",index=True)
# sample_80.to_csv(f"./datasets/{seed}/GAD/80_sample_val_gad.csv",index=True)
# sample_90.to_csv(f"./datasets/{seed}/GAD/90_sample_val_gad.csv",index=True)
# sample_100.to_csv(f"./datasets/{seed}/GAD/100_sample_val_gad.csv",index=True)

### Llama 2 - GAD

In [220]:
import pandas as pd


In [226]:
llama2_gad_out = pd.read_csv("/home/016651544/llama2/test_df_10000_25_FT_model_full_test.csv")
llama2_gad_out.head()

Unnamed: 0,sentence,actual_output,predicted_output
0,These results suggest that the C1772T polymorp...,True,True
1,"In our setting, @DISEASE$ among alcoholic indi...",True,True
2,MPO genotype GG is associated with @DISEASE$ i...,True,True
3,These three studies do not provide consistent ...,True,True
4,Our prospective findings suggest that individu...,True,True


In [227]:
def change_nan(row):
    """Coerce the prediction stored in a result row to a boolean.

    Parameters
    ----------
    row : pandas.Series
        One row of a predictions frame, read by position: element 1 is the
        gold label and element 2 is the raw model prediction.

    Returns
    -------
    bool
        ``True`` if the prediction text contains "true" (case-insensitive),
        otherwise ``False`` if it contains "false". A missing or unparseable
        prediction yields the negation of the gold label, i.e. it is always
        counted as a wrong answer.
    """
    # .iloc keeps the positional access explicit; plain row[1] / row[2] on a
    # label-indexed Series is deprecated and emits a FutureWarning.
    gold = row.iloc[1]
    raw = row.iloc[2]
    if pd.isna(raw):
        return not gold
    # str() lets already-boolean predictions through instead of failing on
    # the missing .lower() method.
    text = str(raw).lower()
    if "true" in text:
        return True
    if "false" in text:
        return False
    return not gold
llama2_gad_out.predicted_output = llama2_gad_out.apply(lambda x: change_nan(x),axis=1)


  if pd.isna(row[2]) :# or row[2].lower() == "nan":
  elif "true" in row[2].lower():
  elif "false" in row[2].lower():


In [228]:
llama2_gad_out.predicted_output.value_counts()

predicted_output
True     2796
False    2534
Name: count, dtype: int64

In [229]:
print(llama2_gad_out.isna().sum())
llama2_gad_out.dropna(inplace=True)


sentence            0
actual_output       0
predicted_output    0
dtype: int64


In [263]:
# llama2_gad_out.to_csv("/home/016651544/model_predictions_files/llama2_gad_predictions.csv",index=False)

In [230]:
# Generate classification report
# classification_report is imported here as well: this cell uses it, and no
# earlier cell in the notebook brings it into scope.
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
# Cast both columns to str so gold labels and predictions share one dtype
actual = llama2_gad_out['actual_output'].astype(str)
predicted = llama2_gad_out['predicted_output'].astype(str)
# Calculate macro-averaged Precision, Recall, and F1 score
precision = precision_score(actual, predicted, average='macro')
recall = recall_score(actual, predicted, average='macro')
f1 = f1_score(actual, predicted, average='macro')
print("Precision: ",precision)
print("Recall: ",recall)
print("F1 score: ",f1)
report = classification_report(actual, predicted)
print(report)

Precision:  0.9978849873480324
Recall:  0.9979788187831579
F1 score:  0.9979310182499153
              precision    recall  f1-score   support

       False       1.00      1.00      1.00      2529
        True       1.00      1.00      1.00      2801

    accuracy                           1.00      5330
   macro avg       1.00      1.00      1.00      5330
weighted avg       1.00      1.00      1.00      5330



### Gemma 7B - GAD

In [264]:
gemma_gad_out = pd.read_csv("/home/016651544/gemma/test_df_10000_25_FT_model_gad_1_1_7b_gemma_5samples.csv")
gemma_gad_out.head()

Unnamed: 0,sentence,actual_output,predicted_output
0,These results suggest that the C1772T polymorp...,True,True
1,"In our setting, @DISEASE$ among alcoholic indi...",True,True
2,MPO genotype GG is associated with @DISEASE$ i...,True,True
3,These three studies do not provide consistent ...,True,True
4,Our prospective findings suggest that individu...,True,True


In [265]:
gemma_gad_out.predicted_output = gemma_gad_out.apply(lambda x: change_nan(x),axis=1)

  if pd.isna(row[2]) :# or row[2].lower() == "nan":
  elif "true" in row[2].lower():
  elif "false" in row[2].lower():
  return not row[1]
  return not row[1]


In [266]:
gemma_gad_out.shape

(5330, 3)

In [267]:
gemma_gad_out.predicted_output.value_counts()

predicted_output
True     2745
False    2585
Name: count, dtype: int64

In [268]:
# gemma_gad_out.predicted_output.value_counts()
print(gemma_gad_out.isna().sum())
gemma_gad_out.dropna(inplace=True)

sentence            0
actual_output       0
predicted_output    0
dtype: int64


In [269]:
from sklearn.metrics import classification_report
# Convert actual and predicted columns to boolean
# gemma_gad_out['actual_output'] = gemma_gad_out['actual_output'].astype(str).str.upper()
# gemma_gad_out['predicted_output'] = gemma_gad_out['predicted_output'].astype(str).str.upper()

In [270]:
gemma_gad_out.head()

Unnamed: 0,sentence,actual_output,predicted_output
0,These results suggest that the C1772T polymorp...,True,True
1,"In our setting, @DISEASE$ among alcoholic indi...",True,True
2,MPO genotype GG is associated with @DISEASE$ i...,True,True
3,These three studies do not provide consistent ...,True,True
4,Our prospective findings suggest that individu...,True,True


In [272]:
# gemma_gad_out.to_csv("/home/016651544/model_predictions_files/gemma_7b_gad_predictions.csv",index=False)


In [273]:
gemma_gad_out.predicted_output.value_counts(),gemma_gad_out.actual_output.value_counts(),


(predicted_output
 True     2745
 False    2585
 Name: count, dtype: int64,
 actual_output
 True     2801
 False    2529
 Name: count, dtype: int64)

In [274]:
!pwd

/home/016651544/gemma


In [275]:
# Write the processed predictions to CSV (without the index column).
# NOTE(review): predicted_output was produced by change_nan, which returns
# booleans, so the != "NAN" filter presumably keeps every row -- confirm
# whether it can be dropped.
gemma_gad_out[gemma_gad_out.predicted_output != "NAN"].to_csv("/home/016651544/gemma/GAD_gemma_7b_output.csv",index=False)

In [276]:
# Generate classification report
from sklearn.metrics import precision_score, recall_score, f1_score
# Gold labels and processed predictions for the currently loaded gemma_gad_out
actual = gemma_gad_out['actual_output']
predicted = gemma_gad_out['predicted_output']
# Calculate macro-averaged Precision, Recall, and F1 score
precision = precision_score(actual, predicted, average='macro')
recall = recall_score(actual, predicted, average='macro')
f1 = f1_score(actual, predicted, average='macro')
print("Precision: ",precision)
print("Recall: ",recall)
print("F1 score: ",f1)
# Per-class breakdown computed from the same two columns
report = classification_report(gemma_gad_out['actual_output'], gemma_gad_out['predicted_output'])
print(report)

Precision:  0.9786522356456084
Recall:  0.9794695703350593
F1 score:  0.9789523001071425
              precision    recall  f1-score   support

       False       0.97      0.99      0.98      2529
        True       0.99      0.97      0.98      2801

    accuracy                           0.98      5330
   macro avg       0.98      0.98      0.98      5330
weighted avg       0.98      0.98      0.98      5330



### Gemma 2B - GAD

In [277]:
import pandas as pd
gemma_gad_out = pd.read_csv("/home/016651544/gemma/test_df_10000_25_FT_model_gad_base_gemma.csv")
gemma_gad_out.head()

Unnamed: 0,sentence,actual_output,predicted_output
0,These results suggest that the C1772T polymorp...,True,True
1,"In our setting, @DISEASE$ among alcoholic indi...",True,True
2,MPO genotype GG is associated with @DISEASE$ i...,True,True
3,These three studies do not provide consistent ...,True,True
4,Our prospective findings suggest that individu...,True,False


In [278]:
gemma_gad_out.predicted_output = gemma_gad_out.apply(lambda x: change_nan(x),axis=1)


  if pd.isna(row[2]) :# or row[2].lower() == "nan":
  elif "true" in row[2].lower():
  elif "false" in row[2].lower():
  return not row[1]
  return not row[1]


In [279]:
gemma_gad_out.shape

(5330, 3)

In [280]:
# from sklearn.metrics import classification_report
# # Convert actual and predicted columns to boolean
# gemma_gad_out['actual_output'] = gemma_gad_out['actual_output'].astype(str).str.upper()
# gemma_gad_out['predicted_output'] = gemma_gad_out['predicted_output'].astype(str).str.upper()

In [281]:
# gemma_gad_out.head()

In [282]:
# gemma_gad_out.predicted_output.value_counts()
print(gemma_gad_out.isna().sum())
# gemma_gad_out.dropna(subset=['predicted_output'], axis=0)
# gemma_gad_out = gemma_gad_out[gemma_gad_out['predicted_output'] != "NAN"]
# gemma_gad_out.fillna(0,inplace=True)
gemma_gad_out.dropna(inplace=True)


sentence            0
actual_output       0
predicted_output    0
dtype: int64


In [283]:
# gemma_gad_out.predicted_output.value_counts(),gemma_gad_out.actual_output.value_counts(),
# gemma_gad_out = gemma_gad_out[gemma_gad_out.predicted_output != "NAN"]
# gemma_gad_out.to_csv("/home/016651544/model_predictions_files/gemma_2b_gad_predictions.csv",index=False)


In [249]:
# Generate classification report
# Gold labels and processed predictions for the currently loaded gemma_gad_out.
# This cell has no imports of its own: precision_score, recall_score, f1_score
# and classification_report must already be in scope from earlier cells.
actual = gemma_gad_out['actual_output']
predicted = gemma_gad_out['predicted_output']
# Calculate macro-averaged Precision, Recall, and F1 score
precision = precision_score(actual, predicted, average='macro')
recall = recall_score(actual, predicted, average='macro')
f1 = f1_score(actual, predicted, average='macro')
print("Precision: ",precision)
print("Recall: ",recall)
print("F1 score: ",f1)
# Per-class breakdown computed from the same two columns
report = classification_report(gemma_gad_out['actual_output'], gemma_gad_out['predicted_output'])
print(report)

Precision:  0.7202341452434836
Recall:  0.7000060278985828
F1 score:  0.6976302661237676
              precision    recall  f1-score   support

       False       0.76      0.56      0.64      2529
        True       0.68      0.84      0.75      2801

    accuracy                           0.71      5330
   macro avg       0.72      0.70      0.70      5330
weighted avg       0.72      0.71      0.70      5330



In [250]:
# (84 + 56)/2

In [251]:
# gemma_gad_out[gemma_gad_out.predicted_output != "NAN"].to_csv("/home/016651544/gemma/GAD_gemma_2b_output.csv",index=False)
