In [1]:
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px

### Data

In [3]:
### Sample sheet
file_path = 'Data/TCGA/Gene level CNV/gdc_sample_sheet.2025-11-03.tsv'
sample_sheet = pd.read_csv(file_path, sep = '\t')

In [4]:
sample_sheet

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Tissue Type,Tumor Descriptor,Specimen Type,Preservation Method
0,21f39dce-e4a6-41e0-a0ab-876539c9b890,TCGA-HNSC.5d07e7cd-558a-41ac-b066-b4f234522a96...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,"TCGA-CQ-A4C7, TCGA-CQ-A4C7","TCGA-CQ-A4C7-10A, TCGA-CQ-A4C7-01A","Normal, Tumor","Not Applicable, Primary","Peripheral Blood NOS, Solid Tissue","Unknown, Unknown"
1,5a33ddaa-89bc-4062-9a52-be54702306d7,TCGA-HNSC.5d07e7cd-558a-41ac-b066-b4f234522a96...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,TCGA-CQ-A4C7,TCGA-CQ-A4C7-01A,Tumor,Primary,Solid Tissue,Unknown
2,6b93f364-b672-464d-a418-f22a673b5a8b,TCGA-HNSC.5d07e7cd-558a-41ac-b066-b4f234522a96...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,"TCGA-CQ-A4C7, TCGA-CQ-A4C7","TCGA-CQ-A4C7-10A, TCGA-CQ-A4C7-01A","Normal, Tumor","Not Applicable, Primary","Peripheral Blood NOS, Solid Tissue","Unknown, Unknown"
3,4e464403-f3aa-4c3b-9b9b-885163810b60,TCGA-HNSC.a2d2b82f-f217-4aed-97da-c6ba3f5ce4dc...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,"TCGA-BA-4077, TCGA-BA-4077","TCGA-BA-4077-01B, TCGA-BA-4077-10A","Tumor, Normal","Primary, Not Applicable","Unknown, Peripheral Blood NOS","Unknown, Unknown"
4,827b6345-4838-4d54-bb8a-db0cc38f5b9b,TCGA-HNSC.a2d2b82f-f217-4aed-97da-c6ba3f5ce4dc...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,"TCGA-BA-4077, TCGA-BA-4077","TCGA-BA-4077-01B, TCGA-BA-4077-10A","Tumor, Normal","Primary, Not Applicable","Unknown, Peripheral Blood NOS","Unknown, Unknown"
...,...,...,...,...,...,...,...,...,...,...,...
1543,659840dd-4d86-45d6-822b-8919ce76925c,TCGA-HNSC.ebd9488f-5ae7-49c7-813d-308b00ec5f05...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,TCGA-C9-A480,TCGA-C9-A480-01A,Tumor,Primary,Solid Tissue,OCT
1544,a39819a5-1482-4911-be8f-c21619dc8346,TCGA-HNSC.58c963a0-937b-4e29-9d36-7bcf83d12fea...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,TCGA-CR-6470,TCGA-CR-6470-01A,Tumor,Primary,Solid Tissue,Unknown
1545,e0ed5b1f-a478-47d4-8b31-2000c3ed3e11,TCGA-HNSC.8383139c-78ce-4fec-ace5-479cb3ca3c8e...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,TCGA-CQ-6228,TCGA-CQ-6228-01A,Tumor,Primary,Solid Tissue,Unknown
1546,251227bb-45e1-43a7-9166-698c31e225a7,TCGA-HNSC.ca39edab-2932-4e62-9d5c-4588dff4c2ee...,Copy Number Variation,Gene Level Copy Number,TCGA-HNSC,TCGA-CV-6941,TCGA-CV-6941-01A,Tumor,Primary,Solid Tissue,Unknown


In [5]:
def num_overlap(list1,list2):
    # Convert lists to sets
    set1 = set(list1)
    set2 = set(list2)

    # Find the intersection of the two sets
    overlap = set1.intersection(set2)

    # Get the number of overlapping values
    num_overlap = len(overlap)
    return num_overlap

#### Gene data

In [6]:
hpv_positive_genes = pd.read_csv('Results/SOM results/HPV positive genes.csv')
hpv_negative_genes = pd.read_csv('Results/SOM results/HPV negative genes.csv')

### Final results

In [7]:
def define_cutoffs(drug_candidates_df, title):
    """
    Args:
        drug_candidates_df (pd.DataFrame): DataFrame containing drug candidates with targetability scores.
        assumes chemo and immuno columns are already defined in the script.
    Define cutoffs for drug candidates based on their targetability scores.
    """
    chart_title = ''
    if 'positive' in title.lower():
        chart_title += 'HPV Positive '
    elif 'negative' in title.lower():
        chart_title += 'HPV Negative '

    chart_title += ' Drug Candidates'

    ######################################### Chemotherapy and Immunotherapy Graphs #########################################
    ### show distributions by chemotherapy and immunotherapy and other drugs
    ### graph of how many drugs are chemotherapy or immunotherapy or neither
    temp  = drug_candidates_df[['CHEMO', 'IMMUNO']].sum()
    temp['OTHER'] = len(drug_candidates_df) - (temp['CHEMO'] + temp['IMMUNO'])
    print(f'Number of chemotherapy drugs: {temp["CHEMO"]}')
    print(f'Number of immunotherapy drugs: {temp["IMMUNO"]}')
    print(f'Number of drugs that are neither: {temp["OTHER"]}')
    plt.figure(figsize=(10, 6))
    plt.ylabel('Number of Drugs')
    plt.xlabel('Drug Type')  # Move xlabel before plt.twinx()
    plt.ylabel('Number of Drugs')
    plt.bar(['Chemotherapy', 'Immunotherapy', 'Other'], [temp['CHEMO'], temp['IMMUNO'], temp['OTHER']], color=['blue', 'orange', 'gray'], alpha=0.7, label='Drug Type')
    plt.title('Number of Drugs by Type in HPV Positive Amplification Candidates')
    ### data labels
    for i, v in enumerate([temp['CHEMO'], temp['IMMUNO'], temp['OTHER']]):
        plt.text(i, v + 0.5, str(v), ha='center', va='bottom', fontsize=12)
    ### add second y axis for percentage
    plt.twinx()
    percentage = [temp['CHEMO']/len(drug_candidates_df)*100, 
                temp['IMMUNO']/len(drug_candidates_df)*100, 
                temp['OTHER']/len(drug_candidates_df)*100]

    ### make twinx a percent *100
    plt.ylabel('Percentage (%)')
    plt.yticks([0, 20, 40, 60, 80, 100], [0, 20, 40, 60, 80, 100])
    # plt.plot(['Chemotherapy', 'Immunotherapy', 'Other'], percentage, color='red', marker='o', linestyle='--', label='Percentage')
    ### x axis labels/title
    plt.xticks(['Chemotherapy', 'Immunotherapy', 'Other'])
    plt.xticks(rotation=0)
    ### add percentage labels on top of the bars, push below the top of the bar
    for i, v in enumerate(percentage):
        plt.text(i+.27, v, f'({v:.1f})%', ha='center', va='bottom', fontsize=12)
    plt.legend(loc='upper left')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


    ### plot unique drugs by type
    # Count unique drugs by type
    chemo_drugs = drug_candidates_df.loc[drug_candidates_df['CHEMO'] == 1, 'DRUG'].dropna().unique()
    immuno_drugs = drug_candidates_df.loc[drug_candidates_df['IMMUNO'] == 1, 'DRUG'].dropna().unique()
    other_drugs = drug_candidates_df.loc[
        (drug_candidates_df['CHEMO'] == 0) &
        (drug_candidates_df['IMMUNO'] == 0), 'DRUG'
    ].dropna().unique()

    chemo_count = len(chemo_drugs)
    immuno_count = len(immuno_drugs)
    other_count = len(other_drugs)
    plt.figure(figsize=(10, 6))
    plt.bar(['Chemotherapy', 'Immunotherapy', 'Other'], [chemo_count, immuno_count, other_count], color=['blue', 'orange', 'gray'], alpha=0.7)
    plt.title(f'Unique Drugs by Type in {title}')
    plt.xlabel('Drug Type')
    plt.ylabel('Number of Unique Drugs')
    ### Add data labels
    for i, v in enumerate([chemo_count, immuno_count, other_count]):
        plt.text(i, v + 0.5, str(v), ha='center', va='bottom', fontsize=12)
    plt.xticks(rotation=0)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    ### plot for pirority norm scores
    plt.figure(figsize=(10, 6))
    plt.hist(drug_candidates_df['PRIORITY_NORM_SCORE'], bins=50, alpha=0.7, color='blue', label='Priority Norm Score')
    plt.xlabel('Priority Norm Score')
    plt.ylabel('Count')
    plt.title(f'Distribution of Priority Norm Scores for {title}')
    plt.twinx()
    percentage = drug_candidates_df['PRIORITY_NORM_SCORE'].value_counts(normalize=True) * 100
    plt.ylabel('Percentage (%) of gene-drug interactions')
    plt.yticks([0, 20, 40, 60, 80, 100], [0, 20, 40, 60, 80, 100])
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.axvline(x=drug_candidates_df['PRIORITY_NORM_SCORE'].mean(), color='red', linestyle='--', label=f'Mean priority norm score: {drug_candidates_df["PRIORITY_NORM_SCORE"].mean():.2f}')
    plt.axvline(x=drug_candidates_df['PRIORITY_NORM_SCORE'].median(), color='green', linestyle='--', label=f'Median priority norm score: {drug_candidates_df["PRIORITY_NORM_SCORE"].median():.2f}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend()
    plt.show()

    normalized_priority_norm_score_cutoff = drug_candidates_df['PRIORITY_NORM_SCORE'].mean()
    print(f'Normalized Priority Norm Score Cutoff: {normalized_priority_norm_score_cutoff:.2f}')

    ### Plot drug-gene interaction scores with two axis for percentage and count 
    plt.figure(figsize=(10, 6))
    plt.hist(drug_candidates_df['DRUG-GENE-INTERACTION_SCORE'], bins=50, alpha=0.7, color='blue', label='Priority Norm Score')
    plt.xlabel('Drug-Gene Interaction Score')
    plt.ylabel('Count')
    plt.title(f'Distribution of Drug-Gene Interaction Scores for {title}')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.twinx()
    plt.ylabel('Percentage (%) of gene-drug interactions')
    plt.yticks([0, 20, 40, 60, 80, 100], [0, 20, 40, 60, 80, 100])
    percentage = drug_candidates_df['DRUG-GENE-INTERACTION_SCORE'].value_counts(normalize=True) * 100
    plt.axvline(x=drug_candidates_df['DRUG-GENE-INTERACTION_SCORE'].mean(), color='red', linestyle='--', label=f'Mean drug-gene interaction score: {drug_candidates_df["DRUG-GENE-INTERACTION_SCORE"].mean():.2f}')
    plt.axvline(x=drug_candidates_df['DRUG-GENE-INTERACTION_SCORE'].median(), color='green', linestyle='--', label=f'Median drug-gene interaction score: {drug_candidates_df["DRUG-GENE-INTERACTION_SCORE"].median():.2f}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend()
    plt.show()

    ### plot normalized drug-gene interaction scores with two axis for percentage and count
    plt.figure(figsize=(10, 6))
    plt.hist(drug_candidates_df['norm_DRUG-GENE-INTERACTION_SCORE'], bins=50, alpha=0.7, color='blue', label='Normalized Drug-Gene Interaction Score')
    plt.xlabel('Normalized Drug-Gene Interaction Score')
    plt.ylabel('Count')
    plt.title(f'Distribution of Normalized Drug-Gene Interaction Scores for {title}')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.twinx()
    plt.ylabel('Percentage (%) of gene-drug interactions')
    plt.yticks([0, 20, 40, 60, 80, 100], [0, 20, 40, 60, 80, 100])
    percentage = drug_candidates_df['norm_DRUG-GENE-INTERACTION_SCORE'].value_counts(normalize=True) * 100
    plt.axvline(x=drug_candidates_df['norm_DRUG-GENE-INTERACTION_SCORE'].mean(), color='red', linestyle='--', label=f'Mean normalized drug-gene interaction score: {drug_candidates_df["norm_DRUG-GENE-INTERACTION_SCORE"].mean():.2f}')
    plt.axvline(x=drug_candidates_df['norm_DRUG-GENE-INTERACTION_SCORE'].median(), color='green', linestyle='--', label=f'Median normalized drug-gene interaction score: {drug_candidates_df["norm_DRUG-GENE-INTERACTION_SCORE"].median():.2f}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend()
    plt.show()

    normalized_drug_gene_interaction_score_cutoff = drug_candidates_df['norm_DRUG-GENE-INTERACTION_SCORE'].mean()
    print(f'Normalized Drug-Gene Interaction Score Cutoff: {normalized_drug_gene_interaction_score_cutoff:.2f}')

    ### plot the frequency of drugnome scores with frequency and percentage
    plt.figure(figsize=(10, 6))
    plt.hist(drug_candidates_df['norm_DRUGNOMEAI SUPPORT'], bins=50, color='blue', alpha=0.7)
    plt.title(f'Distribution of Drugnome Scores for {title}')
    plt.xlabel('Drugnome Score')
    plt.ylabel('Frequency')
    plt.twinx()
    percentage = drug_candidates_df['norm_DRUGNOMEAI SUPPORT'].value_counts(normalize=True) * 100
    plt.ylabel('Percentage (%)')
    plt.yticks([0, 20, 40, 60, 80, 100], [0, 20, 40, 60, 80, 100])
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.axvline(x=drug_candidates_df['norm_DRUGNOMEAI SUPPORT'].mean(), color='red', linestyle='--', label= f'Mean drugnome score: {drug_candidates_df["norm_DRUGNOMEAI SUPPORT"].mean():.2f}')
    plt.legend()
    plt.tight_layout()
    plt.show()

    normalized_drugnome_ai_score_cutoff = drug_candidates_df['norm_DRUGNOMEAI SUPPORT'].mean()
    print(f'Normalized Drugnome AI Score Cutoff: {normalized_drugnome_ai_score_cutoff:.2f}')


    ### plot distribution of cancer-sm scores
    plt.figure(figsize=(10, 6))
    plt.hist(drug_candidates_df['Cancer_targetability'], bins=50, color='green', alpha=0.7)
    plt.title(f'Distribution of Cancer-Targetability Scores for {title}')
    plt.xlabel('Cancer-Targetability Score')
    plt.ylabel('Frequency')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.axvline(x=drug_candidates_df['Cancer_targetability'].mean(), color='red', linestyle='--', label=f'Threshold ({drug_candidates_df['Cancer_targetability'].mean():.2f})')
    plt.axvline(x=drug_candidates_df['Cancer_targetability'].quantile(0.6), color='green', linestyle='--', label=f'upper quantile ({drug_candidates_df['Cancer_targetability'].quantile(0.6):.2f})')
    plt.legend()
    plt.tight_layout()
    plt.show()

    cancer_targetability_score_cutoff = drug_candidates_df['Cancer_targetability'].quantile(0.6)
    print(f'Cancer Targetability Score Cutoff: {cancer_targetability_score_cutoff:.2f}')
    print(f'Cancer Targetability Score Mean: {drug_candidates_df["Cancer_targetability"].mean():.2f}')
    print("Upper quantile of cancer targetability was chosen as it is crucial in cancer specific drugability")


    return {
        'normalized_priority_norm_score_cutoff': normalized_priority_norm_score_cutoff,
        'normalized_drug_gene_interaction_score_cutoff': normalized_drug_gene_interaction_score_cutoff,
        'normalized_drugnome_ai_score_cutoff': normalized_drugnome_ai_score_cutoff,
        'cancer_targetability_score_cutoff': cancer_targetability_score_cutoff,
    }


In [8]:
chemotherapy_medications = ['cisplatin', 'carboplatin', 'paclitaxel', 'docetaxel', '5-fluorouracil', '5-fu', 'fluorouracil','capecitabine', 'gemcitabine', 'doxorubicin', 'adriamycin', 'epirubicin', 'cyclophosphamide', 'cyclophosphamide', 'ifosfamide', 'etoposide', 'irinotecan', 'topotecan', 'vinorelbine', 'vinblastine', 'vincristine', 'bleomycin', 'mitomycin', 'mitoxantrone', 'oxaliplatin', 'pemetrexed', 'docetaxel', 'paclit', 'hydroxyurea', 'taxotere']
immuno_medications = ['pembrolizumab', 'toripalimab', 'nivolumab', 'avastin', 'bevacizumab', 'erbitux', 'cetuximab', 'keytruda', 'opdivo', 'yervoy', 'ipilimumab', 'revlimid', 'lenalidomide', 'thalomid', 'thalidomide']

In [9]:
### Function for displaying top genes and their immediate neighbors
def display_top_genes(top_gene_df, Drug_bank = Drug_bank, protein_interaction_df = protein_interaction):
    """
    Function to display top genes focusing on those in DRUG BANK and PPI database.

    """
    top_genes_connected = top_gene_df[top_gene_df['gene_name'].isin(Drug_bank['gene']) | top_gene_df['gene_name'].isin(dgidb['gene_name']) |
                                     top_gene_df['gene_name'].isin(protein_interaction['Translated_protein_1']) ] .sort_values(by = ['gistic_score',  'q_value'], ascending = [False, True])

    return top_genes_connected


def display_top_genes_and_neighbors(top_gene_df, Drug_bank = Drug_bank, protein_interaction_df = protein_interaction):
    """
    Function to display top genes and their immediate neighbors from PPI database.

    """
    top_genes_connected = protein_interaction_df[protein_interaction_df['Translated_protein_1'].isin(top_gene_df['gene_name'])]
    top_genes_connected = top_genes_connected[top_genes_connected['combined_score']>700] # PPI cut off for high confidence protien protein interactions
    total_genes = len(set(top_genes_connected['Translated_protein_1']).union(set(top_genes_connected['Translated_protein_2'])))
    print(f"Number of risk genes identified from CNV analysis: {len(top_gene_df['gene_name'].unique())}")
    print(f"Number of risk genes available in the PPI database: {len((top_genes_connected['Translated_protein_1'].unique()))}")
    print(f"Total number of immediate neighbors found through PPI: {len(top_genes_connected['Translated_protein_2'].unique())}")
    print(f'Total number of genes both risk genes and immediate neighbors in the protein interaction database: {total_genes}')
    print("These are the top gene candidates along with the genes found to be immediate neighbors through the STRING protein interaction database.")

    top_genes_direct = top_gene_df[top_gene_df['gene_name'].isin(Drug_bank['gene']) | top_gene_df['gene_name'].isin(dgidb['gene_name']) |
                                     top_gene_df['gene_name'].isin(protein_interaction['Translated_protein_1']) ] .sort_values(by = ['gistic_score',  'q_value'], ascending = [False, True])


    top_genes_connected_and_direct = pd.merge(top_genes_connected, top_genes_direct, left_on='Translated_protein_1', right_on='gene_name', how='left')
    top_genes_connected_and_direct = top_genes_connected_and_direct[['Translated_protein_1', 'Translated_protein_2','gene_name', 'combined_score', 'gistic_score', 'q_value','empirical_q_value', 'frequency_percentage', 'significant']]
    ### create df of top risk genes with a column of comma aggregated immediate neighbors, and another column of count of immediate neighbors
    top_genes_connected_and_direct_agg = top_genes_connected_and_direct.groupby('Translated_protein_1').agg({
        'Translated_protein_2': lambda x: ', '.join(x),
        'gene_name': 'first', # take first gene name as it is same for each risk gene
        'combined_score': 'max', # take max combined score of any connection
        'gistic_score': 'first', # take first gistic score as it is same for each risk gene
        'q_value': 'first', # take first q_value as it is same for each risk gene
        'empirical_q_value': 'first', # take first empirical_q_value as it is same for each risk gene
        'frequency_percentage': 'first', # take first frequency_percentage as it is same for each risk gene
        'significant': 'first' # take first significant as it is same for each risk gene
    }).reset_index()

    top_genes_connected_and_direct_agg.sort_values(by=['gistic_score', 'q_value'], ascending=[False, True], inplace=True)

    return top_genes_connected_and_direct_agg

NameError: name 'Drug_bank' is not defined

In [None]:
hpv_positive_drug_candidates = pd.read_csv('Results/SOM results/HPV positive drug candidates.csv')
hpv_positive_drug_candidates = hpv_positive_drug_candidates.merge(drugnome_probs_oncology, left_on='GENE_TARGET', right_on='Gene Name', how='left')
hpv_negative_drug_candidates = pd.read_csv('Results/SOM results/HPV negative drug candidates.csv')
hpv_negative_drug_candidates = hpv_negative_drug_candidates.merge(drugnome_probs_oncology, left_on='GENE_TARGET', right_on='Gene Name', how='left')

FileNotFoundError: [Errno 2] No such file or directory: 'Results/SOM results/HPV positive drug candidates.csv'

In [None]:
hpv_positive_drug_candidates = hpv_positive_drug_candidates[~hpv_positive_drug_candidates['DRUG'].isnull()]

hpv_negative_drug_candidates = hpv_negative_drug_candidates[~hpv_negative_drug_candidates['DRUG'].isnull()]


In [None]:
hpv_positive_drug_candidates

Unnamed: 0,GENE_TARGET,CONNECTED_TO,PPI_SCORE,DRUG,APPROVED,DRUG-GENE-INTERACTION_SCORE,DRUG-GENE-INTERACTION_TYPE,DRUG_BANK SUPPORT,DRUGNOMEAI SCORE,Gene Name,Cancer-sm,non-CPD-ab,CPD-sm,non-CPD-sm,CPD-ab,Cancer_targetability,CHEMO,IMMUNO
0,CDKN2A,,,ABEMACICLIB,1.0,0.744036,,False,"1599 5.8527\nName: drugnome_score, dtype: f...",CDKN2A,0.9528,0.1449,0.0834,0.0119,0.3898,1.4260,0,0
1,CDKN2A,,,LETROZOLE,1.0,0.112196,,False,"1599 5.8527\nName: drugnome_score, dtype: f...",CDKN2A,0.9528,0.1449,0.0834,0.0119,0.3898,1.4260,0,0
2,CDKN2A,,,ABEMACICLIB,1.0,0.744036,,False,"1599 5.8527\nName: drugnome_score, dtype: f...",CDKN2A,0.9528,0.1449,0.0834,0.0119,0.3898,1.4260,0,0
3,CDKN2A,,,PANITUMUMAB,1.0,0.100976,,False,"1599 5.8527\nName: drugnome_score, dtype: f...",CDKN2A,0.9528,0.1449,0.0834,0.0119,0.3898,1.4260,0,0
4,CDKN2A,,,CETUXIMAB,1.0,0.112196,,False,"1599 5.8527\nName: drugnome_score, dtype: f...",CDKN2A,0.9528,0.1449,0.0834,0.0119,0.3898,1.4260,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43370,SRC,CDKN2A,745.0,CENISERTIB,0.0,0.008240,,False,"110 10.809\nName: drugnome_score, dtype: fl...",SRC,0.9988,0.7611,0.9987,0.8834,0.9512,2.9487,0,0
43371,SRC,CDKN2A,745.0,PD-0166285,0.0,0.018699,inhibitor,False,"110 10.809\nName: drugnome_score, dtype: fl...",SRC,0.9988,0.7611,0.9987,0.8834,0.9512,2.9487,0,0
43372,SRC,CDKN2A,745.0,TIRBANIBULIN,1.0,0.777891,inhibitor,False,"110 10.809\nName: drugnome_score, dtype: fl...",SRC,0.9988,0.7611,0.9987,0.8834,0.9512,2.9487,0,0
43373,SRC,CDKN2A,745.0,CHEMBL:CHEMBL219557,0.0,0.324121,,False,"110 10.809\nName: drugnome_score, dtype: fl...",SRC,0.9988,0.7611,0.9987,0.8834,0.9512,2.9487,0,0


In [None]:
hpv_positive_drug_candidates['CHEMO'] = hpv_positive_drug_candidates['DRUG'].apply(lambda x: 1 if any(med in x.lower() for med in chemotherapy_medications) else 0)
hpv_positive_drug_candidates['IMMUNO'] = hpv_positive_drug_candidates['DRUG'].apply(lambda x: 1 if any(med in x.lower() for med in immuno_medications) else 0)
hpv_negative_drug_candidates['CHEMO'] = hpv_negative_drug_candidates['DRUG'].apply(lambda x: 1 if any(med in x.lower() for med in chemotherapy_medications) else 0)
hpv_negative_drug_candidates['IMMUNO'] = hpv_negative_drug_candidates['DRUG'].apply(lambda x: 1 if any(med in x.lower() for med in immuno_medications) else 0)