In [23]:
# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
import os, sys
# Put the parent of the working directory on sys.path (once), presumably so the
# project-local `src` package imported below can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import numpy as np
from src.utils import flatten_level_columns as flc

import warnings
# NOTE(review): this silences *every* Python warning for the rest of the session,
# including numpy divide-by-zero and pandas chained-assignment warnings.
warnings.filterwarnings("ignore")

# Display limits for rendered DataFrames.
pd.set_option("display.max_columns",999)
pd.set_option("display.max_rows",100)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Information content

In [8]:
# Each k-mer length was scored in four chunk files (0-3); read every chunk and
# stack them into one frame per length. The original row index of each chunk is kept.
scored_9mers, scored_8mers, scored_10mers, scored_11mers = (
    pd.concat([
        pd.read_csv(f'../output/{k}mers_humanproteome_chunk_{chunk}scored.txt')
        for chunk in range(4)
    ])
    for k in (9, 8, 10, 11)
)

In [731]:
# Write the HLA-A02:01 9-mers with EL_Rank <= 5 as a bare peptide list
# (one peptide per line, no index, no header).
(
    scored_9mers
    .loc[(scored_9mers['HLA'] == "HLA-A02:01") & (scored_9mers['EL_Rank'] <= 5), 'Peptide']
    .to_csv('rank_5_peps.txt', index=False, header=False)
)

In [190]:
# For every allele present in the 9-mer table, report how many peptides have EL_Rank < 0.5.
for x in scored_9mers['HLA'].unique():
    print(f'{x}, N = {((scored_9mers["HLA"] == x) & (scored_9mers["EL_Rank"] < 0.5)).sum()}')

HLA-B35:01, N = 92237
HLA-B15:01, N = 103266
HLA-B07:02, N = 105471
HLA-A02:06, N = 109028
HLA-A02:01, N = 110177
HLA-A24:02, N = 92307
HLA-A03:01, N = 58403
HLA-A01:01, N = 50808
HLA-B27:05, N = 102726
HLA-A11:01, N = 94623


In [724]:
# Alphabet of the 20 residues used for encoding; list order defines the one-hot column order.
AA_KEYS = list('ARNDCQEGHILKMFPSTWYV')

# Forward (residue -> column index) and reverse (column index -> residue) lookups.
CHAR_TO_INT = {residue: index for index, residue in enumerate(AA_KEYS)}
INT_TO_CHAR = dict(enumerate(AA_KEYS))

def onehot_encode(sequence):
    """
    One-hot encodes a single sequence over the AA_KEYS alphabet.

    Parameters
    ----------
    sequence : iterable of single-character residues, each a key of CHAR_TO_INT

    Returns
    -------
    Integer np.ndarray of shape (len(sequence), len(AA_KEYS)); row i holds a
    single 1 in the column given by CHAR_TO_INT[sequence[i]].
    An empty sequence yields an array of shape (0, len(AA_KEYS)).

    Raises
    ------
    KeyError if a residue is not a key of CHAR_TO_INT.
    """
    # Explicit integer dtype so that an empty sequence still gives a valid index array.
    int_encoded = np.array([CHAR_TO_INT[char] for char in sequence], dtype=int)
    # Row k of the identity matrix is the one-hot vector for index k, so fancy
    # indexing builds the whole encoding at once and keeps the 2-D shape even
    # when there are no residues.
    return np.eye(len(AA_KEYS), dtype=int)[int_encoded]

def onehot_decode(onehot_sequence):
    """
    Turns a 2-D one-hot array (one row per position) back into a string by
    taking the arg-max column of each row and mapping it through INT_TO_CHAR.
    """
    column_indices = onehot_sequence.argmax(axis=1).tolist()
    return ''.join(INT_TO_CHAR[index] for index in column_indices)

def onehot_batch_encode(sequences):
    """
    Encodes every sequence with onehot_encode and stacks the results along a
    new leading axis (np.stack, so all encodings must share one shape).
    """
    encoded = list(map(onehot_encode, sequences))
    return np.stack(encoded)

def onehot_batch_decode(onehot_sequences):
    """
    Decodes each one-hot matrix with onehot_decode and returns the decoded
    strings stacked into a numpy array.
    """
    decoded = list(map(onehot_decode, onehot_sequences))
    return np.stack(decoded)

def compute_pfm(sequences):
    """
    Position frequency matrix of a collection of sequences.

    The sequences are one-hot encoded, summed over the sequence axis, and
    divided by the number of sequences, giving one row per position and one
    column per entry of the encoding alphabet.
    """
    n_sequences = len(sequences)
    residue_counts = onehot_batch_encode(sequences).sum(axis=0)
    return residue_counts / n_sequences

def compute_ic_position(matrix, position):
    """
    Information content of one position of a frequency matrix, in log base 20:

        IC = 1 + sum_a p_a * log20(p_a)

    where p_a are the entries of matrix[position]. A row with a single entry
    equal to 1 gives IC = 1; a row of twenty entries equal to 1/20 gives IC = 0.

    Parameters
    ----------
    matrix : 2-D array-like of frequencies, indexed as matrix[position]
    position : row to evaluate

    Returns
    -------
    The information content of that row as a float.
    """
    row = np.asarray(matrix[position])
    # Evaluate the logarithm only where the frequency is positive. Entries that
    # are zero keep a log-term of 0 (the p*log(p) -> 0 convention) without ever
    # computing log(0), so no divide-by-zero RuntimeWarning is emitted.
    row_log20 = np.zeros(row.shape, dtype=float)
    np.log(row, out=row_log20, where=row > 0)
    row_log20 /= np.log(20)
    IC = 1 + np.sum(row * row_log20)
    return IC

def compute_ic_sequence(matrix):
    """
    Applies compute_ic_position to every row of the frequency matrix and
    returns the per-position information content as a 1-D array.
    """
    n_positions = matrix.shape[0]
    per_position = [compute_ic_position(matrix, index) for index in range(n_positions)]
    return np.array(per_position)

def get_mia(IC_array, threshold=0.3):
    """
    Returns the indices (along the first axis) of the entries of IC_array that
    are strictly below `threshold`.
    """
    below_threshold = IC_array < threshold
    return np.nonzero(below_threshold)[0]

In [725]:
# Lookup from '<k>mers' to the matching scored table.
# NOTE: the name `results` is rebound to a DataFrame by the next cell.
results = {
    f'{k}mers': scored
    for k, scored in zip((8, 9, 10, 11),
                         (scored_8mers, scored_9mers, scored_10mers, scored_11mers))
}

In [726]:
# Per (k, HLA) information content of the peptides with EL_Rank <= 1.0.
#
# The scored tables are looked up through a dict built here rather than through
# `results`: this cell rebinds `results` to the DataFrame below, so reading
# `results` as a dict made the cell fail when executed a second time.
scored_by_k = {8: scored_8mers, 9: scored_9mers, 10: scored_10mers, 11: scored_11mers}

data = []
for k, scored in scored_by_k.items():
    # The allele list is taken from the 9-mer table for every k.
    for HLA in scored_9mers.HLA.unique():
        binders = scored.query("HLA == @HLA and EL_Rank<=1.0")['Peptide']
        # Peptides containing "X" are dropped: "X" is not in the one-hot alphabet.
        peptides = [x for x in binders if "X" not in x]
        ics = compute_ic_sequence(compute_pfm(peptides))
        data.append([k, HLA, ics])
results = pd.DataFrame(data, columns=['k', 'HLA', 'positions'])

In [727]:
# Per-position information content for HLA-A02:01 9-mers (exactly one row is expected).
results.loc[(results['HLA'] == "HLA-A02:01") & (results['k'] == 9), 'positions'].item()

array([0.08738859, 0.86406531, 0.05486578, 0.07117378, 0.03989642,
       0.07332876, 0.04793575, 0.05085757, 0.49828065])

In [728]:
# Positions of the HLA-A02:01 9-mer profile whose information content is below 0.3.
get_mia(
    results.loc[(results['HLA'] == "HLA-A02:01") & (results['k'] == 9), 'positions'].item(),
    threshold=0.3,
)

array([0, 2, 3, 4, 5, 6, 7], dtype=int64)

In [729]:
# Export the IC table with the 'positions' column renamed to 'ics'.
# The row index is written too (to_csv default), and each 'ics' cell is a numpy
# array, so it is stored as the array's text representation.
results.rename(columns={'positions':'ics'}).to_csv('../output/df_ic_positions.csv')

# cedar

In [699]:
def query_ELIS(df):
    """
    Keeps the rows whose 'assay_method/technique' is "ELISPOT" or "ELISA" and
    returns them sorted by 'epitope_description'.
    """
    is_elis = df['assay_method/technique'].isin(["ELISPOT", "ELISA"])
    return df.loc[is_elis].sort_values('epitope_description')

def keep_full_HLA(df):
    """
    Keeps the rows whose 'mhc_allele name' contains a ":" (e.g. "HLA-A*02:01"),
    dropping allele names written without that separator.
    """
    has_colon = df['mhc_allele name'].str.contains(":")
    return df.loc[has_colon]

def get_dupe_unique_df(df):
    """
    Splits df by 'epitope_description' into:
      - unique_df: one row per epitope (the first occurrence),
      - dup_df: every row of an epitope that occurs more than once, sorted by epitope,
      - common_indices: the index labels present in both frames.
    Returned in the order (unique_df, dup_df, common_indices).
    """
    key = 'epitope_description'
    unique_df = df.drop_duplicates(subset=key, keep='first')
    repeated = df.duplicated(subset=key, keep=False)
    dup_df = df.loc[repeated].sort_values(key)
    common_indices = dup_df.index.join(unique_df.index, how='inner')
    return unique_df, dup_df, common_indices

def get_agg_label(dup_df):
    """
    Aggregated binary label per epitope.

    A row is labelled 1 when its 'assay_qualitative measure' contains "Pos",
    else 0; an epitope gets the maximum label over its rows (1 if any row is positive).

    Side effect: a 'label' column is added to (or overwritten in) dup_df.

    Returns
    -------
    DataFrame indexed by 'epitope_description' with a single 'label' column.
    """
    dup_df['label'] = dup_df['assay_qualitative measure'].apply(lambda x: 1 if 'Pos' in x else 0)
    # The per-(epitope, label) count table computed here previously was never used.
    return dup_df.groupby('epitope_description').agg({'label': "max"})

def get_agg(dup_df):
    """
    Per-epitope aggregate of repeated assay results.

    A row is labelled 1 when its 'assay_qualitative measure' contains "Pos", else 0.

    Side effect: a 'label' column is added to (or overwritten in) dup_df.

    Returns
    -------
    DataFrame indexed by 'epitope_description' with columns
      - agg_label: 1 if any row of the epitope is positive, else 0
      - percentage_pos: fraction of the epitope's rows that are positive
      - total: number of rows for the epitope
    """
    dup_df['label'] = dup_df['assay_qualitative measure'].apply(lambda x: 1 if 'Pos' in x else 0)

    # The mean of a 0/1 label is the positive fraction. The earlier version took
    # the max over the two label groups of each group's share, which returned the
    # *negative* fraction whenever an epitope had more negative than positive rows.
    agg = dup_df.groupby('epitope_description')['label'].agg(
        agg_label='max',
        percentage_pos='mean',
        total='count',
    )
    return agg

def assign_agg_metrics(unique_df, agg_df, common_indices):
    """
    Adds 'agg_label', 'total_count' and 'percentage_pos' to unique_df.

    Every row first gets the values of a single measurement: its own 0/1 label
    ("Pos" in 'assay_qualitative measure'), a count of 1, and a positive fraction
    equal to the label. Rows listed in common_indices are then overwritten with
    the aggregated values of their epitope from agg_df.

    Parameters
    ----------
    unique_df : DataFrame with 'epitope_description' and 'assay_qualitative measure';
        modified in place and also returned
    agg_df : DataFrame indexed by epitope description with columns
        'agg_label', 'total' and 'percentage_pos' (the layout returned by get_agg)
    common_indices : index labels of unique_df whose epitope has an entry in agg_df

    Raises
    ------
    KeyError if an epitope selected by common_indices is missing from agg_df.
    """
    unique_df['agg_label'] = unique_df['assay_qualitative measure'].apply(lambda x: 1 if 'Pos' in x else 0)
    unique_df['total_count'] = 1
    # Float from the start: fractional values are written into this column below.
    unique_df['percentage_pos'] = unique_df['agg_label'].astype(float)

    # Use the agg_df argument (the previous body read a global named `agg`) and
    # look rows up by epitope, so the result does not depend on agg_df happening
    # to be ordered like common_indices.
    epitopes = unique_df.loc[common_indices, 'epitope_description'].tolist()
    aligned = agg_df.loc[epitopes]
    unique_df.loc[common_indices, 'agg_label'] = aligned['agg_label'].values
    unique_df.loc[common_indices, 'total_count'] = aligned['total'].values
    unique_df.loc[common_indices, 'percentage_pos'] = aligned['percentage_pos'].values
    return unique_df

In [700]:
# Columns of the exported table; the last three are the aggregated metrics
# added by assign_agg_metrics.
COLS = [
    'epitope_epitope id',
    'epitope_description',
    'epitope_starting position',
    'epitope_ending position',
    'epitope_antigen name',
    'epitope_parent protein',
    'related object_epitope relationship',
    'related object_description',
    'related object_parent protein',
    'mhc_allele name',
    'mhc_allele evidence code',
    'assay_method/technique',
    'agg_label',
    'total_count',
    'percentage_pos',
]

# Read the three exports. Each file has a two-row header, which is passed through
# `flc` (presumably flattening it to the single-level 'group_field' names used below).
epitopes = flc(pd.read_csv('../data/epitope_export_mhc1_TCR-MHC_220510.csv', header=[0, 1]))
mhc = flc(pd.read_csv('../data/mhc_ligand_export_220510.csv', header=[0, 1]))
df_tc = flc(pd.read_csv('../data/tcell_export_mhc1_220510.csv', header=[0, 1]))

# Restrict the T-cell assays to ELISPOT / ELISA rows with a ':'-containing allele name.
# Labelling rule: an epitope is positive if it tested positive at least once,
# and negative only if it has no positive result at all.
elis = df_tc.pipe(query_ELIS).pipe(keep_full_HLA)
unique_df, dup_df, common_indices = get_dupe_unique_df(elis)

# Aggregate the repeated epitopes, then keep one row per epitope carrying the
# aggregated label and counts.
agg = get_agg(dup_df)
unique_df = assign_agg_metrics(unique_df, agg, common_indices)
filtered_df = unique_df.loc[:, COLS]

In [667]:
# Save the one-row-per-epitope table (COLS only), without the row index.
filtered_df.to_csv('../data/filtered_epitope_tc.csv', index=False)

In [708]:
# Epitopes having at least one T-cell record whose allele name contains no ':'.
not_resolved = df_tc.loc[
    ~df_tc['mhc_allele name'].str.contains(":"),
    'epitope_description',
].unique()

# For those epitopes, show one MHC-ligand record (the first per epitope) that
# does have a ':'-containing allele name; the three aggregate columns at the
# end of COLS are left out.
(
    mhc.loc[mhc['mhc_allele name'].str.contains(":")]
    .drop_duplicates('epitope_description')
    .loc[lambda frame: frame['epitope_description'].isin(not_resolved)]
    .sort_values('epitope_description')
    .loc[:, COLS[:-3]]
)

Unnamed: 0,epitope_epitope id,epitope_description,epitope_starting position,epitope_ending position,epitope_antigen name,epitope_parent protein,related object_epitope relationship,related object_description,related object_parent protein,mhc_allele name,mhc_allele evidence code,assay_method/technique
54,155,AAGIGILTV,27.0,35.0,Melanoma antigen recognized by T-cells 1,Melanoma antigen recognized by T-cells 1,,,Melanoma antigen recognized by T-cells 1,HLA-A*02:01,,purified MHC/competitive/radioactivity
3369,551,ACDPHSGHFV,,,,,neo-epitope,ARDPHSGHFV,,HLA-A*02:01,,purified MHC/direct/fluorescence
1056,189944,ALAGIGILTV,,,,,analog,EAAGIGILTV,,HLA-A*02:01,,x-ray crystallography
3316,1069080,ALDPHSGHFV,,,,,analog,ARDPHSGHFV,,HLA-A*02:01,,purified MHC/direct/fluorescence
15570,2489,ALDVYNGLL,299.0,307.0,Prostatic acid phosphatase precursor,Prostatic acid phosphatase,,,Prostatic acid phosphatase,HLA-A*02:01,,cellular MHC/direct/fluorescence
448,2688,ALLAVGATK,17.0,25.0,Melanocyte protein Pmel 17 precursor,Melanocyte protein PMEL,,,Melanocyte protein PMEL,HLA-A*03:01,,purified MHC/competitive/radioactivity
41606,1860817,CYASGWGSI,152.0,160.0,Prostate-specific antigen precursor,Prostate-specific antigen,,,Prostate-specific antigen,HLA-A*24:02,,binding assay
41640,1861123,DFIATLGKL,186.0,194.0,Prostatic acid phosphatase precursor,Prostatic acid phosphatase,,,Prostatic acid phosphatase,HLA-A*24:02,,cellular MHC/direct/fluorescence
56,10987,EAAGIGILTV,26.0,35.0,Melanoma antigen recognized by T-cells 1,Melanoma antigen recognized by T-cells 1,,,Melanoma antigen recognized by T-cells 1,HLA-A*02:01,,purified MHC/competitive/radioactivity
112,11010,EADPTGHSY,161.0,169.0,Melanoma-associated antigen 1,Melanoma-associated antigen 1,,,Melanoma-associated antigen 1,HLA-A*01:01,,purified MHC/competitive/radioactivity


In [718]:
# Scored tables name alleles like "HLA-A02:01"; insert '*' after the fifth
# character to get the "HLA-A*02:01" form used in 'mhc_allele name'.
lst = [f'{allele[:5]}*{allele[5:]}' for allele in scored_9mers.HLA.unique()]
query = filtered_df.loc[filtered_df['mhc_allele name'].isin(lst)]
len(query), len(filtered_df)

(1166, 1802)

In [720]:
# Number of epitopes per aggregated label among the scored alleles.
query.groupby('agg_label')[['epitope_description']].count()

Unnamed: 0_level_0,epitope_description
agg_label,Unnamed: 1_level_1
0,641
1,525
