In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

###

targets = pd.read_pickle('./OT/Final/targets.pkl')

# See "Diseases.ipynb" for these files
custom = pd.read_excel('./Phenotyping/custom_phenotypes.xlsx')
custom['phecode1.2'] = custom['phecode1.2'].astype(float)
icd = pd.read_csv('./Phenotyping/icd_codes.csv')
merged_pheno = pd.read_excel('./Phenotyping/merged_phenotypes.xlsx')


# Gene-level features

In [None]:
# Open Targets target data 2024.09
genes = pd.read_pickle('./OT/Raw/targets.pkl')
genes = genes.loc[(genes['biotype'] == 'protein_coding')]
genes = genes[['approvedSymbol']].set_axis(['gene'],axis=1)
genes.to_pickle('./OT/Final/protein_coding_genes.pkl')


In [53]:
genes = pd.read_pickle('./OT/Final/protein_coding_genes.pkl')

# https://pharos.nih.gov
pharos = pd.read_csv('./Other/pharos.csv')
pharos = pharos[['Symbol','Target Development Level','IDG Family','Antibody Count']].set_axis(['gene','tdl','type','mab_count'],axis=1)
pharos['tdl'] = pharos['tdl'].fillna('Tdark')
pharos['type'] = pharos['type'].fillna('Other')
tp = genes.merge(pharos, how='left')

# Open Targets target prioritization view 2024.09
spec = pd.read_pickle('./OT/Raw/targetPrioritisation.pkl').rename({'targetId':'gene_id'},axis=1).merge(targets)
tp = tp.merge(spec[['gene','tissueSpecificity','tissueDistribution',
                    'hasLigand','hasSmallMoleculeBinder','hasPocket',
                    'mouseKOScore','isCancerDriverGene','paralogMaxIdentityPercentage']], how='left')

# https://www.proteinatlas.org/humanproteome/proteinclasses
tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_location_Intracellular.tsv', sep='\t')['Gene']),'loc_intracellular'] = 1
tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_location_Membrane.tsv', sep='\t')['Gene']),'loc_membrane'] = 1
tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_location_Secreted.tsv', sep='\t')['Gene']),'loc_secreted'] = 1
tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_class_Plasma.tsv', sep='\t')['Gene']),'loc_plasma'] = 1

tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_class_Enzymes.tsv', sep='\t')['Gene']),'class_enzyme'] = 1
tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_class_Transporters.tsv', sep='\t')['Gene']),'class_transporter'] = 1
tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_class_Transcription.tsv', sep='\t')['Gene']),'class_tf'] = 1
tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_class_G-protein.tsv', sep='\t')['Gene']),'class_gpcr'] = 1
tp.loc[tp['gene'].isin(pd.read_csv('./Other/HPA/protein_class_Voltage-gated.tsv', sep='\t')['Gene']),'class_vgic'] = 1

tp.loc[tp['type'] == 'Enzyme', 'class_enzyme'] = 1
tp.loc[tp['type'] == 'Transcription Factor', 'class_tf'] = 1
tp.loc[tp['type'] == 'Transporter', 'class_transporter'] = 1
tp.loc[tp['type'] == 'Kinase', 'class_enzyme'] = 1
tp.loc[tp['type'] == 'GPCR', 'class_gpcr'] = 1
tp.loc[tp['type'] == 'oGPCR', 'class_gpcr'] = 1
tp.loc[tp['type'] == 'Ion Channel', 'class_vgic'] = 1
tp = tp.drop('type',axis=1)
tp.to_pickle('./Features/tp.pkl')

# https://gnomad.broadinstitute.org/data#v4-constraint
gn = pd.read_csv('./Other/gnomad.v4.1.constraint_metrics.tsv', sep='\t')
gn = gn.loc[gn['canonical'] == True].rename({'gene':'og_gene'},axis=1)
gn = gn.merge(targets, how='left')
gn['gene'] = gn['gene'].fillna(gn['og_gene'])
gn = gn.groupby('gene')[['lof_hc_lc.oe','lof_hc_lc.pLI','lof_hc_lc.pRec','lof_hc_lc.pNull',
                         'lof.oe','lof.pLI','lof.pRec','lof.pNull','lof.oe_ci.upper','lof.oe_ci.upper_bin_decile',
                         'mis.oe','mis_pphen.oe','syn.oe']].mean().reset_index()
for col in ['lof_hc_lc.pLI','lof_hc_lc.pRec','lof_hc_lc.pNull','lof.pLI','lof.pRec','lof.pNull']:
    gn[col] = -np.log10(gn[col])
    gn[col] = gn[col].replace([np.inf, -np.inf], 325)
gn.loc[:, gn.columns != 'gene'] = gn.loc[:, gn.columns != 'gene'].round(4)
gn.to_pickle('./Features/constraint.pkl')

# https://academic.oup.com/nar/advance-article/doi/10.1093/nar/gkae1079/7907365#494838589
gofcards = pd.read_excel('./GoFCards/gofcards_data_download.xlsx')[['genesymbol']].rename({'genesymbol':'gene'},axis=1)
gofcards['gofcards'] = 1

# https://omim.org/downloads/
om = pd.read_csv('./Other/genemap2.txt',sep='\t', skiprows=3)
om = om[['Approved Gene Symbol','Phenotypes']].dropna()
om['Phenotypes'] = om['Phenotypes'].str.split(';')
om = om.explode('Phenotypes')
om = om.loc[om['Phenotypes'].str.contains('(3)', regex=False)]
#om['Inheritance'] = om['Phenotypes'].str.split(',').str[-1]
om.loc[om['Phenotypes'].str.contains('Autosomal recessive', case=False), 'Inheritance'] = 'AR'
om.loc[om['Phenotypes'].str.contains('Autosomal dominant', case=False), 'Inheritance'] = 'AD'
om['value'] = 1
om = pd.pivot_table(om, index='Approved Gene Symbol', columns='Inheritance', values='value').reset_index()
om = om.set_axis(['gene','omim_ad','omim_ar'],axis=1)

# https://forum.depmap.org/t/common-essential-gene-list/2576
dpm = pd.read_csv('./Other/Gene Dependency Profile Summary.csv')
dpm = dpm.loc[dpm['Dataset'] == 'DependencyEnum.Chronos_Combined']
dpm = dpm[['Gene','Strongly Selective','Common Essential']]
dpm = dpm.set_axis(['gene','depmap_ss','depmap_ce'],axis=1)
dpm[['depmap_ss','depmap_ce']] = dpm[['depmap_ss','depmap_ce']].astype(int)

# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0307312
# https://osf.io/z4dcp/?view_only=
gld = pd.read_csv('./Other/pgof_svm_poly_2023-07-25.tsv', sep='\t')[['gene','pGOF']]
gld = gld.merge(pd.read_csv('./Other/pdn_svm_poly_2023-07-25.tsv', sep='\t')[['gene','pDN']], how='outer')
gld = gld.merge(pd.read_csv('./Other/plof_svm_poly_2023-07-28.tsv', sep='\t')[['gene','pLOF']], how='outer')
gld = gld.set_axis(['gene','badonyi_pgof','badonyi_pdn','badonyi_plof'],axis=1)

# https://www.sciencedirect.com/science/article/pii/S0092867422007887#sec4.1
cs = pd.read_csv('./Other/Collins_rCNV_2022.dosage_sensitivity_scores.tsv', sep='\t')
cs = cs.set_axis(['gene','collins_phaplo','collins_ptriplo'],axis=1)

# https://search.clinicalgenome.org/kb/gene-dosage?page=1&size=All&search=
cds = pd.read_csv('./Other/clingen_dosage_sensitivity.csv')
cds = cds.loc[cds['Type'] == 'Gene']
cds['gene'] = cds['Gene Symbol /Region Name'].str.split('HGNC').str[0]
cds['clingen_hi'] = cds['HI Score'].map({'AutosomalRecessive':-2,'SensitivityUnlikely':-1,
                    'SufficientEvidence':3,'EmergingEvidence':2,'LittleEvidence':1,'NoEvidence':0})
cds = cds[['gene','clingen_hi']].dropna().drop_duplicates()

# IMPC mouse essentiality
impc = pd.read_csv('./Other/impc_essential_genes_full_dataset.csv')
impc = impc.loc[impc['orthologue_category'] == 'GOOD']
impc = impc[['human_ensembl_gene_acc_id','impc_via_category']].dropna()
impc = impc.set_axis(['gene_id','impc_ko'],axis=1).merge(targets)
impc['impc_ko'] = impc['impc_ko'].map({'Homozygous-Viable':0,'Homozygous-Lethal':2,'Homozygous-Subviable':1})
impc = impc[['gene','impc_ko']].dropna()

# Protein length
up = pd.read_csv('./Other/uniprot_sequence.csv').rename({'Symbol':'gene'},axis=1)
up['protein_length'] = up['Sequence'].str.len()
up = up[['gene','protein_length']]

# Tau metric
gtex = pd.read_csv('./Other/GTEx_Analysis_v10_RNASeQCv2.4.2_gene_median_tpm.gct', skiprows=2, sep='\t')
gtex = gtex.drop('Description',axis=1).rename({'Name':'gene_id'},axis=1)
gtex['gene_id'] = gtex['gene_id'].str.split('.').str[0]
gtex = gtex.merge(targets).drop('gene_id',axis=1).groupby('gene').mean().reset_index()
df_log = np.log2(gtex.iloc[:, 1:] + 1)
mx = df_log.max(axis=1)
tau_vals = (1 - df_log.div(mx, axis=0)).sum(axis=1) / (df_log.shape[1] - 1)
gtex['gtex_tau'] = tau_vals
gtex = gtex[['gene','gtex_tau']]

# Pocket prediction
fscore = pd.read_pickle('./Features/fscore.pkl').fillna(0)

# OncoKB update 2024-12-19
cg = pd.read_csv('./Other/cancerGeneList.tsv', sep='\t')[['Hugo Symbol','Is Oncogene','Is Tumor Suppressor Gene','# of occurrence within resources (Column J-P)']]
cg = cg.set_axis(['gene','oncokb_oncogene','oncokb_suppressor','oncokb_sources'],axis=1)
cg['oncokb_oncogene'] = cg['oncokb_oncogene'].map({'Yes':1})
cg['oncokb_suppressor'] = cg['oncokb_suppressor'].map({'Yes':1})

ess = gofcards.merge(om, how='outer').merge(dpm, how='outer').merge(gld, how='outer')\
        .merge(cs, how='outer').merge(cds, how='outer').merge(impc, how='outer').merge(up, how='outer')\
        .merge(gtex, how='outer').merge(fscore, how='outer').merge(cg, how='outer')
ess = ess.groupby('gene').max().reset_index()
ess.to_pickle('./Features/ess_haplo.pkl')


  impc = pd.read_csv('./Other/impc_essential_genes_full_dataset.csv')


# Gene-disease-specific features

## Genebass

In [63]:
# Identify present phenotypes
gbp = pd.read_excel('./Genebass/pheno_results.xlsx')
gbp = gbp[['trait_type','phenocode','description','description_more','coding_description']]

a = custom[['phenocode','custom_code']].drop_duplicates().dropna().rename({'custom_code':'Code'},axis=1)

b = gbp.loc[gbp['trait_type'] == 'icd_first_occurrence']
b['Code'] = b['description'].str[5:8]
b = b.loc[b['Code'].isin(merged_pheno['Code'])][['phenocode','Code']]

c = gbp.loc[gbp['trait_type'] == 'icd10']
c['Code'] = c['phenocode'].copy()
c = c.loc[c['Code'].isin(merged_pheno['Code'])][['phenocode','Code']]

gbp = pd.concat([a,b,c]).drop_duplicates()

d = gbp[['Code']].drop_duplicates()
d['Genebass'] = 1
d.to_csv('./Phenotyping/genebass_present.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  b['Code'] = b['description'].str[5:8]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c['Code'] = c['phenocode'].copy()


In [None]:
# Create gene burden features
gb = pd.read_csv('./Genebass/GB/gb.tsv', sep='\t')
gb = gb.merge(gbp).merge(targets)
gb = gb[['Code','gene','annotation','Pvalue_Burden','BETA_Burden']]
gb['Pvalue_Burden'] = -np.log10(gb['Pvalue_Burden'])
gb = gb.sort_values('Pvalue_Burden', ascending=False).drop_duplicates(['Code','gene','annotation'])
gb.loc[gb['BETA_Burden'] < 0, 'Pvalue_Burden'] = gb['Pvalue_Burden']*-1
gb = pd.pivot_table(gb, index=['Code','gene'], columns='annotation', values='Pvalue_Burden').reset_index()
gb = gb.set_axis(['Code','gene','genebass_gb_hclof_missense','genebass_gb_hclof','genebass_gb_hclof_lclof_missense'],axis=1)
gb.to_pickle('./Features/genebass_gb.pkl')


In [None]:
# Create single variant features
gb = pd.read_csv('./Genebass/GB/gb.tsv', sep='\t')
gb = gb[['gene_symbol','gene_id']].merge(targets).drop_duplicates()
sv = pd.read_pickle('./Genebass/SV/sv.pkl').rename({'gene':'gene_symbol'},axis=1)
sv = sv.merge(gbp).drop('phenocode',axis=1)
sv = sv.loc[sv['AC'] >= 10]
sv['Pvalue'] = -np.log10(sv['Pvalue'])
sv = sv.merge(gb, how='left')
sv.loc[sv['gene'].isna(), 'gene'] = sv['gene_symbol']
sv = sv.drop(['gene_symbol','gene_id','AF'],axis=1)
sv = sv.sort_values('Pvalue', ascending=False).drop_duplicates(['markerID','Code'])
sv.to_pickle('./Genebass/SV/sv_cleaned.pkl')

vep = pd.read_pickle('./Genebass/SV/vep_cleaned.pkl')
vep = vep[['ID','SYMBOL','annotation','hc_gof']]

sv = pd.read_pickle('./Genebass/SV/sv_cleaned.pkl')
sv = sv.rename({'markerID':'ID'},axis=1)
sv = sv.drop(['annotation','SE'],axis=1)
sv.loc[sv['Pvalue'] > 1000, 'Pvalue'] = sv.loc[sv['Pvalue'] < 1000]['Pvalue'].max()
sv = sv.merge(vep)
sv = sv.drop('gene',axis=1).rename({'SYMBOL':'gene'},axis=1)
sv = sv.sort_values('Pvalue',ascending=False)

gof = sv.loc[sv['hc_gof'].notna()].drop_duplicates(['gene','Code'])
gof.loc[gof['BETA'] < 0, 'Pvalue'] = gof['Pvalue']*-1
gof = gof[['Code','gene','Pvalue']].rename({'Pvalue':'genebass_sv_hcgof'},axis=1)

sv = sv.drop_duplicates(['gene','annotation','Code'])
sv.loc[sv['BETA'] < 0, 'Pvalue'] = sv['Pvalue']*-1
sv = pd.pivot_table(sv, index=['Code','gene'], columns='annotation', values='Pvalue').reset_index()
sv = sv.set_axis(['Code','gene','genebass_sv_hclof','genebass_sv_lclof',
                               'genebass_sv_lof_0.5missense','genebass_sv_lof_missense',
                               'genebass_sv_gof_0.5missense','genebass_sv_gof_missense',
                               'genebass_sv_other_0.5missense','genebass_sv_other_missense'],axis=1)
sv = sv.merge(gof, on=['Code','gene'], how='outer')
sv = sv.dropna(thresh=3, axis=0)
sv.to_pickle('./Features/genebass_sv.pkl')


  result = getattr(ufunc, method)(*inputs, **kwargs)


## MVP

In [67]:
# Identify present phenotypes
mp = pd.read_excel('./MVP/mvp_phenotypes.xlsx')
mp = mp.loc[mp['Category'] == 'PheCodes']
mp['pheno'] = mp['Trait'].str.replace('Phe_','').str.replace('_','.')
mp['pheno'] = pd.to_numeric(mp['pheno'], errors='coerce')
mp = mp[['pheno']].dropna()
mp = pcc.merge(mp)[['Code']].drop_duplicates()
mp['MVP'] = 1
mp.to_csv('./Phenotyping/mvp_present.csv', index=False)


In [None]:
# Create eQTL features
pcc = pd.read_excel('./Phenotyping/phecode_to_code.xlsx').rename({'phecode1.2':'pheno'},axis=1)
df = pd.read_csv('./MVP/combined_eqtl_closest.tsv', sep='\t')
df['pheno'] = df['pheno'].str.replace('Phe_','')
df['pheno'] = df['pheno'].str.replace('_','.').astype(float)
df = df.merge(pcc)
df = df.groupby(['Code','gene'])[['opposite','same']].max().reset_index()
df = df.set_axis(['Code','gene','mvp_eqtl_closest_opposite','mvp_eqtl_closest_same'],axis=1)
df.to_pickle('./Features/mvp_eqtl_closest.pkl')


## PanUKBB

In [None]:
# Identify present phenotypes
pu = pd.read_excel('./PanUKBB/panukbb_pheno.xlsx', usecols = ['trait_type','phenocode','description','aws_link'])
pu_icd = pu.loc[(pu['trait_type'] == 'icd10') & (pu['phenocode'].isin(merged_pheno['Code']))]
pu_icd['Code'] = pu_icd['phenocode'].copy()
pu_icd['pheno'] = pu_icd['trait_type'] + '-' + pu_icd['phenocode']
pu_icd = pu_icd[['pheno','Code']]
pu_p12 = pu.loc[pu['trait_type'] == 'phecode']
pcc = pd.read_excel('./Phenotyping/phecode_to_code.xlsx')
pu_p12['phecode1.2'] = pu_p12['phenocode'].astype(float)
pu_p12 = pu_p12.merge(pcc)
pu_p12['pheno'] = pu_p12['trait_type'] + '-' + pu_p12['phenocode'].astype(str)
pu_p12 = pu_p12[['pheno','Code']]
pu = pd.concat([pu_icd,pu_p12]).drop_duplicates()

d = pu[['Code']].drop_duplicates()
d['PanUKBB'] = 1
d.to_csv('./Phenotyping/panukbb_present.csv', index=False)


In [None]:
# Create eQTL features
df = pd.read_csv('./PanUKBB/combined_eqtl_closest.tsv', sep='\t')
df = df.merge(pu)
df = df.groupby(['Code','gene'])[['opposite','same']].max().reset_index()
df = df.set_axis(['Code','gene','panukbb_eqtl_closest_opposite','panukbb_eqtl_closest_same'],axis=1)
df.to_pickle('./Features/panukbb_eqtl_closest.pkl')


## Jurgens et al. rare variant analysis

In [74]:
# Identify present phenotypes
results = pd.read_pickle('./Jurgens/UKB_AoU_MGB_combined.pkl')
pcc = pd.read_excel('./Phenotyping/phecode_to_code.xlsx')
jp = pd.read_excel('./Jurgens/Jurgens.xlsx').rename({'PheCode':'phecode1.2'},axis=1)
jp['phecode1.2'] = jp['phecode1.2'].astype(float)
jp = jp.merge(pcc)[['Meaning','Code']].set_axis(['phenotype','Code'],axis=1).drop_duplicates()

d = jp[['Code']].drop_duplicates()
d['Jurgens'] = 1
d.to_csv('./Phenotyping/jurgens_present.csv', index=False)


In [52]:
# Create gene burden features
results = results.merge(jp)
results = results.drop('phenotype',axis=1)
results['mask'] = results['mask'].map({'lof_ms0.5_af0.01':'jurgens_lof_0.5missense',
                                       'lof_ms0.5_af0.001':'jurgens_lof_0.5missense',
                                       'lof_ms0.5_af0.00001':'jurgens_lof_0.5missense',
                                       'ms0.2_af0.00001':'jurgens_0.2missense',
                                       'lof_ms0.8_af0.01':'jurgens_lof_0.8missense',
                                       'lof_ms0.8_af0.001':'jurgens_lof_0.8missense',
                                       'lof_af0.01':'jurgens_lof',
                                       'lof_af0.001':'jurgens_lof',
                                       'ms0.5_af0.00001':'jurgens_0.5missense'})
results = results.sort_values('p', ascending=False).drop_duplicates(['gene','Code','mask'])
results.loc[results['beta'] < 0, 'p'] = results['p']*-1
results = pd.pivot_table(results, index=['gene','Code'], columns='mask', values='p').reset_index()
results.to_pickle('./Features/jurgens.pkl')


## Finngen

In [12]:
# Identify present phenotypes
fgm = pd.read_csv('./Finngen/finngen_R12_manifest.tsv', sep='\t')
fg = pd.read_excel('./Finngen/finngen_R12_endpoints.xlsx')[['LEVEL','NAME','LONGNAME','HD_ICD_10','COD_ICD_10','version','PARENT']]
fg = fg.loc[fg['NAME'].isin(fgm['phenocode'])]
fg['ICD'] = fg['HD_ICD_10'].str.replace('.','')

a = fg.loc[fg['ICD'].isin(merged_pheno['Code'])]
a = a[['NAME','ICD']].set_axis(['pheno','Code'],axis=1)

b = fg.merge(custom[['finngen','custom_code']].dropna().drop_duplicates().rename({'finngen':'LONGNAME'},axis=1))
b = b[['LONGNAME','custom_code']].set_axis(['pheno','Code'],axis=1)

c = fg.loc[(~fg['ICD'].isin(merged_pheno['Code'])) & (fg['HD_ICD_10'].notna())]
c['ICD_3c'] = c['ICD'].str[:3]
c = c.loc[c['ICD_3c'].isin(merged_pheno['Code'])]
c_inc = pd.read_excel('./Finngen/bar_pheno.xlsx')
c_inc = c_inc.loc[c_inc['Include'] == 'Y']['NAME']
c = c.loc[(~c['HD_ICD_10'].str.contains('|', regex=False)) | (c['NAME'].isin(c_inc))]
c = c[['NAME','ICD_3c']].set_axis(['pheno','Code'],axis=1)

fg = pd.concat([a,b,c]).drop_duplicates()

d = fg[['Code']].drop_duplicates()
d['FinnGen'] = 1
d.to_csv('./Phenotyping/finngen_present.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c['ICD_3c'] = c['ICD'].str[:3]


In [None]:
# Create eQTL features
te = pd.DataFrame(columns=['Code','gene'])

for tissue in ['Adipose_Subcutaneous','Whole_Blood','Lung','Liver','Brain_Cortex',
               'Heart_Left_Ventricle','Kidney_Cortex','Muscle_Skeletal','Artery_Tibial','Cells_Cultured_fibroblasts']:
    df = pd.read_csv(f'./Finngen/combined_eqtl_{tissue}_closest.tsv', sep='\t')
    df = df.merge(fg)
    df = df.groupby(['Code','gene'])[['opposite','same']].max().reset_index()
    df = df.set_axis(['Code','gene','finngen_eqtl_'+tissue+'_closest_opposite','finngen_eqtl_'+tissue+'_closest_same'],axis=1)
    te = te.merge(df, on=['Code','gene'], how='outer')

te.to_pickle('./Features/finngen_eqtl_tissue_closest.pkl')


In [None]:
# Create rare variant features
df = pd.read_csv('./Finngen/rare_missense_lof.tsv', sep='\t')
vep = pd.read_pickle('./Finngen/Annotations/vep_cleaned.pkl')
vep = vep[['ID','SYMBOL','annotation']].set_axis(['id','gene','annotation'],axis=1).dropna()
df = df.merge(vep).merge(fg)
df = df[['id','beta','log10p','gene','annotation','Code']]
df = df.sort_values('log10p', ascending=False).drop_duplicates(['Code','gene','annotation'])
df.loc[df['beta'] < 0, 'log10p'] = df['log10p']*-1
df = pd.pivot_table(df, index=['Code','gene'], columns='annotation', values='log10p').reset_index()
df = df.set_axis(['Code','gene','finngen_sv_hclof','finngen_sv_lclof','finngen_sv_gof',
                  'finngen_sv_other_0.5missense','finngen_sv_other_missense'],axis=1)
df.to_pickle('./Features/finngen_sv.pkl')


In [2]:
# Create gene burden features
fl = pd.read_csv('./Finngen/LOF/finngen_R12_lof.txt', sep='\t')
fl = fl.drop(['CHROM','GENPOS','ALLELE0','SE','CHISQ','EXTRA'],axis=1)
fl['MAC'] = fl['A1FREQ']*fl['N']*2
fl = fl.loc[fl['LOG10P'] >= -np.log10(0.05)]
fl = fl.loc[fl['MAC'] >= 10]
fl['gene'] = fl['ID'].str.split('.').str[0]
fl = fl.rename({'PHENO':'pheno'},axis=1).merge(fg)
fl = fl.sort_values('LOG10P',ascending=False).drop_duplicates(['Code','gene'])
fl.loc[fl['BETA'] < 0, 'LOG10P'] = fl['LOG10P']*-1
fl = fl[['Code','gene','LOG10P']].rename({'LOG10P':'finngen_gb_lof'},axis=1)
fl.to_pickle('./Features/finngen_gb.pkl')
