In [None]:
import pandas as pd
import os
import glob
import numpy as np

## Extracting relevant columns

In [None]:
# Download GTEx_Analysis_v10_eQTL.tar from https://www.gtexportal.org/home/downloads/adult-gtex/qtl and extract

# Cleaning raw eQTL data from GTEx v10: for every parquet file in input_dir keep
# only the columns used downstream, tag each row with the tissue name derived
# from the file name, and write one slimmed parquet file per tissue.
input_dir = './eQTL/Raw/'
output_dir = './eQTL/Processed/'
eqtl_columns = ['gene_id', 'variant_id', 'tss_distance', 'pval_nominal', 'slope']

# Writing fails if the target directory is missing, so create it up front.
os.makedirs(output_dir, exist_ok=True)

# sorted() makes the processing order independent of the filesystem.
for file in sorted(os.listdir(input_dir)):
    if not file.endswith('.parquet'):
        continue
    tissue_name = file.replace('.v10.eQTLs.signif_pairs.parquet', '')
    # Ask the reader for just the needed columns instead of loading the whole
    # table and discarding most of it afterwards.
    df = pd.read_parquet(os.path.join(input_dir, file), columns=eqtl_columns)
    df['tissue'] = tissue_name
    df.to_parquet(os.path.join(output_dir, f"{tissue_name}.parquet"))

# Combining into a single file.
# The per-tissue files are read in sorted order so the row order of the combined
# table does not depend on the order in which the filesystem lists them.
input_dir = "./eQTL/Processed"
parquet_files = sorted(glob.glob(f"{input_dir}/*.parquet"))
if not parquet_files:
    # pd.concat on an empty list only reports "No objects to concatenate";
    # say which directory was empty instead.
    raise FileNotFoundError(f"No per-tissue parquet files found in {input_dir}")
combined_df = pd.concat([pd.read_parquet(pf) for pf in parquet_files], ignore_index=True)
combined_df.to_parquet("./eQTL/combined.parquet")

# Select protein coding genes
# Build a two-column lookup (gene_id, gene) from the targets table, restricted to
# rows whose biotype is 'protein_coding'.
targets = pd.read_pickle('./OT/Raw/targets.pkl')
targets = (
    targets.loc[targets['biotype'] == 'protein_coding', ['id', 'approvedSymbol']]
    .rename(columns={'id': 'gene_id', 'approvedSymbol': 'gene'})
)

df = pd.read_parquet("./eQTL/combined.parquet")
# Keep only the part of gene_id before the first '.' so it can be matched
# against the lookup (presumably this drops a version suffix - confirm).
df['gene_id'] = df['gene_id'].str.split('.').str[0]
has_target = df['gene_id'].isin(targets['gene_id'])
df = df.loc[has_target]
# Remove the '_b38' and 'chr' substrings from the variant identifiers.
df['variant_id'] = df['variant_id'].str.replace('_b38','').str.replace('chr','')
# Joins on the columns shared by both frames, which attaches the gene symbol.
df = df.merge(targets)
df = df[['gene','variant_id','tss_distance','pval_nominal','slope','tissue']]
# Store significance as -log10(p), capped at 500.
neg_log10_p = -np.log10(df['pval_nominal'])
df['pval_nominal'] = neg_log10_p.mask(neg_log10_p > 500, 500)
# NOTE: this overwrites the file that was read at the top of this step.
df.to_parquet("./eQTL/combined.parquet")

# Slim table: rows ordered by descending significance, first row kept per
# (gene, variant_id, tissue), slope reduced to its sign (+1 for >= 0, -1 for < 0),
# significance rounded to 4 decimals and renamed to eqtl_log10p.
df = (
    df.sort_values('pval_nominal', ascending=False)
    .drop_duplicates(['gene','variant_id','tissue'])
    .assign(
        # Two masks rather than a single if/else so that a missing slope stays missing.
        slope=lambda t: t['slope'].mask(t['slope'] >= 0, 1).mask(t['slope'] < 0, -1),
        pval_nominal=lambda t: t['pval_nominal'].round(4),
    )
    .rename(columns={'pval_nominal': 'eqtl_log10p'})
)
df.to_parquet("./eQTL/combined_slim.parquet")


## Creating subsetted files for each dataset

In [None]:
# Start from the slim table: it is the file that carries the `eqtl_log10p` column
# and stores slope as +/-1. combined.parquet still names that column
# `pval_nominal`, so sorting on `eqtl_log10p` below would raise a KeyError.
df = pd.read_parquet("./eQTL/combined_slim.parquet")


def _swap_alleles(variant_id):
    """Return variant_id with its last two '_'-separated fields exchanged."""
    fields = variant_id.split('_')
    fields[-2:] = fields[-2:][::-1]
    return '_'.join(fields)


# Most significant tissue AND closest gene
# 1) Order rows by descending significance (stable, so an already sorted file
#    keeps its order) and keep the first row per (gene, variant_id), i.e. the
#    most significant tissue for that pair.
at = (
    df.sort_values('eqtl_log10p', ascending=False, kind='stable')
    .drop_duplicates(['gene','variant_id'])
    .copy()  # own the data before overwriting a column
)
# 2) Per variant keep the gene with the smallest absolute TSS distance; ties are
#    broken by the larger eqtl_log10p.
at['tss_distance'] = at['tss_distance'].abs()
at = at.sort_values(['tss_distance','eqtl_log10p'], ascending=[True,False]).drop_duplicates(['variant_id'])

at = at[['gene','variant_id','slope']]

# Add a mirrored copy of every row: last two id fields swapped and the slope
# sign inverted, so a variant can be matched with either allele order.
flip = at.copy()
flip['slope'] = flip['slope']*-1
flip['variant_id'] = flip['variant_id'].map(_swap_alleles)

at = pd.concat([at, flip], ignore_index=True).drop_duplicates()
at.to_parquet('./eQTL/all_tissues_max_p_closest_gene.parquet')

#####

# Map the closest-gene eQTL table onto the variant identifiers used by each GWAS
# resource. Only the identifier columns of each resource are carried into the
# join and the key is named explicitly, so an unrelated column that happens to
# share a name with the eQTL table cannot silently become part of the merge key.
at = pd.read_parquet('./eQTL/all_tissues_max_p_closest_gene.parquet')

# FinnGen: ids are ':'-separated; a leading '23:' is rewritten to 'X:' and the
# separators become '_' to match the eQTL variant_id format.
fg = pd.read_csv('./Finngen/Annotations/common_var.tsv', sep='\t')
fg['variant_id'] = fg['id'].str.replace(r'^23:', 'X:', regex=True).str.replace(':', '_', regex=False)
fg = fg[['id','variant_id']].merge(at, on='variant_id')
fg[['id','gene','slope']].to_pickle('./eQTL/finngen_eqtl_closest_gene.pkl')

# PanUKBB: the conversion table's id_38 column is used as the join key and the
# id_19 column is what gets saved next to gene and slope.
ukb = pd.read_pickle('./PanUKBB/variant_conv.pkl')
ukb = ukb.rename({'id_38':'variant_id'},axis=1)
ukb = ukb[['id_19','variant_id']].merge(at, on='variant_id')
ukb[['id_19','gene','slope']].to_pickle('./eQTL/panukbb_eqtl_closest_gene.pkl')

# MVP: build chrom_pos_ref_alt ids from the individual columns.
# Raw string: '\s' is not a valid escape sequence in a normal string literal.
mvp = pd.read_csv('./MVP/unique_variants.txt', sep=r'\s+')
mvp['pos'] = mvp['pos'].astype(int)
# NOTE(review): unlike FinnGen above, no 23 -> X recoding is applied here -
# confirm how this file codes chromosome X.
mvp['variant_id'] = mvp['chrom'].astype(str) + '_' + mvp['pos'].astype(str) + '_' + mvp['ref'] + '_' + mvp['alt']
mvp = mvp[['variant_id']].merge(at, on='variant_id')
mvp = mvp[['variant_id','gene','slope']]
mvp.to_pickle('./eQTL/mvp_eqtl_closest_gene.pkl')

## Processing MVP

In [None]:
# For every MVP phenotype file: join its variants to the closest-gene eQTL table,
# label each variant by whether the GWAS effect and the eQTL slope have the same
# sign, and keep the strongest -log10(p) per gene and direction.
mvp = pd.read_pickle('./eQTL/mvp_eqtl_closest_gene.pkl')

# The per-phenotype files are written here; create the directory if needed.
os.makedirs('./MVP/eQTL', exist_ok=True)

for f in sorted(glob.glob('./MVP/Cleaned/*.txt')):
    # basename instead of splitting on '/' so the name is right on any OS.
    name = os.path.basename(f).split('.txt')[0]
    temp = pd.read_csv(f, sep=' ')
    temp['pos'] = temp['pos'].astype(int)
    temp['variant_id'] = temp['chrom'].astype(str) + '_' + temp['pos'].astype(str) + '_' + temp['ref'] + '_' + temp['alt']
    temp['pval'] = -np.log10(temp['pval'])
    temp = temp.merge(mvp, on='variant_id')
    # Only the sign of beta is used below: log10(or) is >= 0 exactly when or >= 1.
    temp['beta'] = np.log10(temp['or'])
    # 'same' when beta/slope >= 0; everything else, including a NaN ratio,
    # is labelled 'opposite'.
    temp['dir'] = np.where(temp['beta']/temp['slope'] >= 0, 'same', 'opposite')
    temp = temp.groupby(['gene','dir'],as_index=False)['pval'].max()
    temp = temp.pivot(index='gene', columns='dir', values='pval').reset_index()
    # Always emit both direction columns, even when one direction never occurs
    # for this phenotype, so every output file has the same schema.
    temp = temp.reindex(columns=['gene','opposite','same'])
    temp['pheno']=name
    temp.to_csv(f'./MVP/eQTL/{name}.eqtl_closest',sep='\t',index=False)

# Stack every per-phenotype file found in the output directory into one table.
closest_files = sorted(glob.glob('./MVP/eQTL/*.eqtl_closest'))
pd.concat([pd.read_csv(p, sep='\t') for p in closest_files], ignore_index=True).to_csv('./MVP/combined_eqtl_closest.tsv',sep='\t',index=False)


## Processing PanUKBB

In [None]:
# For every PanUKBB subset file: join its variants to the closest-gene eQTL table
# on `id`, label each variant by whether beta and the eQTL slope have the same
# sign, and keep the largest neglog10_pval per gene and direction.
ukb = pd.read_pickle('./eQTL/panukbb_eqtl_closest_gene.pkl').rename({'id_19':'id'},axis=1)

# The per-phenotype files are written here; create the directory if needed.
os.makedirs('./PanUKBB/eQTL', exist_ok=True)

for f in sorted(glob.glob('./PanUKBB/Subset/*.tsv')):
    # basename instead of splitting on '/' so the name is right on any OS.
    name = os.path.basename(f).split('.tsv')[0]
    # Drop the sex suffix from the phenotype name.
    name = name.replace('-both_sexes','').replace('-females','').replace('-males','')
    temp = pd.read_csv(f, sep='\t', usecols=['id','neglog10_pval','beta'])
    temp = temp.merge(ukb, on='id')
    # 'same' when beta/slope >= 0; everything else, including a NaN ratio,
    # is labelled 'opposite'.
    temp['dir'] = np.where(temp['beta']/temp['slope'] >= 0, 'same', 'opposite')
    temp = temp.groupby(['gene','dir'],as_index=False)['neglog10_pval'].max()
    temp = temp.pivot(index='gene', columns='dir', values='neglog10_pval').reset_index()
    # Always emit both direction columns, even when one direction never occurs
    # for this phenotype, so every output file has the same schema.
    temp = temp.reindex(columns=['gene','opposite','same'])
    temp['pheno']=name
    temp.to_csv(f'./PanUKBB/eQTL/{name}.eqtl_closest',sep='\t',index=False)

# Stack every per-phenotype file found in the output directory into one table.
closest_files = sorted(glob.glob('./PanUKBB/eQTL/*.eqtl_closest'))
pd.concat([pd.read_csv(p, sep='\t') for p in closest_files], ignore_index=True).to_csv('./PanUKBB/combined_eqtl_closest.tsv',sep='\t',index=False)


## Processing FinnGen

In [None]:
# For every FinnGen R12 .sig file: join its variants to the closest-gene eQTL
# table on `id`, label each variant by whether beta and the eQTL slope have the
# same sign, and keep the largest log10p per gene and direction.
fg = pd.read_pickle('./eQTL/finngen_eqtl_closest_gene.pkl')

# The per-phenotype files are written here; create the directory if needed.
os.makedirs('./Finngen/eQTL', exist_ok=True)

for f in sorted(glob.glob('./Finngen/Common/finngen_R12_*.sig')):
    # Phenotype code = text between the 'finngen_R12_' prefix and '.sig'.
    pheno = f.split('finngen_R12_')[1].split('.sig')[0]
    temp = pd.read_csv(f, sep='\t', usecols=['id','log10p','beta'])
    temp = temp.merge(fg, on='id')
    # 'same' when beta/slope >= 0; everything else, including a NaN ratio,
    # is labelled 'opposite'.
    temp['dir'] = np.where(temp['beta']/temp['slope'] >= 0, 'same', 'opposite')
    temp = temp.groupby(['gene','dir'],as_index=False)['log10p'].max()
    temp = temp.pivot(index='gene', columns='dir', values='log10p').reset_index()
    # Always emit both direction columns, even when one direction never occurs
    # for this phenotype, so every output file has the same schema.
    temp = temp.reindex(columns=['gene','opposite','same'])
    temp['pheno']=pheno
    temp.to_csv(f'./Finngen/eQTL/{pheno}.eqtl_closest',sep='\t',index=False)

# Stack every per-phenotype file found in the output directory into one table.
closest_files = sorted(glob.glob('./Finngen/eQTL/*.eqtl_closest'))
pd.concat([pd.read_csv(p, sep='\t') for p in closest_files], ignore_index=True).to_csv('./Finngen/combined_eqtl_closest.tsv',sep='\t',index=False)
