In [117]:
from mygene import MyGeneInfo
import pandas as pd
# Root directory of the shared UKB data. Note the trailing slash: the variant-map
# path later in this notebook is built as f"{DATA_DIR}/bgen/...", which therefore
# contains a doubled "/".
DATA_DIR = "/orcd/pool/003/dbertsim_shared/ukb/"

## Protein to BED

In [100]:
# Protein names taken from the top-300 proteomics features of one outcome.
# NOTE(review): the following cell rebuilds `proteins` from every olink column,
# so on a top-to-bottom run this list is overwritten before it is used.
outcome = "prostate_cancer"

feats = pd.read_csv(f"features/{outcome}/top300_features_proteomics.csv")
# Strip the 'olink_' marker from each feature name.
proteins = [feature.replace('olink_', '') for feature in feats['Feature']]

In [121]:
# write for all proteins
# Only the column names are needed to list the olink proteins, so read just the
# header row (nrows=0) instead of loading the whole table into memory.
cancer_test_header = pd.read_csv(
    '/orcd/pool/003/dbertsim_shared/ukb/ukb_cancer_test.csv',
    nrows=0,
)
# Every column whose name contains "olink", then the same names with the
# 'olink_' marker removed.
olink_cols = [c for c in cancer_test_header.columns if "olink" in c]
proteins = [x.replace('olink_', '') for x in olink_cols]

  df  = pd.read_csv('/orcd/pool/003/dbertsim_shared/ukb/ukb_cancer_test.csv')


In [122]:
# -----------------------------
# 1. INPUT: HUGO / HGNC symbols
# -----------------------------
hugo_genes = proteins

# Chromosomes to keep, written without the "chr" prefix so they can be
# compared directly against the `chr` column built from the query results.
valid_chrs = ['18']
# valid_chrs = [str(i) for i in range(1, 23)] + ["X", "Y", "MT", "M"]

mg = MyGeneInfo()

# -----------------------------
# 2. Query mygene.info
#    - scopes='symbol' : we query by gene symbol
#    - fields: symbol plus genomic coordinates (strand is not requested)
# -----------------------------
res = mg.querymany(
    hugo_genes,
    scopes="symbol",
    fields="symbol,genomic_pos.chr,genomic_pos.start,genomic_pos.end",
    species="human"
)

# Convert to DataFrame for easier manipulation and keep only the hits that
# carry a genomic position. `.copy()` gives an independent frame to modify.
df = pd.DataFrame(res)
df = df[df["genomic_pos"].notnull()].copy()


def _first(x):
    """Return the values of a non-empty dict as a list; pass anything else through.

    NOTE(review): despite its name this does not select a "first" element, and
    nothing in this cell calls it. It is kept unchanged in case another cell
    relies on it.
    """
    if isinstance(x, dict) and len(x) > 0:
        return list(x.values())
    return x


# `genomic_pos` may hold a single value or a list of them. Wrap the single case
# in a list so explode() produces exactly one row per position.
df["genomic_pos"] = df["genomic_pos"].apply(
    lambda x: x if isinstance(x, list) else [x]
)
df = df.explode("genomic_pos").reset_index(drop=True)
# Expand each position record into its own columns (chr / start / end are used
# below). The index was reset above so that the two frames line up row by row
# in the concat.
pos_df = pd.json_normalize(df['genomic_pos'])
df = pd.concat([df.drop(columns=["genomic_pos"]), pos_df], axis=1)

print(f"Number of genes before selecting for chromosomes: {len(df)}")
df = df[df["chr"].isin(valid_chrs)]
print(f"Number of genes after selecting for chromosomes: {len(df)}")

# -----------------------------
# 3. Build BED DataFrame (0-based start)
# -----------------------------
bed = pd.DataFrame({
    "chr": "chr" + df["chr"].astype(str),
    "start": df["start"].astype(int) - 1,  # 0-based
    "end": df["end"].astype(int),
    "name": df["symbol"],
    # "score": 0,
    # "strand": df["strand"]
})

# Drop rows that are exact duplicates (same chr, start, end and name). A symbol
# that maps to several distinct positions keeps one row per position.
print(f"Number of genes before dropping duplicates: {len(bed)}")
bed = bed.drop_duplicates()
print(f"Number of genes after dropping duplicates: {len(bed)}")


# -----------------------------
# 4. Write BED file
# -----------------------------
# out_file = f"hugo_genes_GRCh38_genes_{outcome}.bed"
# Derive the chromosome tag from `valid_chrs` so the file name always matches
# the filter that produced it (['18'] -> "..._ch18.bed").
chr_tag = "_".join(valid_chrs)
out_file = f"features/hugo_genes_GRCh38_ch{chr_tag}.bed"
bed.to_csv(out_file, sep="\t", header=False, index=False)

print("BED file written to:", out_file)


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
1 input query terms found dup hits:	[('siglec5', 2)]
45 input query terms found no hit:	['amy1a_amy1b_amy1c', 'anp32c', 'arntl', 'bap18', 'bola2_bola2b', 'btnl10', 'c7orf50', 'cenpj', 'cer


Number of genes before selecting for chromosomes: 3480
Number of genes after selecting for chromosomes: 34
Number of genes before dropping duplicates: 34
Number of genes after dropping duplicates: 34
BED file written to: features/hugo_genes_GRCh38_ch18.bed


In [115]:
# Rows per chromosome in the current `bed` table.
bed["chr"].value_counts()

chr1     330
chr19    230
chr2     197
chr11    178
chr17    168
chr12    152
chr6     149
chr3     135
chr7     128
chr4     127
chr5     127
chr9     119
chr16    115
chr10    110
chr8     102
chr14     91
chrX      88
chr20     77
chr22     74
chr15     73
chr13     47
chr18     34
chr21     29
chrY       3
Name: chr, dtype: int64

In [111]:
# Number of `bed` rows located on chr18.
is_chr18 = bed['chr'] == 'chr18'
len(bed[is_chr18])

34

In [102]:
# NOTE(review): this only echoes the current `bed`; the label records which
# feature set was presumably loaded when the cell was last run.
bed # prostate

Unnamed: 0,chr,start,end,name
145,chr18,31376776,31414912,DSG4
181,chr18,45983535,46072296,PSTPIP2
227,chr18,316736,500722,COLEC12
240,chr18,32185068,32220404,MEP1B
327,chr18,74534399,74587212,CNDP1


In [99]:
# NOTE(review): this only echoes the current `bed`; the label records which
# feature set was presumably loaded when the cell was last run.
bed # breast

Unnamed: 0,chr,start,end,name
29,chr18,721587,813276,YES1
120,chr18,32185068,32220404,MEP1B
162,chr18,63970028,64019779,SERPINB8


In [96]:
# NOTE(review): this only echoes the current `bed`; the label records which
# feature set was presumably loaded when the cell was last run.
bed # lung

Unnamed: 0,chr,start,end,name
156,chr18,31497623,31551351,DSG2
243,chr18,52340196,53535899,DCC
332,chr18,7566781,8406861,PTPRM


## Intersect Protein and SNPs

In [114]:
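# Run locally (requires bedtools) to intersect the SNP intervals with the gene intervals.
# The awk step keeps the first four SNP columns plus column 8, which is the gene name
# assuming SNPs.bed has four columns (chr, start, end, alleles).

# bedtools intersect -a SNPs.bed -b protein.bed -wa -wb | awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4,$8}' > SNP_gene_overlaps.bed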

## SNPs to RSID

In [151]:
# NOTE(review): `outcome` is "prostate" here but "prostate_cancer" in the
# protein-to-BED section; the "_cancer" suffix is added in the file name below.
outcome = "prostate"

# Headerless, tab-separated overlap table; column names are supplied here.
df = pd.read_csv(
    f"bed/SNP_gene_overlaps_{outcome}_cancer.bed",
    sep="\t",
    header=None,
    names=["chr", "start", "end", "alleles", "gene"],
)

In [152]:
# Break an allele string such as "C>T" into its reference and alternate parts.
df[["ref", "alt"]] = df["alleles"].str.split(">", expand=True)

# Assemble the variant ID as "<chromosome without 'chr'>:<end>:<ref>:<alt>",
# using the interval end as the 1-based position.
chrom_label = df["chr"].str.replace("chr", "", regex=False)
position_label = df["end"].astype(str)
df["rsid"] = chrom_label + ":" + position_label + ":" + df["ref"] + ":" + df["alt"]

# Keep only the variant-map rows whose ID appears among the overlaps, then save them.
variant_map = pd.read_csv(f"{DATA_DIR}/bgen/ch18/c18_b0_v1_variants.csv")
in_overlaps = variant_map['rsid'].isin(df['rsid'])
variant_map_filtered = variant_map.loc[in_overlaps]
variant_map_filtered.to_csv(f"bed/variants_overlaps_{outcome}_cancer.csv", index=False)

In [153]:
# Display the variant-map rows that matched an SNP/gene overlap.
variant_map_filtered

Unnamed: 0,chrom,pos,rsid,allele_ids,variant_idx
3292,18,319951,18:319951:T:G,"T,G",3292
3293,18,319956,18:319956:C:T,"C,T",3293
3294,18,319970,18:319970:G:A,"G,A",3294
3295,18,319970,18:319970:G:T,"G,T",3295
3296,18,319973,18:319973:G:A,"G,A",3296
...,...,...,...,...,...
395762,18,74584653,18:74584653:T:C,"T,C",395762
395763,18,74584655,18:74584655:T:G,"T,G",395763
395764,18,74584656,18:74584656:G:A,"G,A",395764
395765,18,74584658,18:74584658:G:A,"G,A",395765
