This notebook details the parsing of the mouse gene set from Mouse Genome Informatics (MGI).

In [1]:
import pandas as pd

In [2]:
mouse_file = '../files/MGI_PhenoGenoMP_Jan16_2020.rpt'

In [3]:
# read in mouse file and add appropriate columns 
# see MGI_PhenoGenoMP.rpt in 
#http://www.informatics.jax.org/downloads/reports/index.html

mouse = pd.read_csv(mouse_file,header=None,sep='\t')
mouse.columns = ["Allelic_Composition", "AlleleSymbol",
                 "Genetic_Background","Mammalian_Phenotype_ID",
                 "PubMed_ID","MGI_Marker_Accession_ID"]
mouse.head()

Unnamed: 0,Allelic_Composition,AlleleSymbol,Genetic_Background,Mammalian_Phenotype_ID,PubMed_ID,MGI_Marker_Accession_ID
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874


In [79]:
len(mouse["MGI_Marker_Accession_ID"].unique())

TypeError: unhashable type: 'list'

In [4]:
#number of rows in table
len(mouse)

326735

In [5]:
#number of unique mammalian phenotype IDs
len(mouse["Mammalian_Phenotype_ID"].unique())

10302

In [6]:
# some rows have more than one MGI_Marker_Accession_ID
# explode columns

mouse["MGI_Marker_Accession_ID"] = mouse["MGI_Marker_Accession_ID"].apply(lambda x: x.split(","))
mouse_mgi_ids = mouse[["Mammalian_Phenotype_ID","MGI_Marker_Accession_ID"]].copy()
mouse_mgi_ids = mouse_mgi_ids.explode("MGI_Marker_Accession_ID")

In [7]:
len(mouse_mgi_ids)

412443

In [8]:
#from quick visualization, we see that the MGI_Marker_Accession_ID is not unique
# mouse_mgi_ids_parsed
mouse_mgi_ids = mouse_mgi_ids.drop_duplicates("MGI_Marker_Accession_ID")
mouse_mgi_ids["MGI_Marker_Accession_ID"].to_csv("MGI_ID_unique_Jan16_2020.txt",
                                                       index=None, header=False)
len(mouse_mgi_ids)

20719

In [19]:
# obtained table 
# http://www.informatics.jax.org/batch
mgi_batch_report = pd.read_csv("../files/MGIBatchReport_20200116_133654.txt",sep="\t")
print(len(mgi_batch_report))
mgi_batch_report_na = mgi_batch_report.loc[mgi_batch_report["MP ID"].isna()]
print(len(mgi_batch_report_na))
mgi_batch_report_na.head()


230895
1735


Unnamed: 0,Input,Input Type,MGI Gene/Marker ID,Symbol,Name,Feature Type,Ensembl ID,Entrez Gene ID,MP ID,Term
20358,MGI:1346096,MGI,MGI:1346096,Estq3,estradiol regulated response QTL 3,QTL,,112297.0,,
20359,MGI:1346095,MGI,MGI:1346095,Estq2,estradiol regulated response QTL 2,QTL,,109458.0,,
20657,MGI:1346081,MGI,MGI:1346081,Estq4,estradiol regulated response QTL 4,QTL,,112296.0,,
20658,MGI:1346097,MGI,MGI:1346097,Estq1,estradiol regulated response QTL 1,QTL,,109459.0,,
23712,MGI:96947,MGI,MGI:96947,Mdac,modifier of Dac,heritable phenotypic marker,,17238.0,,


In [66]:
mgi_batch_report_parsed = mgi_batch_report[["MP ID","Term", "MGI Gene/Marker ID"]].copy()
mgi_batch_report_parsed["Term"] = mgi_batch_report_parsed["Term"].str.replace(" ","_")
mgi_batch_report_parsed["pathway"] = mgi_batch_report_parsed["MP ID"]+"_"+mgi_batch_report_parsed["Term"]
mgi_batch_report_parsed = mgi_batch_report_parsed.sort_values("pathway")
mgi_batch_report_parsed.columns = ["mp","term","mgi", "geneset"]
mgi_batch_report_parsed["database"] = "MGI"
mgi_batch_report_parsed = mgi_batch_report_parsed[["database","geneset", "mgi", "mp"]]

mgi_batch_report_parsed = mgi_batch_report_parsed.dropna()
print(len(mgi_batch_report_parsed))
mgi_batch_report_parsed.head()

229160


Unnamed: 0,database,geneset,mgi,mp
83736,MGI,MP:0000003_abnormal_adipose_tissue_morphology,MGI:1349717,MP:0000003
103676,MGI,MP:0000003_abnormal_adipose_tissue_morphology,MGI:101884,MP:0000003
40413,MGI,MP:0000003_abnormal_adipose_tissue_morphology,MGI:895149,MP:0000003
16235,MGI,MP:0000003_abnormal_adipose_tissue_morphology,MGI:1099809,MP:0000003
188573,MGI,MP:0000003_abnormal_adipose_tissue_morphology,MGI:2139806,MP:0000003


In [67]:
human = pd.read_csv("../files/HMD_HumanPhenotype_Jan16_2020.rpt", 
                    sep="\t", header=None,
                    names=['gene','entrez','entrez_2','yes/no','mouse_gene','mgi','mp','other']
                   )
human = human.drop(["other", "mp"], axis=1)
human["mgi"] = human["mgi"].str.replace(" ", "")
human.head()

Unnamed: 0,gene,entrez,entrez_2,yes/no,mouse_gene,mgi
0,A1BG,1,11167.0,yes,A1bg,MGI:2152878
1,A1CF,29974,16363.0,yes,A1cf,MGI:1917115
2,A2M,2,37248.0,yes,A2m,MGI:2449119
3,A3GALT2,127550,16326.0,yes,A3galt2,MGI:2685279
4,A4GALT,53947,9690.0,yes,A4galt,MGI:3512453


In [68]:
human_mouse = pd.merge(human,mgi_batch_report_parsed,on=['mgi'])
human_mouse

Unnamed: 0,gene,entrez,entrez_2,yes/no,mouse_gene,mgi,database,geneset,mp
0,A1CF,29974,16363.0,yes,A1cf,MGI:1917115,MGI,MP:0000352_decreased_cell_proliferation,MP:0000352
1,A1CF,29974,16363.0,yes,A1cf,MGI:1917115,MGI,MP:0002795_dilated_cardiomyopathy,MP:0002795
2,A1CF,29974,16363.0,yes,A1cf,MGI:1917115,MGI,MP:0005140_decreased_cardiac_muscle_contractility,MP:0005140
3,A1CF,29974,16363.0,yes,A1cf,MGI:1917115,MGI,MP:0005329_abnormal_myocardium_layer_morphology,MP:0005329
4,A1CF,29974,16363.0,yes,A1cf,MGI:1917115,MGI,MP:0011094_embryonic_lethality_before_implanta...,MP:0011094
...,...,...,...,...,...,...,...,...,...
194859,ZZEF1,23140,9027.0,yes,Zzef1,MGI:2444286,MGI,MP:0010025_decreased_total_body_fat_amount,MP:0010025
194860,ZZEF1,23140,9027.0,yes,Zzef1,MGI:2444286,MGI,MP:0010053_decreased_grip_strength,MP:0010053
194861,ZZEF1,23140,9027.0,yes,Zzef1,MGI:2444286,MGI,MP:0011396_abnormal_sleep_behavior,MP:0011396
194862,ZZEF1,23140,9027.0,yes,Zzef1,MGI:2444286,MGI,MP:0030610_absent_teeth,MP:0030610


In [72]:
human_mouse = human_mouse[['gene','entrez','mgi','geneset']]
human_mouse.columns = ["gene_symbol", "EntrezID", "mgi", "geneset"]
human_mouse.head()

Unnamed: 0,gene_symbol,EntrezID,mgi,geneset
0,A1CF,29974,MGI:1917115,MP:0000352_decreased_cell_proliferation
1,A1CF,29974,MGI:1917115,MP:0002795_dilated_cardiomyopathy
2,A1CF,29974,MGI:1917115,MP:0005140_decreased_cardiac_muscle_contractility
3,A1CF,29974,MGI:1917115,MP:0005329_abnormal_myocardium_layer_morphology
4,A1CF,29974,MGI:1917115,MP:0011094_embryonic_lethality_before_implanta...


In [73]:
len(human_mouse)

194864

In [74]:
mapping_df = pd.read_csv("/data/Segre_Lab/scripts/GeneEnrich/data/entrez_ensembl_mapping_v26_16Jun_2022.txt", sep="\t")
mapping_df.head()

Unnamed: 0,EntrezID,ensembl_gene_id,gene_symbol
0,1,ENSG00000121410,A1BG
1,2,ENSG00000175899,A2M
2,3,ENSG00000256069,A2MP1
3,9,ENSG00000171428,NAT1
4,10,ENSG00000156006,NAT2


In [75]:
x = pd.merge(human_mouse, mapping_df, on=["EntrezID", "gene_symbol"])

In [76]:
x

Unnamed: 0,gene_symbol,EntrezID,mgi,geneset,ensembl_gene_id
0,A1CF,29974,MGI:1917115,MP:0000352_decreased_cell_proliferation,ENSG00000148584
1,A1CF,29974,MGI:1917115,MP:0002795_dilated_cardiomyopathy,ENSG00000148584
2,A1CF,29974,MGI:1917115,MP:0005140_decreased_cardiac_muscle_contractility,ENSG00000148584
3,A1CF,29974,MGI:1917115,MP:0005329_abnormal_myocardium_layer_morphology,ENSG00000148584
4,A1CF,29974,MGI:1917115,MP:0011094_embryonic_lethality_before_implanta...,ENSG00000148584
...,...,...,...,...,...
192341,ZZEF1,23140,MGI:2444286,MP:0010025_decreased_total_body_fat_amount,ENSG00000074755
192342,ZZEF1,23140,MGI:2444286,MP:0010053_decreased_grip_strength,ENSG00000074755
192343,ZZEF1,23140,MGI:2444286,MP:0011396_abnormal_sleep_behavior,ENSG00000074755
192344,ZZEF1,23140,MGI:2444286,MP:0030610_absent_teeth,ENSG00000074755


In [77]:
len(x["geneset"].unique())

9970

In [80]:
x = x.loc[x["mgi"].isin(mouse_mgi_ids["MGI_Marker_Accession_ID"])]

In [81]:
len(x)

192346

In [82]:
len(x["geneset"].unique())

9970