In [1]:
import pandas as pd

In [2]:
# Read the LLT table (llt.asc); names taken from the description file.
# The file is "$"-delimited and has no header row, so column names are
# supplied explicitly. dtype=str keeps every code as text.
llt_columns = [
    "llt_code",
    "llt_name",
    "pt_code",
    "whoart_code",
    "harts_code",
    "costart_sym",
    "icd9_code",
    "icd9cm_code",
    "icd10_code",
    "jart_code",
    "current_flag",
    "llt_rec_code",
]

llt_df = (
    pd.read_csv(
        "MedDRA_28_0_ENglish/MedAscii/llt.asc",
        sep="$",
        header=None,
        names=llt_columns,
        dtype=str,
        encoding="latin1",
    )
    # Only the LLT code, its name and its PT code are used downstream.
    .loc[:, ["llt_code", "llt_name", "pt_code"]]
    # Discard rows missing any of those three values.
    .dropna()
)

llt_df.head()

Unnamed: 0,llt_code,llt_name,pt_code
0,10000001,Ventilation pneumonitis,10081988
1,10000002,11-beta-hydroxylase deficiency,10000002
2,10000003,11-oxysteroid activity incr,10033315
3,10000004,11-oxysteroid activity increased,10033315
4,10000005,17 ketosteroids urine,10000005


In [3]:
# Read the SOC table (soc.asc): "$"-delimited, no header row, all values as text.
soc_columns = [
    "soc_code",
    "soc_name",
    "soc_abbrev",
    "soc_whoart_code",
    "soc_harts_code",
    "soc_costart_sym",
    "soc_icd9_code",
    "soc_icd9cm_code",
    "soc_icd10_code",
    "soc_jart_code",
    "trailer",
]

# Load and immediately narrow to the SOC code, name and abbreviation.
soc_df = pd.read_csv(
    "MedDRA_28_0_ENglish/MedAscii/soc.asc",
    sep="$",
    header=None,
    names=soc_columns,
    dtype=str,
    encoding="latin1",
)[["soc_code", "soc_name", "soc_abbrev"]]

soc_df.head()


Unnamed: 0,soc_code,soc_name,soc_abbrev
0,10005329,Blood and lymphatic system disorders,Blood
1,10007541,Cardiac disorders,Card
2,10010331,"Congenital, familial and genetic disorders",Cong
3,10013993,Ear and labyrinth disorders,Ear
4,10014698,Endocrine disorders,Endo


In [4]:
# Column names for the "$"-delimited hierarchy file mdhier.asc
# (PT -> HLT -> HLGT -> SOC codes and names, plus the primary-SOC flag).
mdh_cols = [
    "pt_code",
    "hlt_code",
    "hlgt_code",
    "soc_code",
    "pt_name",
    "hlt_name",
    "hlgt_name",
    "soc_name",
    "soc_abbrev",
    "null_field",
    "pt_soc_code",
    "primary_soc_fg",
    "trailer",
]

mdh = (
    pd.read_csv(
        "MedDRA_28_0_ENglish/MedAscii/mdhier.asc",
        sep="$",
        names=mdh_cols,
        dtype=str,
        encoding="latin1",
    )
    # Keep only the rows flagged as the primary SOC (primary_soc_fg == "Y");
    # rows with any other flag value, or a missing flag, are discarded.
    .loc[lambda hier: hier["primary_soc_fg"].eq("Y")]
)

mdh.head()

Unnamed: 0,pt_code,hlt_code,hlgt_code,soc_code,pt_name,hlt_name,hlgt_name,soc_name,soc_abbrev,null_field,pt_soc_code,primary_soc_fg,trailer
0,10002043,10002042,10002086,10005329,Anaemia folate deficiency,Anaemia deficiencies,Anaemias nonhaemolytic and marrow depression,Blood and lymphatic system disorders,Blood,,10005329,Y,
1,10002080,10002042,10002086,10005329,Anaemia vitamin B12 deficiency,Anaemia deficiencies,Anaemias nonhaemolytic and marrow depression,Blood and lymphatic system disorders,Blood,,10005329,Y,
2,10002081,10002042,10002086,10005329,Anaemia vitamin B6 deficiency,Anaemia deficiencies,Anaemias nonhaemolytic and marrow depression,Blood and lymphatic system disorders,Blood,,10005329,Y,
3,10022972,10002042,10002086,10005329,Iron deficiency anaemia,Anaemia deficiencies,Anaemias nonhaemolytic and marrow depression,Blood and lymphatic system disorders,Blood,,10005329,Y,
4,10034695,10002042,10002086,10005329,Pernicious anaemia,Anaemia deficiencies,Anaemias nonhaemolytic and marrow depression,Blood and lymphatic system disorders,Blood,,10005329,Y,


In [5]:
# Attach the hierarchy row (HLT / HLGT / SOC) to every LLT through its PT code.
# how="left" keeps every LLT row; an LLT whose pt_code has no match in mdh
# gets NaN in the hierarchy columns.
# validate="many_to_one" makes pandas raise MergeError if pt_code is not unique
# in mdh, instead of silently duplicating LLT rows (duplicates would later be
# collapsed without warning when the frame is turned into a dict keyed by llt_code).
llt_soc = llt_df.merge(mdh, on="pt_code", how="left", validate="many_to_one")

# Join the SOC table on soc_code. Both sides carry soc_name and soc_abbrev, so
# pandas suffixes them: *_x comes from mdh, *_y comes from soc_df.
# The same cardinality guard ensures soc_code is unique in soc_df.
final_df = llt_soc.merge(soc_df, on="soc_code", how="left", validate="many_to_one")


In [58]:
# Display the merged LLT -> PT -> HLT -> HLGT -> SOC table as the cell output.
final_df

Unnamed: 0,llt_code,llt_name,pt_code,hlt_code,hlgt_code,soc_code,pt_name,hlt_name,hlgt_name,soc_name_x,soc_abbrev_x,null_field,pt_soc_code,primary_soc_fg,trailer,soc_name_y,soc_abbrev_y
0,10000001,Ventilation pneumonitis,10081988,10024972,10024967,10038738,Hypersensitivity pneumonitis,Lower respiratory tract inflammatory and immun...,Lower respiratory tract disorders (excl obstru...,"Respiratory, thoracic and mediastinal disorders",Resp,,10038738,Y,,"Respiratory, thoracic and mediastinal disorders",Resp
1,10000002,11-beta-hydroxylase deficiency,10000002,10021608,10027424,10010331,11-beta-hydroxylase deficiency,Inborn errors of steroid synthesis,Metabolic and nutritional disorders congenital,"Congenital, familial and genetic disorders",Cong,,10010331,Y,,"Congenital, familial and genetic disorders",Cong
2,10000003,11-oxysteroid activity incr,10033315,10001339,10014706,10022891,Oxycorticosteroids increased,Adrenal cortex tests,Endocrine investigations (incl sex hormones),Investigations,Inv,,10022891,Y,,Investigations,Inv
3,10000004,11-oxysteroid activity increased,10033315,10001339,10014706,10022891,Oxycorticosteroids increased,Adrenal cortex tests,Endocrine investigations (incl sex hormones),Investigations,Inv,,10022891,Y,,Investigations,Inv
4,10000005,17 ketosteroids urine,10000005,10038589,10014706,10022891,17 ketosteroids urine,Reproductive hormone analyses,Endocrine investigations (incl sex hormones),Investigations,Inv,,10022891,Y,,Investigations,Inv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89769,10092088,Out of specification result not investigated,10092086,10091809,10077537,10077536,Improper management of out of specification re...,Quality system issues,"Product quality, supply, distribution, manufac...",Product issues,Prod,,10077536,Y,,Product issues,Prod
89770,10092089,Missing batch production record,10092085,10091809,10077537,10077536,Inappropriate batch production records,Quality system issues,"Product quality, supply, distribution, manufac...",Product issues,Prod,,10077536,Y,,Product issues,Prod
89771,10092090,Out of specification result invalidated withou...,10092086,10091809,10077537,10077536,Improper management of out of specification re...,Quality system issues,"Product quality, supply, distribution, manufac...",Product issues,Prod,,10077536,Y,,Product issues,Prod
89772,10092091,Out of specification result invalidated withou...,10092086,10091809,10077537,10077536,Improper management of out of specification re...,Quality system issues,"Product quality, supply, distribution, manufac...",Product issues,Prod,,10077536,Y,,Product issues,Prod


In [57]:
# Write the merged table to CSV; index=False leaves the row index out of the file.
final_df.to_csv(path_or_buf="llt_soc.csv", index=False)

In [9]:
# Build a plain dict mapping each LLT code to its SOC code.
# Rows are paired in frame order; if an llt_code occurred more than once the
# last occurrence would win. LLTs left unmatched by the earlier left joins
# carry NaN as their value.
llt_to_soc = dict(zip(final_df["llt_code"], final_df["soc_code"]))
llt_to_soc

{'10000001': '10038738',
 '10000002': '10010331',
 '10000003': '10022891',
 '10000004': '10022891',
 '10000005': '10022891',
 '10000006': '10022891',
 '10000007': '10022891',
 '10000008': '10022891',
 '10000009': '10022891',
 '10000010': '10022891',
 '10000011': '10022891',
 '10000012': '10022891',
 '10000013': '10010331',
 '10000014': '10010331',
 '10000015': '10022891',
 '10000016': '10022891',
 '10000017': '10022891',
 '10000018': '10007541',
 '10000019': '10038604',
 '10000020': '10010331',
 '10000021': '10010331',
 '10000022': '10007541',
 '10000023': '10038604',
 '10000024': '10022891',
 '10000025': '10022891',
 '10000026': '10022891',
 '10000027': '10038604',
 '10000028': '10022891',
 '10000029': '10010331',
 '10000030': '10042613',
 '10000031': '10029104',
 '10000033': '10027433',
 '10000034': '10007541',
 '10000035': '10010331',
 '10000036': '10038738',
 '10000037': '10022891',
 '10000038': '10022891',
 '10000039': '10017947',
 '10000040': '10017947',
 '10000041': '10017947',


In [8]:
import pickle

# Serialize the LLT -> SOC dictionary to "llt_to_soc.pkl" (binary mode).
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code embedded in a crafted file.
with open("llt_to_soc.pkl", "wb") as pkl_file:
    pickle.dump(llt_to_soc, pkl_file)