# Mendelian non-syndromic and syndromic hearing loss genes contribute to presbycusis

## Description:

Create tables of the significant associations from the single-variant and the rare-variant aggregate summary statistics.

## Create tables for rare-variant aggregate summary statistics

In [1]:
import pandas as pd

In [2]:
# Read the four per-phenotype aggregate regenie result files.
# All columns are loaded as text, the same way the single-variant files are
# read further down. The downstream filter drops rows whose CHROM field
# contains the text 'CHROM' (i.e. header lines embedded in the data); with
# type inference such rows can leave columns with mixed int/str values, which
# breaks the `.str` filter and makes merge keys differ in type between frames.
# Numeric columns (LOG10P, CHROM, GENPOS) are converted explicitly later on.
agg_read_opts = dict(comment='#', sep=" ", dtype="str")
df1 = pd.read_csv('~/UKBiobank/RAP/results/autosomal/aggregate/RAP1/Haid_CADD_skato_0.005/f3393.regenie', **agg_read_opts)
df2 = pd.read_csv('~/UKBiobank/RAP/results/autosomal/aggregate/RAP1/Hdiff_CADD_skato_0.005/f2247.regenie', **agg_read_opts)
df3 = pd.read_csv('~/UKBiobank/RAP/results/autosomal/aggregate/RAP1/Hnoise_CADD_skato_0.005/f2257.regenie', **agg_read_opts)
df4 = pd.read_csv('~/UKBiobank/RAP/results/autosomal/aggregate/RAP1/Hboth_CADD_skato_0.005/f2247_f2257.regenie', **agg_read_opts)

In [3]:
# Keep the same subset of columns from every phenotype's result table.
agg_keep_cols = ['CHROM', 'GENPOS', 'ID', 'ALLELE1', 'A1FREQ', 'TEST', 'BETA', 'SE', 'LOG10P']
df1_selected = df1[agg_keep_cols]
df2_selected = df2[agg_keep_cols]
df3_selected = df3[agg_keep_cols]
df4_selected = df4[agg_keep_cols]

In [4]:
# Suffix the per-phenotype statistics (A1FREQ, BETA, SE, LOG10P) with the
# phenotype name so they stay distinguishable after merging; the identifying
# columns (CHROM, GENPOS, ID, ALLELE1, TEST) keep their names.
for frame, pheno in (
    (df1_selected, 'f3393'),
    (df2_selected, 'f2247'),
    (df3_selected, 'f2257'),
    (df4_selected, 'f2247_f2257'),
):
    frame.columns = [
        'CHROM', 'GENPOS', 'ID', 'ALLELE1', f'A1FREQ_{pheno}', 'TEST',
        f'BETA_{pheno}', f'SE_{pheno}', f'LOG10P_{pheno}',
    ]

In [5]:
# Outer-merge the four phenotype tables on the columns that identify a test
# result (CHROM, GENPOS, ID, ALLELE1 and TEST), so a row is kept even when it
# is present for only some of the phenotypes.
agg_merge_keys = ['CHROM', 'GENPOS', 'ID', 'ALLELE1', 'TEST']
merged_df = df1_selected.merge(df2_selected, on=agg_merge_keys, how='outer')
merged_df_1 = merged_df.merge(df3_selected, on=agg_merge_keys, how='outer')
merged_df_2 = merged_df_1.merge(df4_selected, on=agg_merge_keys, how='outer')

In [6]:
# Flag rows that are embedded header lines: their CHROM field contains the
# literal text 'CHROM'. CHROM is cast to text first so the check also works
# when the column was inferred as numeric or holds mixed / missing values
# (in which case `.str` would raise or yield NaN that `~` cannot invert).
mask = merged_df_2['CHROM'].astype(str).str.contains('CHROM', na=False)

# Drop those rows. Take an explicit copy because columns of merged_df_3 are
# overwritten and added afterwards; without it those assignments act on a
# filtered slice of merged_df_2 and raise SettingWithCopyWarning.
merged_df_3 = merged_df_2[~mask].copy()

In [None]:
# Convert each LOG10P column to numbers; values that cannot be parsed become NaN.
for log10p_col in ('LOG10P_f3393', 'LOG10P_f2247', 'LOG10P_f2257', 'LOG10P_f2247_f2257'):
    merged_df_3[log10p_col] = pd.to_numeric(merged_df_3[log10p_col], errors='coerce')

In [None]:
# Derive a P column for every phenotype as 10 ** (-LOG10P).
p_cols = ['P_f3393', 'P_f2247', 'P_f2257', 'P_f2247_f2257']
log10p_cols = ['LOG10P_f3393', 'LOG10P_f2247', 'LOG10P_f2257', 'LOG10P_f2247_f2257']
for p_col, log10p_col in zip(p_cols, log10p_cols):
    merged_df_3[p_col] = 10 ** -merged_df_3[log10p_col]

In [None]:
# Hold TEST as strings so it can be compared against test names.
merged_df_3['TEST'] = merged_df_3['TEST'].astype(str)

# Make sure the four P-value columns (P_f3393, P_f2247, P_f2257,
# P_f2247_f2257) are numeric; values that cannot be parsed become NaN.
for p_col in ('P_f3393', 'P_f2247', 'P_f2257', 'P_f2247_f2257'):
    merged_df_3[p_col] = pd.to_numeric(merged_df_3[p_col], errors='coerce')


In [13]:
# Rows of the 'ADD-SKATO' test whose P-value is <= 2.5e-06 for at least one
# of the four phenotypes (missing P-values never count as significant).
p_cols = ['P_f3393', 'P_f2247', 'P_f2257', 'P_f2247_f2257']
is_skato = merged_df_3['TEST'] == 'ADD-SKATO'
any_significant = merged_df_3[p_cols].le(2.5e-06).any(axis=1)
skato_result = merged_df_3.loc[is_skato & any_significant]

In [None]:
# Cast CHROM and GENPOS to integers so that sorting on them is numeric.
# skato_result comes from a boolean `.loc` filter, so assigning columns into
# it raises SettingWithCopyWarning; astype() with a column->dtype mapping
# returns a new frame instead of writing into the filtered one.
skato_result = skato_result.astype({'CHROM': int, 'GENPOS': int})

In [16]:
# Order the rows by chromosome, then by position (both ascending).
skato_sort_keys = ['CHROM', 'GENPOS']
skato_result = skato_result.sort_values(skato_sort_keys)

In [None]:
# Write one CSV per distinct ALLELE1 value; the value is embedded in the
# output file name and echoed once the file has been written.
for allele1_value, subset in skato_result.groupby('ALLELE1'):
    filename = f'~/UKBiobank/RAP/results/autosomal/aggregate/RAP1/ARHI_skato_CADD_{allele1_value}_rap1_output.csv'
    subset.to_csv(filename, index=False, header=True)
    print(f'Saved group with ALLELE1="{allele1_value}" to {filename}')


In [18]:
# Rows of the 'ADD' test whose P-value is <= 2.5e-06 for at least one of the
# four phenotypes (missing P-values never count as significant).
p_cols = ['P_f3393', 'P_f2247', 'P_f2257', 'P_f2247_f2257']
is_burden = merged_df_3['TEST'] == 'ADD'
any_significant = merged_df_3[p_cols].le(2.5e-06).any(axis=1)
burden_result = merged_df_3.loc[is_burden & any_significant]

## Look for those genes that are significant in SKAT-O but not in burden 

In [26]:
# 'ADD' test rows for the two Mask1 IDs listed below.
mask1_ids = ['SIX1.Mask1.0.005', 'TNRC6B.Mask1.0.005']
burden_pLoF = merged_df_3.loc[
    (merged_df_3['TEST'] == 'ADD') & merged_df_3['ID'].isin(mask1_ids)
]

In [28]:
# Write the selected Mask1 burden rows to CSV, with a header row and without
# the index column.
filename = '~/UKBiobank/RAP/results/autosomal/aggregate/RAP1/ARHI_burden_CADD_Mask1_0.005_rap1_output_siginSKATO.csv'
burden_pLoF.to_csv(path_or_buf=filename, header=True, index=False)

In [29]:
# 'ADD' test rows for the three Mask2 IDs listed below.
mask2_ids = ['MYO7A.Mask2.0.005', 'TMPRSS3.Mask2.0.005', 'TXNDC17.Mask2.0.005']
burden_mask2 = merged_df_3.loc[
    (merged_df_3['TEST'] == 'ADD') & merged_df_3['ID'].isin(mask2_ids)
]

In [31]:
# Write the selected Mask2 burden rows to CSV, with a header row and without
# the index column.
filename = '~/UKBiobank/RAP/results/autosomal/aggregate/RAP1/ARHI_burden_CADD_Mask2_0.005_rap1_output_siginSKATO.csv'
burden_mask2.to_csv(path_or_buf=filename, header=True, index=False)

In [None]:
# Cast CHROM and GENPOS to integers so that sorting on them is numeric.
# burden_result comes from a boolean `.loc` filter, so assigning columns into
# it raises SettingWithCopyWarning; astype() with a column->dtype mapping
# returns a new frame instead of writing into the filtered one.
burden_result = burden_result.astype({'CHROM': int, 'GENPOS': int})

In [20]:
# Order the rows by chromosome, then by position (both ascending).
burden_sort_keys = ['CHROM', 'GENPOS']
burden_result = burden_result.sort_values(burden_sort_keys)

In [None]:
# Write one CSV per distinct ALLELE1 value; the value is embedded in the
# output file name and echoed once the file has been written.
for allele, group in burden_result.groupby('ALLELE1'):
    # Use the '~'-relative results directory, like every other input and
    # output path in this notebook, rather than a user-specific absolute
    # home directory, so the cell also works for other accounts.
    filename = f'~/UKBiobank/RAP/results/autosomal/aggregate/RAP1/ARHI_burden_CADD_{allele}_rap1_output.csv'

    # Save the group to the CSV file (header row, no index column)
    group.to_csv(filename, index=False, header=True)

    print(f'Saved group with ALLELE1="{allele}" to {filename}')

## Create tables for single-variant analysis summary statistics

In [4]:
import pandas as pd

In [62]:
# Load the four per-phenotype single-variant regenie result files, reading
# every column as text.
sv_read_opts = dict(comment='#', sep=" ", dtype="str")
df1 = pd.read_csv('~/UKBiobank/RAP/results/autosomal/univariate/Haid_070824/ref_last/f3393.regenie', **sv_read_opts)
df2 = pd.read_csv('~/UKBiobank/RAP/results/autosomal/univariate/Hdiff_070824/ref_last/f2247.regenie', **sv_read_opts)
df3 = pd.read_csv('~/UKBiobank/RAP/results/autosomal/univariate/Hnoise_070824/ref_last/f2257.regenie', **sv_read_opts)
df4 = pd.read_csv('~/UKBiobank/RAP/results/autosomal/univariate/Hboth_070824/ref_last/f2247_f2257.regenie', **sv_read_opts)

In [86]:
# Keep the same subset of columns from every phenotype's result table.
sv_keep_cols = ['CHROM', 'GENPOS', 'ID', 'ALLELE0', 'ALLELE1', 'A1FREQ', 'BETA', 'SE', 'LOG10P']
df1_selected = df1[sv_keep_cols]
df2_selected = df2[sv_keep_cols]
df3_selected = df3[sv_keep_cols]
df4_selected = df4[sv_keep_cols]

In [87]:
# Suffix the per-phenotype statistics (A1FREQ, BETA, SE, LOG10P) with the
# phenotype name so they stay distinguishable after merging; the identifying
# columns (CHROM, GENPOS, ID, ALLELE0, ALLELE1) keep their names.
for frame, pheno in (
    (df1_selected, 'f3393'),
    (df2_selected, 'f2247'),
    (df3_selected, 'f2257'),
    (df4_selected, 'f2247_f2257'),
):
    frame.columns = [
        'CHROM', 'GENPOS', 'ID', 'ALLELE0', 'ALLELE1',
        f'A1FREQ_{pheno}', f'BETA_{pheno}', f'SE_{pheno}', f'LOG10P_{pheno}',
    ]

In [88]:
# Outer-merge the four phenotype tables on the columns that identify a
# variant (CHROM, GENPOS, ID, ALLELE0 and ALLELE1), so a variant is kept even
# when it is present for only some of the phenotypes.
sv_merge_keys = ['CHROM', 'GENPOS', 'ID', 'ALLELE0', 'ALLELE1']
merged_df = df1_selected.merge(df2_selected, on=sv_merge_keys, how='outer')
merged_df_1 = merged_df.merge(df3_selected, on=sv_merge_keys, how='outer')
merged_df_2 = merged_df_1.merge(df4_selected, on=sv_merge_keys, how='outer')

In [90]:
# Flag rows that are embedded header lines: their CHROM field contains the
# literal text 'CHROM'. na=False treats a missing CHROM as "not a header",
# keeping the mask purely boolean so that it can be inverted with `~`.
mask = merged_df_2['CHROM'].str.contains('CHROM', na=False)

# Drop those rows. Take an explicit copy because columns of merged_df_3 are
# overwritten and added afterwards; without it those assignments act on a
# filtered slice of merged_df_2 and raise SettingWithCopyWarning.
merged_df_3 = merged_df_2[~mask].copy()

In [None]:
# Convert each LOG10P column from text to numbers; values that cannot be
# parsed become NaN.
for log10p_col in ('LOG10P_f3393', 'LOG10P_f2247', 'LOG10P_f2257', 'LOG10P_f2247_f2257'):
    merged_df_3[log10p_col] = pd.to_numeric(merged_df_3[log10p_col], errors='coerce')

In [None]:
# Derive a P column for every phenotype as 10 ** (-LOG10P).
p_cols = ['P_f3393', 'P_f2247', 'P_f2257', 'P_f2247_f2257']
log10p_cols = ['LOG10P_f3393', 'LOG10P_f2247', 'LOG10P_f2257', 'LOG10P_f2247_f2257']
for p_col, log10p_col in zip(p_cols, log10p_cols):
    merged_df_3[p_col] = 10 ** -merged_df_3[log10p_col]

In [None]:
# Make sure the four P-value columns are numeric; values that cannot be
# parsed become NaN.
for p_col in ('P_f3393', 'P_f2247', 'P_f2257', 'P_f2247_f2257'):
    merged_df_3[p_col] = pd.to_numeric(merged_df_3[p_col], errors='coerce')

In [95]:
# Variants whose P-value is <= 5e-08 for at least one of the four phenotypes
# (missing P-values never count as significant).
p_cols = ['P_f3393', 'P_f2247', 'P_f2257', 'P_f2247_f2257']
any_significant = merged_df_3[p_cols].le(5e-08).any(axis=1)
univar_result = merged_df_3.loc[any_significant]

In [102]:
# Load the four per-phenotype annotated CSV files, reading every column as text.
anno_read_opts = dict(comment='#', sep=",", dtype="str")
df1_anno = pd.read_csv('~/UKBiobank/RAP/results/autosomal/univariate/Haid_070824/ref_last/Haid_pval5e-08_rap1.hg38.annotated.csv', **anno_read_opts)
df2_anno = pd.read_csv('~/UKBiobank/RAP/results/autosomal/univariate/Hdiff_070824/ref_last/Hdiff_pval5e-08_rap1.hg38.annotated.csv', **anno_read_opts)
df3_anno = pd.read_csv('~/UKBiobank/RAP/results/autosomal/univariate/Hnoise_070824/ref_last/Hnoise_pval5e-08_rap1.hg38.annotated.csv', **anno_read_opts)
df4_anno = pd.read_csv('~/UKBiobank/RAP/results/autosomal/univariate/Hboth_070824/ref_last/Hboth_pval5e-08_rap1.hg38.annotated.csv', **anno_read_opts)

In [126]:
# Relabel the annotated tables: variant identifiers first, then the
# phenotype-suffixed statistics (A1FREQ, BETA, SE, P), then the annotation
# columns, which carry the same labels in all four tables.
# NOTE(review): 'CADD_phred,gnomad312_AF_nfe' is a single label and
# 'CADD_phred' appears again further on; this looks like a missing quote
# between two names -- confirm against the annotated files' header before
# changing it, since the number of labels must match the number of columns.
anno_variant_cols = ['ID', 'CHROM', 'GENPOS', 'ALLELE0', 'ALLELE1']
anno_annotation_cols = [
    'Func.refGene', 'Gene.refGene', 'AAChange.refGene', 'ExonicFunc.refGene',
    'gwasCatalog', 'CADD_phred,gnomad312_AF_nfe', 'AF_nfe', 'avsnp150',
    'CADD_phred', 'CLNSIG', 'CLNDN', 'medelian_ARHL_gene',
]
for anno_frame, pheno in (
    (df1_anno, 'f3393'),
    (df2_anno, 'f2247'),
    (df3_anno, 'f2257'),
    (df4_anno, 'f2247_f2257'),
):
    anno_stat_cols = [f'A1FREQ_{pheno}', f'BETA_{pheno}', f'SE_{pheno}', f'P_{pheno}']
    anno_frame.columns = anno_variant_cols + anno_stat_cols + anno_annotation_cols

In [127]:
# Outer-merge the four annotated tables on every column that is not a
# per-phenotype statistic, so each variant ends up with a single set of
# annotation columns plus the suffixed statistics of each phenotype.
# The key labels must match the column labels of the annotation frames
# exactly -- including the single combined label
# 'CADD_phred,gnomad312_AF_nfe' -- because a key that is not a column of both
# frames makes merge raise KeyError, and a shared non-key column would be
# duplicated with _x/_y suffixes.
anno_merge_keys = [
    'ID', 'CHROM', 'GENPOS', 'ALLELE0', 'ALLELE1',
    'Func.refGene', 'Gene.refGene', 'AAChange.refGene', 'ExonicFunc.refGene',
    'gwasCatalog', 'CADD_phred,gnomad312_AF_nfe', 'AF_nfe', 'avsnp150',
    'CADD_phred', 'CLNSIG', 'CLNDN', 'medelian_ARHL_gene',
]
merged_anno = pd.merge(df1_anno, df2_anno, on=anno_merge_keys, how='outer')
merged_anno_1 = pd.merge(merged_anno, df3_anno, on=anno_merge_keys, how='outer')
merged_anno_2 = pd.merge(merged_anno_1, df4_anno, on=anno_merge_keys, how='outer')

In [129]:
# Keep only the variant identifiers and annotation columns; the
# per-phenotype statistics of the annotated tables are left out.
anno_final_cols = [
    'ID', 'CHROM', 'GENPOS', 'ALLELE0', 'ALLELE1',
    'Func.refGene', 'Gene.refGene', 'AAChange.refGene', 'ExonicFunc.refGene',
    'gwasCatalog', 'CADD_phred,gnomad312_AF_nfe', 'AF_nfe', 'avsnp150',
    'CADD_phred', 'CLNSIG', 'CLNDN', 'medelian_ARHL_gene',
]
anno_final = merged_anno_2.loc[:, anno_final_cols]

In [131]:
# Attach the annotations to the significant variants; this is an inner join,
# so only variants present in both tables are kept.
variant_keys = ['ID', 'CHROM', 'GENPOS', 'ALLELE0', 'ALLELE1']
final_df = univar_result.merge(anno_final, how='inner', on=variant_keys)

In [134]:
# Cast CHROM and GENPOS to integers so that sorting on them is numeric.
final_df = final_df.astype({'CHROM': int, 'GENPOS': int})

In [137]:
# Order the rows by chromosome, then by position (both ascending).
final_sort_keys = ['CHROM', 'GENPOS']
final_df = final_df.sort_values(final_sort_keys)

In [139]:
# Write the annotated single-variant table to CSV, with a header row and
# without the index column.
filename = '~/UKBiobank/RAP/results/autosomal/univariate/071224_ARHI_univariate_anno_rap1_output.csv'
final_df.to_csv(path_or_buf=filename, header=True, index=False)