In [1]:
# Removing triallelic SNPs
# (If one allele at a triallelic position is significant, keep it and remove the other one. If neither is significant, remove both.)

import pandas as pd 

# import the GWAS imputed data

# Column names assigned to the GWAS files, in file order.
mycols = [
    'chrom', 'position', 'ref_allele', 'effect_allele',
    'minor_allele', 'minor_AF', 'expected_case_minor_AC',
    'low_confidence_variant', 'n_complete_samples',
    'AC', 'ytx', 'beta', 'se', 'tstat', 'pval',
]

# One DataFrame per input file, keyed by the integer (1-22) used in the file name.
df_imputed = {}

for chrom in range(1, 23):
    gwas_path = f"{chrom}_fullGWAS.tsv"
    # The separator regex splits on tabs AND colons (presumably so that a
    # colon-joined variant ID is split into the leading columns -- confirm
    # against the input files). A regex separator requires the python engine.
    df_imputed[chrom] = pd.read_csv(gwas_path, sep=r"[\t:]", names=mycols, engine='python')


In [2]:
# Sanity check: preview the first rows of entry 1, then list the loaded keys
print(df_imputed[1].head(), df_imputed.keys(), sep="\n")

   chrom  position ref_allele effect_allele minor_allele  minor_AF  \
0      1     69487          G             A            A  0.000006   
1      1     69569          T             C            C  0.000188   
2      1    139853          C             T            T  0.000006   
3      1    693731          A             G            G  0.115824   
4      1    707522          G             C            C  0.097281   

   expected_case_minor_AC  low_confidence_variant  n_complete_samples  \
0                 1.11928                    True              360420   
1                36.54850                    True              360420   
2                 1.10237                    True              360420   
3             22502.00000                   False              360420   
4             18899.50000                   False              360420   

            AC          ytx      beta        se     tstat      pval  
0      4.15294      1.01176 -0.089465  0.215551 -0.415051  0.678105  


In [3]:
# Collect every row whose position occurs more than once within its table.
# keep=False flags all members of a duplicated group, not just the repeats.

duplicates_df = {}

for i in range(1, 23):
    chrom_df = df_imputed[i]
    shares_position = chrom_df.duplicated(subset=['position'], keep=False)
    duplicates_df[i] = chrom_df[shares_position]
    

In [4]:
# put the duplicates in an array

import numpy as np

# Positions of the duplicated rows for each key, as a 2-D (n, 1) NumPy array
# (the list-style column selection keeps the result two-dimensional).
array = {
    i: duplicates_df[i].loc[:, ["position"]].to_numpy()
    for i in range(1, 23)
}


In [8]:
array[1][3, :]  # fourth row of array[1]; array[i] holds the duplicated positions in chromosome i

array([1186665])

In [7]:
# Preview the first five duplicated-position rows for chromosome 1
duplicates_df[1].head(5)

Unnamed: 0,chrom,position,ref_allele,effect_allele,minor_allele,minor_AF,expected_case_minor_AC,low_confidence_variant,n_complete_samples,AC,ytx,beta,se,tstat,pval
634,1,901922,G,A,A,0.000357,69.3343,True,360420,257.255,67.851,0.00152,0.02762,0.055029,0.956115
635,1,901922,G,C,C,0.004481,870.639,False,360420,3230.38,879.949,0.000263,0.00762,0.034551,0.972438
2363,1,1186665,G,A,A,0.123826,24056.7,False,360420,89259.0,24129.0,0.001031,0.001545,0.666929,0.504818
2364,1,1186665,G,T,T,0.010815,2101.17,False,360420,7796.09,2114.47,0.004348,0.005223,0.832427,0.405168
5091,1,1885615,A,C,C,0.001063,206.464,False,360420,766.055,207.09,0.003304,0.016612,0.198905,0.842337


In [9]:
# Rows at a duplicated position are retained only when their p-value is
# strictly below this threshold.
PVAL_KEEP_THRESHOLD = 0.00001

# Filtered tables, keyed like df_imputed (1-22).
cleaned_df = {}

for i in range(1, 23):
    chrom_df = df_imputed[i]

    # True for every row whose position appears more than once in this table.
    # This is the same set of rows whose position is contained in array[i],
    # but computed in one vectorised pass instead of scanning a NumPy array
    # once per row inside DataFrame.apply (which is quadratic on ~1M rows and
    # also fails on an empty table).
    at_duplicated_position = chrom_df.duplicated(subset=["position"], keep=False)
    is_significant = chrom_df["pval"] < PVAL_KEEP_THRESHOLD

    # Keep rows at unique positions, plus significant rows at duplicated positions.
    # NOTE: if several rows sharing a position are all significant, all of them
    # are kept, so duplicates can remain -- the following cell checks for that.
    cleaned_df[i] = chrom_df[~at_duplicated_position | is_significant]
                          

In [34]:
# Verify the cleaning: show any rows whose position still occurs more than once
for i in range(1, 23):
    chrom_clean = cleaned_df[i]
    still_duplicated = chrom_clean.duplicated(subset=["position"], keep=False)
    print(chrom_clean[still_duplicated].head())

Empty DataFrame
Columns: [chrom, position, ref_allele, effect_allele, minor_allele, minor_AF, expected_case_minor_AC, low_confidence_variant, n_complete_samples, AC, ytx, beta, se, tstat, pval]
Index: []
Empty DataFrame
Columns: [chrom, position, ref_allele, effect_allele, minor_allele, minor_AF, expected_case_minor_AC, low_confidence_variant, n_complete_samples, AC, ytx, beta, se, tstat, pval]
Index: []
Empty DataFrame
Columns: [chrom, position, ref_allele, effect_allele, minor_allele, minor_AF, expected_case_minor_AC, low_confidence_variant, n_complete_samples, AC, ytx, beta, se, tstat, pval]
Index: []
Empty DataFrame
Columns: [chrom, position, ref_allele, effect_allele, minor_allele, minor_AF, expected_case_minor_AC, low_confidence_variant, n_complete_samples, AC, ytx, beta, se, tstat, pval]
Index: []
Empty DataFrame
Columns: [chrom, position, ref_allele, effect_allele, minor_allele, minor_AF, expected_case_minor_AC, low_confidence_variant, n_complete_samples, AC, ytx, beta, se, tst

In [35]:
# Report the (rows, columns) shape of each cleaned table

for i in range(1, 23):
    print(cleaned_df[i].shape)

# Number of duplicated-position rows that survived the filter:
# rows kept minus the rows that were never at a duplicated position.
for i in range(1, 23):
    n_never_duplicated = len(df_imputed[i]) - len(duplicates_df[i])
    print(len(cleaned_df[i]) - n_never_duplicated)

(948317, 15)
(1030243, 15)
(870283, 15)
(878477, 15)
(792479, 15)
(807681, 15)
(711559, 15)
(679865, 15)
(530103, 15)
(615661, 15)
(608395, 15)
(572274, 15)
(432288, 15)
(389047, 15)
(342441, 15)
(382762, 15)
(327510, 15)
(339309, 15)
(278584, 15)
(269395, 15)
(161590, 15)
(162893, 15)
2
4
1
2
0
6
0
4
0
5
7
3
2
2
3
1
1
0
0
0
0
0


In [37]:
# Save each cleaned table as a tab-separated file.
# The DataFrame index is written as the first column (to_csv default).

for i in range(1, 23):
    out_name = f"{i}allcleaned.tsv"
    cleaned_df[i].to_csv(out_name, sep="\t")

In [None]:
%%bash
### Make each cleaned file into a VEP input file.
# Input columns of ${X}allcleaned.tsv (as written by to_csv with its index):
#   $1 = row index, $2 = chrom, $3 = position, $4 = ref_allele, $5 = effect_allele
# Output columns: chrom, start, end (same as start), ref/effect allele pair, strand "+".

for X in {1..22}
do
    # NR > 1 skips the header row written by to_csv, which would otherwise be
    # turned into a bogus variant line. ">" (not ">>") overwrites the output,
    # so re-running the cell does not append duplicate lines.
    awk 'BEGIN{FS="\t"; OFS="\t"} NR > 1 {print $2, $3, $3, $4"/"$5, "+"}' "${X}allcleaned.tsv" > "${X}_VEP1.tsv"
done
 