In [None]:
#install pylift 
%pip install pyliftover
%pip install tqdm


In [None]:
#import pandas for data manipulation, pyliftover for lifting coordinates from build 38 to build 37, and tqdm for progress bars
import pandas as pd
from pyliftover import LiftOver
from tqdm import tqdm


In [None]:
#load the Illumina GSA 3.0 final report: tab-separated text, skipping the first 10 lines of the file
dna_df = pd.read_csv('DNA.txt', sep='\t', skiprows=10, low_memory=False)
#list the column names found in the final report
list(dna_df.columns)

In [None]:
#show (rows, columns) of the loaded report; the row count should match the number of
#markers targeted by the array (600K+ according to the original note - confirm against the manifest)
dna_df.shape

In [None]:
#preview the first rows of the final report to sanity-check the parsed columns
dna_df.head()

In [None]:
#load the locus-name to rsID conversion table (tab-separated); it can be downloaded from
#the GSA 3.0 support section of Illumina's Infinium downloads
rsid_cov_df = pd.read_csv('rsidcov.txt', sep='\t', low_memory=False)
#preview the first rows of the conversion table
rsid_cov_df.head(n=5)

In [None]:
#build the working frame that the 23andMe-style export is derived from.
#the 23andMe layout needs: SNP identifier, chromosome, build 37 position and genotype
#(allele 1 + allele 2, both reported on the plus strand); the strand column is kept for the liftover call
staging_columns = [
    'SNP Name',
    'Chr',
    'Position',
    'Allele1 - Plus',
    'Allele2 - Plus',
    'Plus/Minus Strand',
]
#.copy() detaches the selection from dna_df so columns can be added to it later
stagingdf = dna_df.loc[:, staging_columns].copy()
#preview the selected columns
stagingdf.head(n=5)

In [None]:

#verify (rows, columns) of the column subset taken from the final report
stagingdf.shape

In [None]:
#genotype = the two plus-strand alleles joined into a single string per marker
first_allele = stagingdf['Allele1 - Plus']
second_allele = stagingdf['Allele2 - Plus']
stagingdf['genotype'] = first_allele + second_allele

In [None]:
#preview the staging frame again, now including the derived genotype column
stagingdf.head()

In [None]:
#converter used to translate marker coordinates from the source build to the target build
source_build, target_build = 'hg38', 'hg19'
lo = LiftOver(source_build, target_build)

In [None]:
#start from an empty frame with the lifted-over layout; rows are added by the liftover step
columns = 'SNPname chromosome position genotype'.split()
df23me = pd.DataFrame(data=None, columns=columns)

In [None]:
#liftover outcome statistics: how many hg19 locations does each hg38 marker map to?
#pyliftover returns None when the source chromosome name is not recognised and otherwise a
#list of (chromosome, position, strand, score) tuples, one per candidate mapping. The number
#of mappings is therefore len(liftover_result); len(liftover_result[0]) is only the tuple
#width and would put every mapped marker in the same bucket.
max_bucket = 5
mapping_counts = [0] * (max_bucket + 1)   #index = number of mappings, last slot = more than four
nonecnt = 0                               #markers whose chromosome pyliftover did not recognise

#walk the three needed columns directly instead of re-indexing the frame for every row
marker_coords = zip(stagingdf['Chr'], stagingdf['Position'], stagingdf['Plus/Minus Strand'])
for chrom, pos, strand in tqdm(marker_coords, total=len(stagingdf), desc="Processing rows"):
    #pyliftover coordinates are 0-based; the report positions are assumed to be 1-based
    #(standard for Illumina manifests - confirm), hence the -1
    liftover_result = lo.convert_coordinate('chr' + str(chrom), pos - 1, strand)
    if liftover_result is None:
        nonecnt += 1
    else:
        mapping_counts[min(len(liftover_result), max_bucket)] += 1

zero, one, two, three, four, greaterfour = mapping_counts
#retained only so `stat` keeps its original eight-slot layout; an empty result list is
#already counted in `zero`, so this slot stays at 0
argzero = 0

#layout: [zero, one, two, three, four, greaterfour, nonecnt, argzero]
stat = [zero, one, two, three, four, greaterfour, nonecnt, argzero]

stat

In [None]:

#perform liftover from hg38 to hg19 for every marker in stagingdf.
#Rows are collected in a plain list and turned into a DataFrame once at the end: growing a
#frame with pd.concat inside the loop copies the whole frame on every iteration (quadratic
#for 600K+ markers), and appending to a pre-existing df23me duplicates rows when the cell is re-run.
lifted_records = []
marker_fields = zip(
    stagingdf['SNP Name'],
    stagingdf['Chr'],
    stagingdf['Position'],
    stagingdf['Plus/Minus Strand'],
    stagingdf['genotype'],
)
for snp_name, chrom, pos, strand, genotype in tqdm(marker_fields, total=len(stagingdf), desc="Processing rows"):
    #pyliftover works in 0-based coordinates; the report positions are assumed to be 1-based
    #(standard for Illumina manifests - confirm), so shift down before and back up after
    liftover_result = lo.convert_coordinate('chr' + str(chrom), pos - 1, strand)
    #None = chromosome not recognised, [] = no mapping; both are skipped
    if liftover_result:
        #only the first candidate mapping is used
        #NOTE(review): the returned strand (liftover_result[0][2]) is ignored; a marker that lands
        #on the opposite strand in hg19 would need its plus-strand alleles complemented - confirm
        hg19_chrom, hg19_pos = liftover_result[0][0], liftover_result[0][1]
        lifted_records.append({
            'SNPname': snp_name,
            'chromosome': hg19_chrom.replace("chr", ""),
            'position': hg19_pos + 1,
            'genotype': genotype,
        })

df23me = pd.DataFrame(lifted_records, columns=['SNPname', 'chromosome', 'position', 'genotype'])

In [None]:
#preview the lifted-over markers (SNPname, chromosome, position, genotype)
df23me.head()

In [None]:
#empty frame in the output layout (rsid, chromosome, position, genotype)
columns = 'rsid chromosome position genotype'.split()
final = pd.DataFrame(data=None, columns=columns)

In [None]:
#drop markers without a usable call: any genotype containing '-' (presumably the report's
#no-call symbol - confirm) or with no genotype at all.
#regex=False matches '-' literally; na=True flags missing genotypes for removal instead of
#leaving NaN in the mask, which would make `~mask` raise
mask = df23me['genotype'].str.contains('-', regex=False, na=True)
df23me = df23me[~mask]

In [None]:
#rsid conversion: emit one output row per rsID listed for each marker.
#Done as a single join + explode instead of scanning the whole conversion table for every
#marker and growing `final` with pd.concat in a loop (both quadratic for 600K+ markers).
#Markers that are absent from the conversion table, or whose RsID is missing, are dropped
#rather than raising.

#one lookup row per marker name; the first occurrence wins, as with the former `.iloc[0]`
rsid_lookup = rsid_cov_df[['Name', 'RsID']].drop_duplicates(subset='Name', keep='first')
rsid_lookup = rsid_lookup.dropna(subset=['RsID'])

#a left join keeps the marker order of df23me; unmatched markers get a NaN RsID and are removed
with_rsid = df23me.merge(rsid_lookup, how='left', left_on='SNPname', right_on='Name')
with_rsid = with_rsid[with_rsid['RsID'].notna()]

#a marker may list several comma-separated rsIDs: split them and give each its own row
with_rsid = with_rsid.assign(rsid=with_rsid['RsID'].astype(str).str.split(','))
final = (
    with_rsid.explode('rsid')
    .loc[:, ['rsid', 'chromosome', 'position', 'genotype']]
    .reset_index(drop=True)
)


In [None]:
#preview the first 20 rows of the table that will be exported
final.head(20)

In [None]:
#write the result as tab-separated text, leaving out the pandas index column
output_path = '23andme_format.csv'
final.to_csv(output_path, sep='\t', index=False)