## GangSTR catalog creation 
Hope Tanudisastro | Jan 24, 2022

### Importing variant catalog file from Illumina

In [135]:
import json, csv, re
# Open the Illumina variant catalog (JSON) for reading; the handle is consumed
# by json.load in the next cell.
f = open('Illuminavariant_catalog.json')

In [136]:
# Parse the catalog into Python objects, then close the handle opened in the
# previous cell so it is not leaked for the lifetime of the kernel.
try:
    data = json.load(f)
finally:
    f.close()

### Extracting off-target loci, coordinates and motif

In [137]:
OFFTARGET_KEY = "OfftargetRegions"
NO_OFFTARGET = "b"  # spacer stored for loci that have no off-target regions

# One entry per locus record in `data`, kept in catalog order so the three
# lists stay index-aligned.
coordinates = [locus['ReferenceRegion'] for locus in data]
motif = [locus['LocusStructure'] for locus in data]
offtarget = [locus.get(OFFTARGET_KEY, NO_OFFTARGET) for locus in data]
       

Join each entry in `offtarget` into a single comma-separated string (the format accepted by GangSTR).

In [138]:
# Positions (into the per-locus lists) of loci that carry off-target regions.
off_target_index = []

for idx, regions in enumerate(offtarget):
    if regions == "b":  # spacer: this locus has no off-target regions
        continue
    # Join only while the entry is still a list of regions. Re-running this
    # cell would otherwise call ",".join on the already-joined string and
    # insert a comma between every character.
    if not isinstance(regions, str):
        offtarget[idx] = ",".join(regions)
    off_target_index.append(idx)

Check how many loci have complex (multiple) repeat structures, and print out their coordinates and motifs.

In [139]:
# Loci with several repeat structures store their reference regions as a list
# of strings; simple loci store a single string. Print the list-valued ones.
for region in coordinates:
    if isinstance(region, list):
        print(region)

['chr3:63912684-63912714', 'chr3:63912714-63912726']
['chr13:70139353-70139383', 'chr13:70139383-70139428']
['chr3:129172576-129172656', 'chr3:129172656-129172696', 'chr3:129172696-129172732']
['chr9:69037261-69037286', 'chr9:69037286-69037304']
['chr4:3074876-3074933', 'chr4:3074939-3074966']
['chr20:2652733-2652757', 'chr20:2652757-2652775']


In [140]:
# Show every locus structure that contains at least two parenthesised units.
multi_unit_structures = [structure for structure in motif if structure.count("(") >= 2]
for structure in multi_unit_structures:
    print(structure)

(GCA)*(GCC)+
(CTA)*(CTG)*
(CAGG)*(CAGA)*(CA)*
(A)*(GAA)*
(CAG)*CAACAG(CCG)*
(GGCCTG)*(CGCCTG)*


Extract indices of complex repeats

In [141]:
# Record the position of every list-valued (multi-structure) entry.
# enumerate() yields the true position of each entry; list.index() would
# return the first *equal* entry, which is wrong for duplicates and rescans
# the list for every match.
complex_repeats_index = [
    idx for idx, region in enumerate(coordinates) if isinstance(region, list)
]

In [142]:
# Display the positions of the complex loci; the clean-up cells below refer to
# specific indices from this list.
print(complex_repeats_index)

[7, 8, 12, 17, 20, 23]


### Convert complex/impure repeats into the pure repeat structure format accepted by GangSTR

GangSTR removes TRs within 50bp of another TR and does not accept sequence interruptions in its catalog. 

#### Clean up coordinates of complex repeats
For `coordinates[17]`, the second repeat structure was selected because the first repeat is a homopolymer; note that `coordinates[17]` is not formally listed in the GangSTR catalog. For `coordinates[8]`, the second repeat structure was selected because it is longer than the first repeat structure in the reference genome. For all other complex loci, the first repeat structure is kept.

In [143]:
# Complex loci for which the SECOND repeat structure is kept: index 8 (second
# structure is longer in the reference) and index 17 (first structure is a
# homopolymer). Every other complex locus keeps its first structure.
# These indices are positions in this particular catalog file.
SECOND_STRUCTURE_LOCI = {8, 17}

for k in complex_repeats_index:
    regions = coordinates[k]
    # Skip entries already collapsed to a string. Without this guard,
    # re-running the cell would index into the string and silently store a
    # single character instead of a region.
    if not isinstance(regions, list):
        continue
    coordinates[k] = regions[1] if k in SECOND_STRUCTURE_LOCI else regions[0]

#### Clean up motifs of complex repeats

In [144]:
# Indices whose motif is taken from the text after the first ")" rather than
# the text before it.
second_unit_loci = (8, 17)

for locus_idx in complex_repeats_index:
    structure = motif[locus_idx]
    if structure.count(")") < 2:
        continue  # fewer than two units: nothing to choose between
    # Split once at the first ")": head is the first unit (still carrying its
    # "("), tail is everything that follows.
    head, tail = structure.split(")", 1)
    motif[locus_idx] = tail if locus_idx in second_unit_loci else head

### Clean up and prepare for GangSTR catalog format 

#### Remove regular expression characters from motif definition

In [145]:
# Drop the structure metacharacters ( ) * + from every motif, updating the
# existing list in place.
motif[:] = [re.sub('[()*+]', '', structure) for structure in motif]

#### Create motif length attribute

In [146]:
# Length in characters of each cleaned motif, index-aligned with `motif`.
motif_length = [len(unit) for unit in motif]

#### Create separate chromosome and coordinate attributes

In [147]:
# Spot-check one of the collapsed complex loci: it should now be a single
# "chrom:start-end" string rather than a list.
print(coordinates[8])

chr13:70139383-70139428


In [148]:
# Break each "chrom:start-end" string into three index-aligned columns.
chromosome, coordinate_1, coordinate_2 = [], [], []

for region in coordinates:
    region_parts = region.split(":")
    chromosome.append(region_parts[0])
    bounds = region_parts[1].split("-")
    coordinate_1.append(bounds[0])  # start, kept as text
    coordinate_2.append(bounds[1])  # end, kept as text


### Create a BED file using the attributes

In [149]:
# Write one tab-separated row per locus: chrom, start, end, motif length,
# motif, plus a sixth column of comma-joined off-target regions for the loci
# that have them.
# NOTE(review): coordinates are written exactly as they appear in the Illumina
# catalog. Confirm that GangSTR expects the same start convention (0-based vs
# 1-based); if not, the start column needs an offset.
offtarget_rows = set(off_target_index)  # constant-time membership test

# The context manager closes the file even if a write raises part-way through.
with open("gangstr_catalog_with_offtarget_042022.bed", "w") as gangstr_catalog:
    for i in range(len(coordinates)):
        fields = [
            chromosome[i],
            coordinate_1[i],
            coordinate_2[i],
            str(motif_length[i]),
            motif[i],
        ]
        if i in offtarget_rows:
            fields.append(offtarget[i])
        gangstr_catalog.write("\t".join(fields) + "\n")