# SuPreMo get_seq walkthrough

In this walkthrough, we will run get_seq on CTCF binding site deletions, which we generated in [custom_perturbations.ipynb](https://github.com/ketringjoni/Akita_variant_scoring/blob/main/walkthroughs/custom_perturbations.ipynb).

To do so, we ran the command printed by the following cell in the terminal. By changing the variables below, you can generate a command that fits your own custom set of perturbations. Note: there are more arguments available that are not included here.

In [1]:
import pandas as pd
import os
import pysam

# --- Settings used to build the SuPreMo command ---------------------------

# Input file with the perturbations to process.
# The original walkthrough input was '../test/input/CTCF_del_symb_alleles'
# (a set of 50 CTCF deletions).
in_file = '../temp_input.txt'

# Passed to --shift_by: shift sequences by 1 bp left and right (plus unshifted).
shift_by = '-1 0 1'

# Passed to --revcomp: also process the reverse complement of all sequences.
revcomp = 'add_revcomp'

# Passed to --file: prefix of the output files.
file = 'CTCF_del'

# Passed to --dir: output directory.
directory = '../test/output'

# Default sequence length for Akita.
seq_length = 1048576

# Pieces of the command line, in the order they appear in the printed command.
command_parts = [
    'python scripts/SuPreMo.py', in_file,
    '--shift_by', shift_by,
    '--revcomp', revcomp,
    '--file', file,
    '--dir', directory,
    '--get_seq',
]

print('Run this command in the main directory:\n')
print(' '.join(command_parts))


Run this command in the main directory:

python scripts/SuPreMo.py ../temp_input.txt --shift_by -1 0 1 --revcomp add_revcomp --file CTCF_del --dir ../test/output --get_seq


In [7]:
# Common path prefix shared by the output files: <directory>/<file>
output_location = (directory, file)
out_file = os.path.join(*output_location)

In [8]:
# Load the tab-delimited input file into a DataFrame.
variants = pd.read_csv(in_file, delimiter='\t')

# Read input

import sys

# Put the scripts directory first on the module search path so that
# `reading_utils` can be imported from this notebook.
scripts_dir = '../scripts'
sys.path.insert(0, scripts_dir)

import reading_utils

# Module-level setting on reading_utils, assigned before the input is read.
reading_utils.var_set_size = 10_000_000

# Read the input with reading_utils; this rebinds `variants`.
# NOTE(review): the meaning of the second argument (0) is not visible here — confirm
# against reading_utils.read_input.
variants = reading_utils.read_input(in_file, 0)

# Display the table.
variants

Unnamed: 0,CHROM,POS,REF,ALT,END,SVTYPE,SVLEN
0,chr1,100276250,-,-,100376250,DUP,100000


# Read in sequences

In [9]:
# Path of the FASTA file holding the generated sequences: <out_file>_sequences.fa
seq_file = out_file + '_sequences.fa'
# NOTE(review): this hard-coded path overrides the one derived from `out_file`
# above; the outputs shown in this walkthrough come from this file. Remove this
# line to read the file produced by the command printed earlier.
seq_file = '../temp_output/score_var_results_sequences.fa'

# Open the FASTA file only long enough to read its record names, so the file
# handle is closed again instead of being left open.
with pysam.Fastafile(seq_file) as fasta:
    seq_names = fasta.references

# Preview the first 10 sequence names.
seq_names[:10]

['0_0_0_[524287_474287]', '0_0_1_[524287_474287]']

Unnamed: 0,CHROM,POS,REF,ALT,END,SVTYPE,SVLEN
0,chr1,100276250,-,-,100376250,DUP,100000


In [90]:
# Get sequences
#
# For every variant, shift and allele: find the matching record in the FASTA
# file, store its uppercase sequence in `sequences` under the key
# '<var_index>_<allele name>', and print the first and last 10 bases of the
# variant region as a sanity check.
# Note: the key does not include the shift or strand, so with more than one
# shift/strand later iterations overwrite earlier entries.

sequences = {}

# Only the unshifted sequences are inspected here.
shift_by = str(0)

# Open the FASTA file once for the whole loop and close it when done, rather
# than opening a new (never closed) handle for every sequence.
with pysam.Fastafile(seq_file) as fasta:

    for var_index in variants.index:

        # Variant type and length are the same for every shift/strand of a
        # variant, so look them up once per variant.
        if 'SVLEN' in variants.columns:
            # var_index is an index label, so use label-based lookup (.loc);
            # .iloc is positional and only agrees for a default 0..n-1 index.
            SVTYPE = variants.loc[var_index].SVTYPE
            SVLEN = variants.loc[var_index].SVLEN
            if SVTYPE == 'DUP':
                # Twice the variant length is inspected for duplications
                # (presumably to span both copies — TODO confirm).
                SVLEN = 2*SVLEN
        else:
            # No SV annotations in the input. SVLEN must be an integer because
            # it is added to `position` below (the string '0' raised TypeError);
            # with 0 the printed variant region is empty.
            SVTYPE, SVLEN = '-', 0

        # Allele index (as used in the sequence names) -> allele label.
        if SVTYPE == 'BND':
            alleles = {0: 'REF1',
                       1: 'REF2',
                       2: 'ALT'}
        else:
            alleles = {0: 'REF',
                       1: 'ALT'}

        for shift in shift_by.split(' '):
            for revcomp in ['']:
                for allele, allele_name in alleles.items():
                    variant_id = f'{var_index}_{shift}{revcomp}_{allele}'

                    # Match on the name prefix: a plain substring test would
                    # also accept e.g. '10_0_0_[...]' when looking for '0_0_0'.
                    matches = [x for x in seq_names if x.startswith(variant_id + '_')]
                    if not matches:
                        raise IndexError(
                            f'No sequence starting with {variant_id}_ found in {seq_file}')
                    seq_name = matches[0]

                    # The bracketed suffix of the name holds one '_'-separated
                    # position per allele; take the one for this allele.
                    position = int(seq_name.split('[')[1].split(']')[0].split('_')[allele])

                    sequence = fasta.fetch(seq_name, 0, seq_length).upper()

                    sequences[f'{var_index}_{allele_name}'] = sequence

                    # Look at the region where the variant is in that sequence
                    variant_seq = sequence[position:position+SVLEN]
                    print('var_index =',var_index, '\tshift =', shift, revcomp,
                          '\tstart of variant:', variant_seq[:10],
                          '\tend of variant:', variant_seq[-10:])

var_index = 0 	shift = 0  	start of variant: TCCCTTATAT 	end of variant: GTCAAGAGTT
var_index = 0 	shift = 0  	start of variant: TCCCTTATAT 	end of variant: ATGTATACAT


In [93]:
# List the stored entries: one '<var_index>_<allele name>' key per allele.
sequences.keys()

dict_keys(['0_REF', '0_ALT'])

In [92]:
# Display the whole dictionary (key -> uppercase sequence string).
# Each value is a fetched sequence, so this output can be very long.
sequences

{'0_REF': 'CAAAATAAATGACAAGATATAAGGGATGGTGAGATAGAGCAGGGACCTCTCTCTTAGGGACCTACCTCTCTCAGCCTTCATAGAATTAAACAAAGGTAGGACCTTGCTCTGGATTAGACTTTGGCTTAAGGAAATGTTGTGGTTGGTTTGATCTTCTATCCAGACCACTAAAACTTTCTCCATATCAACAATAAGGCTGTTTTGCCTTCTTATCATTCATATGTTCACTGGAGTAACCATTTTGTTCAAGAACTTTCTCTTTGCATTTACAACTTGGCTGACTGGAACAAGAGGCCTGACTTTCAGCCACTCTTGGCTTTTGGCATGCCTTCCTCACTAAGCTTAATCATTTCTAGCTTTTTATTGAAAGTGAAAGATATATGACTCTTCCTTTCACTTGAACACTTACAGGCCATTATAGGGTTCTTAAATGGCCCAATTTCAATATTGTTGTGCCTCAGGGAATAGGGAGGCCTGAGGAGAAGGAGAGAGATTGGAGAATAGCCAGTTGGTAGAGCAGTCAGAACATACACAGTTATCAATTAAGTTTGCCATCTTATACGGGTGCAGTTTGTGTCTCTCCAAAACAATAGAAACATCAAAGATCACTGATCACAGATCACCATAACAGATATAATGAAAAAGTTTAGAAACTTGCAAAAGTGTGGCCAAGCGTGGTGGCTCACACCTGTAATCCCCGCATTTTGGGAGGCCAAGTTGGGAGGATCACTTGAGCCCAGTAGTTCAAGACCAGCCTGGGCAATATAATGAGACCCCCATCTCTATTTTTTAATTAAAAATAACAAATAACAAATAATAAAATAGAAATTTACCAAAATGTGACATAGAGACACAAAGTGAGCACATGCTGTTGGAAAAATGGTGCCAACAGACTTGCTCAATGAGGAGTTGCCACAAGCCTTCAATTTGTAGCCAGGAACAGGACAAGGGGGCTGGTCTCCAAACCCTGGGCTCAAGGGAACCTCAAC