In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import scipy.stats as stats

In [5]:
!fastq-dump -A SRR7263706 --split-3 --gzip -O /casa/chemin/p/bioinfo1_2023/data_presentation

Rejected 10604 READS because of filtering out non-biological READS
Read 10604 spots for SRR7263706
Written 10604 spots for SRR7263706


In [2]:
!cutadapt -a AAACTGGAATTCTCGGGTGCCAAGGC -g AATGATACGGCGACCACCGAGATCTACACGTTCAGTAATACGACTCACTATAGGG -m 15 -M 25 -o /casa/chemin/p/bioinfo1_2023/data_presentation/RBNS_tr.fastq.gz /casa/chemin/p/bioinfo1_2023/data_presentation/SRR7263706.fastq.gz

This is cutadapt 4.4 with Python 3.10.8
Command line parameters: -a AAACTGGAATTCTCGGGTGCCAAGGC -g AATGATACGGCGACCACCGAGATCTACACGTTCAGTAATACGACTCACTATAGGG -m 15 -M 25 -o /casa/chemin/p/bioinfo1_2023/data_presentation/RBNS_tr.fastq.gz /casa/chemin/p/bioinfo1_2023/data_presentation/SRR7263706.fastq.gz
Processing single-end reads on 1 core ...
Done           00:00:00        10,604 reads @  32.0 µs/read;   1.87 M reads/minute
Finished in 0.348 s (32.834 µs/read; 1.83 M reads/minute).

=== Summary ===

Total reads processed:                  10,604
Reads with adapters:                    10,534 (99.3%)

== Read fate breakdown ==
Reads that were too short:               3,296 (31.1%)
Reads that were too long:                  263 (2.5%)
Reads written (passing filters):         7,045 (66.4%)

Total basepairs processed:     1,654,224 bp
Total written (filtered):        107,216 bp (6.5%)

=== Adapter 1 ===

Sequence: AAACTGGAATTCTCGGGTGCCAAGGC; Type: regular 3'; Length: 26; Trimmed: 10415 times


In [4]:
!gunzip data_presentation/RBNS_tr.fastq.gz

In [1]:
with open("data_presentation/RBNS_tr.fastq") as f:
    lines = f.readlines()
    seq = {k: v for k, v in zip(lines[::4], lines[1::4])}
seq

{'@SRR7263706.1 1 length=156\n': 'GGGGAGCAGTCGTTG\n',
 '@SRR7263706.3 3 length=156\n': 'GGGGAGAGCGTGTTT\n',
 '@SRR7263706.4 4 length=156\n': 'GGGAAGCGGTCGTTT\n',
 '@SRR7263706.5 5 length=156\n': 'AGGGTATGATGCGTTT\n',
 '@SRR7263706.6 6 length=156\n': 'GGGTAGTGTGCGTTT\n',
 '@SRR7263706.8 8 length=156\n': 'GGGGCAAAGAGCGTG\n',
 '@SRR7263706.12 12 length=156\n': 'GGGAGTGTGTTGTTT\n',
 '@SRR7263706.13 13 length=156\n': 'GGGAGTTGCGTGTTT\n',
 '@SRR7263706.15 15 length=156\n': 'GGGCATCGTGAGTTT\n',
 '@SRR7263706.16 16 length=156\n': 'GGGGTGTAGTATGTA\n',
 '@SRR7263706.17 17 length=156\n': 'GGGTTGAGTATGTTT\n',
 '@SRR7263706.18 18 length=156\n': 'GGGTGGCAGTCGTTT\n',
 '@SRR7263706.21 21 length=156\n': 'GGGAGGCAGTCGTTG\n',
 '@SRR7263706.22 22 length=156\n': 'GGGGTAGTCAGCGTT\n',
 '@SRR7263706.24 24 length=156\n': 'GGGATTGGTGAGTTT\n',
 '@SRR7263706.27 27 length=156\n': 'GGGAGTGGGTTGTTT\n',
 '@SRR7263706.29 29 length=156\n': 'GGGTTGAGCGTGTCT\n',
 '@SRR7263706.30 30 length=156\n': 'GGGATGAGCGCGTTT\n',
 '@

In [1]:
seq = {}
with open("data_presentation/RBNS_tr.fastq") as f:
    for i, line in enumerate(f):
        if not i % 4:
            read = line.strip().split()[0]
            assert read not in seq # read in seq이면 error 발생
        elif i % 4 == 1:
            seq[read] = line.strip()
        else:
            continue
seq

{'@SRR7263706.1': 'GGGGAGCAGTCGTTG',
 '@SRR7263706.3': 'GGGGAGAGCGTGTTT',
 '@SRR7263706.4': 'GGGAAGCGGTCGTTT',
 '@SRR7263706.5': 'AGGGTATGATGCGTTT',
 '@SRR7263706.6': 'GGGTAGTGTGCGTTT',
 '@SRR7263706.8': 'GGGGCAAAGAGCGTG',
 '@SRR7263706.12': 'GGGAGTGTGTTGTTT',
 '@SRR7263706.13': 'GGGAGTTGCGTGTTT',
 '@SRR7263706.15': 'GGGCATCGTGAGTTT',
 '@SRR7263706.16': 'GGGGTGTAGTATGTA',
 '@SRR7263706.17': 'GGGTTGAGTATGTTT',
 '@SRR7263706.18': 'GGGTGGCAGTCGTTT',
 '@SRR7263706.21': 'GGGAGGCAGTCGTTG',
 '@SRR7263706.22': 'GGGGTAGTCAGCGTT',
 '@SRR7263706.24': 'GGGATTGGTGAGTTT',
 '@SRR7263706.27': 'GGGAGTGGGTTGTTT',
 '@SRR7263706.29': 'GGGTTGAGCGTGTCT',
 '@SRR7263706.30': 'GGGATGAGCGCGTTT',
 '@SRR7263706.31': 'GGGTAGGTGACGTTT',
 '@SRR7263706.33': 'GGGTTGCGGTCGTTT',
 '@SRR7263706.34': 'GGGAAGAGGTCGTTT',
 '@SRR7263706.35': 'GGGAAGTGAGCGTTT',
 '@SRR7263706.36': 'GGGTGGTGCGTGTTT',
 '@SRR7263706.37': 'GGGGAGCAGTCGTTG',
 '@SRR7263706.38': 'AAGGGGGGTAGTATTG',
 '@SRR7263706.40': 'GGGATAGTGGTGTTT',
 '@SRR7263706.41

with open("data_presentation/RBNS_tr.fastq") as f:
    readlist = f.read().splitlines()
    seq = {k: v for k, v in zip(readlist[::4], readlist[1::4])}
seq

In [2]:
not_GGG = {k:v for k,v in seq.items() if "GGG" not in v}
not_GGG

{'@SRR7263706.107': 'GGTAGTGTGAGTTTA',
 '@SRR7263706.221': 'GGTGTTGTGAGTTTT',
 '@SRR7263706.395': 'GTGAGAGTTGTGTTT',
 '@SRR7263706.440': 'GGTAGTGTGCGTTTG',
 '@SRR7263706.705': 'GGTGAGTACGGTTTA',
 '@SRR7263706.777': 'GGAGAGTCGCGTTTG',
 '@SRR7263706.786': 'AGGTGAGTGATGTGTT',
 '@SRR7263706.971': 'GGAGGCAGTCGTTTGA',
 '@SRR7263706.1099': 'GGTGAGGTGAGTTTT',
 '@SRR7263706.1383': 'GGAGTGAGCACGTTTT',
 '@SRR7263706.1450': 'GGTCATGGTGAGTTT',
 '@SRR7263706.1651': 'GGTCAGGAGGCGTTT',
 '@SRR7263706.1868': 'GGTAGTGTGCGTTTA',
 '@SRR7263706.2398': 'GAGGTAAGTTGTGTTT',
 '@SRR7263706.2435': 'GGTTGAGTGCGTTTC',
 '@SRR7263706.2712': 'GGAAGTGATGCGTTT',
 '@SRR7263706.2762': 'GGTGGAGTTGGTTTC',
 '@SRR7263706.2793': 'GGAAAAGTGAGTTTAG',
 '@SRR7263706.2864': 'GGTAGGAGGCGGTTT',
 '@SRR7263706.2975': 'TTGGAGGAGCGTGTT',
 '@SRR7263706.2982': 'GGAAGGTGAGCGTTT',
 '@SRR7263706.3115': 'CGGAGTAGAGTTTGTG',
 '@SRR7263706.3574': 'GGTGAAGTATGTTTG',
 '@SRR7263706.3583': 'AAAGGAGGATGTCGT',
 '@SRR7263706.3584': 'AGGTGGAGTATGTTT',
 '

In [3]:
len(not_GGG.keys())

76

In [26]:
not_GTTT = {k:v for k,v in seq.items() if "GTTT" not in v}
not_GTTT

{'@SRR7263706.1': 'GGGGAGCAGTCGTTG',
 '@SRR7263706.8': 'GGGGCAAAGAGCGTG',
 '@SRR7263706.16': 'GGGGTGTAGTATGTA',
 '@SRR7263706.21': 'GGGAGGCAGTCGTTG',
 '@SRR7263706.22': 'GGGGTAGTCAGCGTT',
 '@SRR7263706.29': 'GGGTTGAGCGTGTCT',
 '@SRR7263706.37': 'GGGGAGCAGTCGTTG',
 '@SRR7263706.38': 'AAGGGGGGTAGTATTG',
 '@SRR7263706.50': 'GAGGGAGGAGTGCGTT',
 '@SRR7263706.53': 'GAGGGGTGGTATTGTT',
 '@SRR7263706.56': 'GGGAGCAAGTCGTGC',
 '@SRR7263706.57': 'GGGGTGAGTTGGTTC',
 '@SRR7263706.62': 'GGGAGTGGTAAGTAA',
 '@SRR7263706.70': 'GGGGGAAGTCGTGTT',
 '@SRR7263706.73': 'GGGATGAGCGCGTCA',
 '@SRR7263706.74': 'GGGGGTAGTCGTGTAC',
 '@SRR7263706.99': 'GGGTGAGGAGCGTTC',
 '@SRR7263706.151': 'GGGAGGAGTCGGTTC',
 '@SRR7263706.158': 'GGGAGTATGGTGTAT',
 '@SRR7263706.159': 'GGGAGGTAGGCGTTG',
 '@SRR7263706.163': 'GGGGTATGAGTCGTT',
 '@SRR7263706.180': 'GGGATGTGAGCGTAC',
 '@SRR7263706.184': 'GGGTGGACGTCGTCA',
 '@SRR7263706.193': 'GGGGAGAGCGTCGTT',
 '@SRR7263706.194': 'GGGGTGAATAGTGTT',
 '@SRR7263706.207': 'GTGGGATTGAGTTGT',
 

In [27]:
len(not_GTTT.keys())

1240

In [8]:
df_seq = pd.DataFrame.from_dict(seq, orient="index", columns=["seq"])
df_seq

Unnamed: 0,seq
@SRR7263706.1,GGGGAGCAGTCGTTG
@SRR7263706.3,GGGGAGAGCGTGTTT
@SRR7263706.4,GGGAAGCGGTCGTTT
@SRR7263706.5,AGGGTATGATGCGTTT
@SRR7263706.6,GGGTAGTGTGCGTTT
...,...
@SRR7263706.10598,GGGAGTGTGGTGTTT
@SRR7263706.10599,GGGGAGCAGTCGTTT
@SRR7263706.10601,GGGGGAGGGAGTTGT
@SRR7263706.10602,GGGTAGAGTGCGTTTA


In [9]:
df_seq['gggPos'] = df_seq.apply(lambda x: x.seq.find("GGG"), axis=1)
df_seq

Unnamed: 0,seq,gggPos
@SRR7263706.1,GGGGAGCAGTCGTTG,0
@SRR7263706.3,GGGGAGAGCGTGTTT,0
@SRR7263706.4,GGGAAGCGGTCGTTT,0
@SRR7263706.5,AGGGTATGATGCGTTT,1
@SRR7263706.6,GGGTAGTGTGCGTTT,0
...,...,...
@SRR7263706.10598,GGGAGTGTGGTGTTT,0
@SRR7263706.10599,GGGGAGCAGTCGTTT,0
@SRR7263706.10601,GGGGGAGGGAGTTGT,0
@SRR7263706.10602,GGGTAGAGTGCGTTTA,0


In [17]:
df_seq['gtttPos'] = df_seq['gtttPos'] = df_seq.apply(lambda x: x.seq.find("GTTT"), axis=1)
df_seq

Unnamed: 0,seq,gggPos,gtttPos
@SRR7263706.1,GGGGAGCAGTCGTTG,0,-1
@SRR7263706.3,GGGGAGAGCGTGTTT,0,11
@SRR7263706.4,GGGAAGCGGTCGTTT,0,11
@SRR7263706.5,AGGGTATGATGCGTTT,1,12
@SRR7263706.6,GGGTAGTGTGCGTTT,0,11
...,...,...,...
@SRR7263706.10598,GGGAGTGTGGTGTTT,0,11
@SRR7263706.10599,GGGGAGCAGTCGTTT,0,11
@SRR7263706.10601,GGGGGAGGGAGTTGT,0,-1
@SRR7263706.10602,GGGTAGAGTGCGTTTA,0,11


In [18]:
df_seq['len_btn'] = df_seq['gtttPos'] - df_seq['gggPos']
df_seq

Unnamed: 0,seq,gggPos,gtttPos,len_btn
@SRR7263706.1,GGGGAGCAGTCGTTG,0,-1,-1
@SRR7263706.3,GGGGAGAGCGTGTTT,0,11,11
@SRR7263706.4,GGGAAGCGGTCGTTT,0,11,11
@SRR7263706.5,AGGGTATGATGCGTTT,1,12,11
@SRR7263706.6,GGGTAGTGTGCGTTT,0,11,11
...,...,...,...,...
@SRR7263706.10598,GGGAGTGTGGTGTTT,0,11,11
@SRR7263706.10599,GGGGAGCAGTCGTTT,0,11,11
@SRR7263706.10601,GGGGGAGGGAGTTGT,0,-1,-1
@SRR7263706.10602,GGGTAGAGTGCGTTTA,0,11,11


In [28]:
df_seq[df_seq['len_btn']!=11]

Unnamed: 0,seq,gggPos,gtttPos,len_btn
@SRR7263706.1,GGGGAGCAGTCGTTG,0,-1,-1
@SRR7263706.8,GGGGCAAAGAGCGTG,0,-1,-1
@SRR7263706.16,GGGGTGTAGTATGTA,0,-1,-1
@SRR7263706.21,GGGAGGCAGTCGTTG,0,-1,-1
@SRR7263706.22,GGGGTAGTCAGCGTT,0,-1,-1
...,...,...,...,...
@SRR7263706.10586,GTGGGGGTTTGTATT,2,6,4
@SRR7263706.10587,GGGAGGCAGTCGTTG,0,-1,-1
@SRR7263706.10594,GAGGGAGGTCGTAGT,2,-1,-3
@SRR7263706.10595,GGGATGAGTTTGTAT,0,7,7
