In [17]:
import os
import gzip
import re
import glob
from functools import reduce
from collections import defaultdict, Counter
from itertools import product
from toolz import groupby, sliding_window, concat
from more_itertools import windowed
from Bio import SeqIO
import csv
import pandas as pd
import numpy as np
from scipy.stats import norm
# import fasta_parser

In [2]:
def slidingWindow(sequence,winSize,step):
	"""Returns a generator that will iterate through
	the defined chunks of input sequence.  Input sequence
	must be iterable.
	From scipher.wordpress.com"""
	# Verify the inputs
	try: it = iter(sequence)
	except TypeError:
		raise Exception("**ERROR** sequence must be iterable.")
	if not ((type(winSize) == type(0)) and (type(step) == type(0))):
		raise Exception("**ERROR** type(winSize) and type(step) must be int.")
	if step > winSize:
		raise Exception("**ERROR** step must not be larger than winSize.")
	if winSize > len(sequence):
		raise Exception("**ERROR** winSize must not be larger than sequence length.")

	# Pre-compute number of chunks to emit
	numOfChunks = int(((len(sequence)-winSize)/step)+1)
	# Do the work
	return numOfChunks

In [9]:
def get_chunks(sequence, window_size, step=1):
    """Returns a chunk of length of window_size and the end of the window size
    
    Inputs:
        sequence - a string representing a sequence.
        window_size - integer representing the length of the
                      chunk/window.
        step - a interger representing the length of the overlap window.
    
    Outputs:
        chunk - a string of length 'window'.
        end - a integer representing the end of the chunk.
    """
    # get the sequence length
    k = len(sequence)
    # get the index for each end and chunk
    for i in range(0, k - window_size + 1, step):
        # generate the end of the window
        end = i + window_size
        # get the slice of the sequence
        chunk = sequence[i:i + window_size]
        # assure the the chunk is the expected size
        assert len(chunk) == window_size
        yield chunk, end

In [15]:
def calc_org_tetra(fn, org):
    """ 
    Original code from Leighton Pritchard, leighton.pritchard@hutton.ac.uk
    redistributed and modified it under the terms of the GNU General 
    Public License as published by the Free Software Foundation, either 
    version 3 of the License, or (at your option) any later version.
    Calculate the tetranucleotide frequencies
    for each sequence, on each strand, and follow Teeling et al. (2004)
    in calculating a corresponding Z-score for each observed
    tetranucleotide frequency, dependent on the mono-, di- and tri-
    nucleotide frequencies for that input sequence.
    """
    import collections
    from Bio import SeqIO
    import math
    org_tetraz = {}
    # For the Teeling et al. method, the Z-scores require us to count
    # mono, di, tri and tetranucleotide sequences
    monocnt, dicnt, tricnt, tetracnt = (collections.defaultdict(int),
                                        collections.defaultdict(int),
                                        collections.defaultdict(int),
                                        collections.defaultdict(int))

    for s in [str(fn).upper(),
              str(fn.reverse_complement()).upper()]:
        # Since the Teeling et al. algorithm requires us to consider
        # both strand orientations, monocounts are easy
        monocnt['G'] += s.count('G')
        monocnt['C'] += s.count('C')
        monocnt['T'] += s.count('T')
        monocnt['A'] += s.count('A')
        # For di, tri and tetranucleotide counts, we loop over the
        # sequence and its reverse complement, until we're near the end:
        for i in range(len(s[:-4])):
            di, tri, tetra = s[i:i+2], s[i:i+3], s[i:i+4]
            dicnt[str(di)] += 1
            tricnt[str(tri)] += 1
            tetracnt[str(tetra)] += 1
        # We clean up the straggling bit at the end:
        tricnt[str(s[-4:-1])] += 1
        tricnt[str(s[-3:])] += 1
        dicnt[str(s[-4:-2])] += 1
        dicnt[str(s[-3:-1])] += 1
        dicnt[str(s[-2:])] += 1
    # Following Teeling (2004), we calculate expected frequencies for each
    # tetranucleotide; we ignore ambiguity symbols
    tetra_exp = {}
    for t in [tet for tet in tetracnt if tet_clean(tet)]:
        tetra_exp[t] = 1.*tricnt[t[:3]]*tricnt[t[1:]]/dicnt[t[1:3]]
    # Following Teeling (2004) we approximate the std dev of each
    # tetranucleotide
    tetra_sd = {}
    for t, exp in tetra_exp.items():
        den = dicnt[t[1:3]]
        tetra_sd[t] = math.sqrt(exp * (den - tricnt[t[:3]]) * \
                                    (den - tricnt[t[1:]]) / (den * den))
    # Following Teeling (2004) we calculate the Z-score for each
    # tetranucleotide
    tetra_z = {}
    for t, exp in tetra_exp.items():
        try:
            tetra_z[t] = (tetracnt[t] - exp)/tetra_sd[t]
        except ZeroDivisionError:
            # We hit a zero in the estimation of variance
            zeroes = [k for k,v in tetra_sd.items() if v == 0]
            tetra_z[t] = 1 / (dicnt[t[1:3]] * dicnt[t[1:3]])
    org_tetraz[org] = tetra_z
    return org_tetraz

In [31]:
def tet_clean(s):
    """ 
    Original code from Leighton Pritchard, leighton.pritchard@hutton.ac.uk
    redistributed and modified it under the terms of the GNU General 
    Public License as published by the Free Software Foundation, either 
    version 3 of the License, or (at your option) any later version.
    Checks that a passed string contains only unambiguous IUPAC nucleotide
    symbols. We are assuming that a low frequency of IUPAC ambiguity symbols
    doesn't affect our calculation.
    """
    if not len(set(s) - set('ACGT')):
        return True
    return False

In [29]:
with gzip.open("Data/Genomes_splitted/Thermodesulfatator/Chromosomes/GCF_000217795.1_chr.fna.gz", "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        fn = record.seq
        Id = record.id

In [32]:
calc_org_tetra(fn, Id)

{'GCF_000217795.1_chr': {'AATT': 7.343080795525697,
  'ATTT': 7.998999195921277,
  'TTTC': -9.65184982065855,
  'TTCT': 12.704943761837193,
  'TCTC': 1.5119713503675307,
  'CTCG': 14.435064963444294,
  'TCGA': -44.82793657051853,
  'CGAG': 14.435064963444294,
  'GAGA': 1.5119713503675307,
  'AGAT': -3.561859184031775,
  'GATA': 30.91922082476304,
  'ATAA': 19.644760778624175,
  'TAAG': -39.61418839563839,
  'AAGT': 8.736492682546613,
  'AGTA': -0.9428282793931057,
  'GTAT': -27.31832076727808,
  'TATT': 24.774989641315845,
  'TTTT': -11.006727948613154,
  'TTTG': 7.12260098955833,
  'TTGT': 4.091789563330711,
  'TGTT': 20.029464125435712,
  'GTTC': -4.873442645542426,
  'TTCA': -10.766586114836791,
  'TCAA': 27.566610289572814,
  'CAAA': 7.12260098955833,
  'AAAA': -11.006727948613154,
  'AAAG': -1.0276197400443996,
  'AAGC': -23.696105700198707,
  'AGCT': 4.605017056157665,
  'GCTA': 27.168978939458025,
  'CTAT': 27.763377573961595,
  'TTTA': 16.208516333481317,
  'TTAG': -24.46699293

In [4]:
handle = gzip.open("Data/Genomes_splitted/Thermodesulfatator/Chromosomes/GCF_000217795.1_chr.fna.gz", "rt")
header = handle.readline()
a = handle.readlines()
# a = ''.join([x.strip() for x in a])

In [5]:
a = ''.join([x.strip() for x in a])

In [8]:
slidingWindow(a, 2000, 500)

4641

In [11]:
len(list(get_chunks(a, 2000, 500)))

4641

In [11]:
conv = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
b = np.array([conv[x] for x in a], dtype=np.uint8)

In [19]:
b.sum()

3481219

In [18]:
len(rolling_window(b,100)[0])

100

In [3]:
dir_in = 'Results'
sub_dir = 'kmer_counts'
ext = 'csv'

In [6]:
names = os.listdir('Results/kmer_counts')

In [37]:
names[0]

'Methyloligella'

In [48]:
def csv_data(dir_in, sub_dir, names):
    filenames = []
    for name in names:
        filename = ''.join(glob.glob(f'{dir_in}/{sub_dir}/{name}/{name}_k2_8_chr.csv'))
        filenames.append(filename)
    return filenames

In [50]:
csv_files = csv_data(dir_in, sub_dir, names)

In [55]:
csv_files

['Results/kmer_counts/Methyloligella/Methyloligella_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/Xenorhabdus_k2_8_chr.csv',
 'Results/kmer_counts/Undibacterium/Undibacterium_k2_8_chr.csv',
 'Results/kmer_counts/Tenuifilum/Tenuifilum_k2_8_chr.csv',
 'Results/kmer_counts/Dyadobacter/Dyadobacter_k2_8_chr.csv',
 'Results/kmer_counts/Alloactinosynnema/Alloactinosynnema_k2_8_chr.csv',
 'Results/kmer_counts/Parolsenella/Parolsenella_k2_8_chr.csv',
 'Results/kmer_counts/Rhodobacter/Rhodobacter_k2_8_chr.csv',
 'Results/kmer_counts/Limnobaculum/Limnobaculum_k2_8_chr.csv',
 'Results/kmer_counts/Caldithrix/Caldithrix_k2_8_chr.csv',
 'Results/kmer_counts/Thermanaerovibrio/Thermanaerovibrio_k2_8_chr.csv',
 'Results/kmer_counts/Chromohalobacter/Chromohalobacter_k2_8_chr.csv',
 'Results/kmer_counts/Croceibacter/Croceibacter_k2_8_chr.csv',
 'Results/kmer_counts/Paenarthrobacter/Paenarthrobacter_k2_8_chr.csv',
 'Results/kmer_counts/Herminiimonas/Herminiimonas_k2_8_chr.csv',
 'Results/kmer_counts/Chl

In [54]:
len(csv_files)

1268

In [53]:
def 
for csv in csv_files:
    df = pd.read_csv(csv, header=None)
    print(df)

              0       1
0            AA  117734
1            TT  127034
2            TC  216731
3            TA   56838
4            GT  134515
...         ...     ...
87371  CCCCCCGG      23
87372  CCCCCCGT      11
87373  CCCCCCTA      12
87374  CTTTTTTG      32
87375  TTTTTTTT      11

[87376 rows x 2 columns]
              0         1
0            AA  399017.0
1            TC  255831.0
2            GT  229736.0
3            TA  260293.0
4            AC  226692.0
...         ...       ...
87372  CGACTGTC      35.0
87373  CGACTGTA      28.0
87374  CGACTGGT      34.0
87375  CGACTTGT      30.0
87376       NaN       2.0

[87377 rows x 2 columns]
              0         1
0            AA  405724.0
1            TG  446966.0
2            CG  349044.0
3            GG  331012.0
4            TT  405384.0
...         ...       ...
87371  CGACGAAC      50.0
87372  CGACGAAA      86.0
87373  CGACCTTT      64.0
87374  CGACGATT      94.0
87375  TTTTTTTT     283.0

[87376 rows x 2 columns]
          

              0       1
0            AA  163224
1            NN    1039
2            TT  163175
3            TG   78193
4            TC   64127
...         ...     ...
87434  CCCCCTAA       2
87435  CCCCCTAC       2
87436  CCCAACAT       9
87437  NAAATCTT       2
87438       NaN       2

[87439 rows x 2 columns]
              0         1
0            AA  581220.0
1            GG  246828.0
2            GT  262108.0
3            AT  517328.0
4            TA  388173.0
...         ...       ...
88188  CGAGAAGA      61.0
88189  CGAGAACT      38.0
88190  CGAGACCT      25.0
88191  TTTTTTTT     673.0
88192       NaN       3.0

[88193 rows x 2 columns]
              0         1
0            AA  278737.0
1            TG  211153.0
2            CG  307082.0
3            GG  250912.0
4            TT  281352.0
...         ...       ...
87371  CGACGAAC      64.0
87372  CGACGAAA     128.0
87373  CGACCTTT      96.0
87374  CGACGATT     108.0
87375  TTTTTTTT     148.0

[87376 rows x 2 columns]
          

              0       1
0            AA  261755
1            CN       1
2            TG  322934
3            TC  227012
4            TA  138199
...         ...     ...
87406  CCCCCGCT      46
87407  CNACCCTG       1
87408  CCCAAAGG      62
87409  NACCCTGT       1
87410       NaN       1

[87411 rows x 2 columns]
              0       1
0            AA  148869
1            TT  146448
2            TC  198542
3            TA  170035
4            GT  160210
...         ...     ...
87371  CCCCCCGG      38
87372  CCCCCCGT      10
87373  CCCCCCTA      28
87374  CTTTTTTG      31
87375  TTTTTTTT       0

[87376 rows x 2 columns]
              0         1
0            AA  208643.0
1            TG   94310.0
2            CG   22812.0
3            GG   47821.0
4            TT  209513.0
...         ...       ...
87371  CGACGAAC       2.0
87372  CGACGAAA       1.0
87373  CGACCTTT       8.0
87374  CGACGATT       4.0
87375  TTTTTTTT     257.0

[87376 rows x 2 columns]
              0       1
0         

              0         1
0            AA  150290.0
1            TG  225746.0
2            CG  532592.0
3            GG  334167.0
4            TT  149323.0
...         ...       ...
87371  CGACGAAC     170.0
87372  CGACGAAA     152.0
87373  CGACCTTT      55.0
87374  CGACGATT     111.0
87375  TTTTTTTT       7.0

[87376 rows x 2 columns]
              0         1
0            AA  378338.0
1            NC       1.0
2            NG       2.0
3            NN    1402.0
4            CG   22770.0
...         ...       ...
87842  CGACTAAC       8.0
87843  CGACTAAA      12.0
87844  CGACTCAA       4.0
87845  TTTTTTTT     338.0
87846       NaN       4.0

[87847 rows x 2 columns]
              0         1
0            AA   80738.0
1            TG  182436.0
2            CG  447256.0
3            GG  300404.0
4            TT   85147.0
...         ...       ...
87371  CGACGAAC     258.0
87372  CGACGAAA      79.0
87373  CGACCTTT      17.0
87374  CGACGATT      96.0
87375  TTTTTTTT       0.0

[87376 rows

              0         1
0            AA  232952.0
1            GG   52571.0
2            TC   98192.0
3            CT  114837.0
4            AT  159578.0
...         ...       ...
87406  CGACGACT       0.0
87407  CGACGACG       0.0
87408  CGACGACC       2.0
87409  CGACGCCC       0.0
87410  TTTTTTTT     155.0

[87411 rows x 2 columns]
              0       1
0            AA   61330
1            TT   60188
2            TC  330938
3            TA   37403
4            GT  264411
...         ...     ...
87371  CCCCCCGG     118
87372  CCCCCCGT      44
87373  CCCCCCTA       2
87374  CTTTTTTG       0
87375  TTTTTTTT       0

[87376 rows x 2 columns]
              0         1
0            AA  138008.0
1            TG  243588.0
2            CG  334072.0
3            GG  295170.0
4            TT  141397.0
...         ...       ...
87371  CGACGAAC     100.0
87372  CGACGAAA      87.0
87373  CGACCTTT      36.0
87374  CGACGATT      50.0
87375  TTTTTTTT      34.0

[87376 rows x 2 columns]
          

              0         1
0            AA  274168.0
1            TG  345416.0
2            CG  355017.0
3            GG  313663.0
4            TT  276892.0
...         ...       ...
87371  CGACGAAC      92.0
87372  CGACGAAA      86.0
87373  CGACCTTT      53.0
87374  CGACGATT      82.0
87375  TTTTTTTT     136.0

[87376 rows x 2 columns]
              0         1
0            AA  273249.0
1            TG  415496.0
2            CG  661416.0
3            GG  429470.0
4            TT  275895.0
...         ...       ...
87371  CGACGAAC     268.0
87372  CGACGAAA     192.0
87373  CGACCTTT      81.0
87374  CGACGATT     234.0
87375  TTTTTTTT      18.0

[87376 rows x 2 columns]
              0       1
0            AA  275566
1            TT  275604
2            TC  226058
3            TA  158504
4            GT  242263
...         ...     ...
87371  CCCCCCGG       2
87372  CCCCCCGT       2
87373  CCCCCCTA       2
87374  CTTTTTTG     191
87375  TTTTTTTT      67

[87376 rows x 2 columns]
          

              0       1
0            AA  120825
1            TT  119611
2            TC   45020
3            TA  124213
4            GT   42325
...         ...     ...
87371  CCCCCCGG       1
87372  CCCCCCGT       1
87373  CCCCCCTA       1
87374  CTTTTTTG      33
87375  TTTTTTTT     185

[87376 rows x 2 columns]
              0         1
0            AA  279939.0
1            TG  334222.0
2            GG  350314.0
3            AC  255969.0
4            GA  286647.0
...         ...       ...
87372  CTACTGAA      28.0
87373  CTACTCTT      23.0
87374  CTACTCTG      30.0
87375  CTACTCTA       8.0
87376       NaN      71.0

[87377 rows x 2 columns]
              0         1
0            AA   93932.0
1            GG  185993.0
2            GA  177593.0
3            NN   16441.0
4            NT       2.0
...         ...       ...
87689  CGACTAAA       4.0
87690  CGACGTTT      53.0
87691  CGACTATT      13.0
87692  TTTTTTTT       4.0
87693       NaN       1.0

[87694 rows x 2 columns]
          

              0       1
0            AA  201721
1            TT  204407
2            TC  113990
3            TA  134487
4            GT  107740
...         ...     ...
87371  CCCCCCGG       0
87372  CCCCCCGT       3
87373  CCCCCCTA       6
87374  CTTTTTTG     163
87375  TTTTTTTT     132

[87376 rows x 2 columns]
              0         1
0            AA  149324.0
1            TG  322116.0
2            CG  614062.0
3            GG  388500.0
4            TT  148090.0
...         ...       ...
87371  CGACGAAC     286.0
87372  CGACGAAA     146.0
87373  CGACCTTT      38.0
87374  CGACGATT     108.0
87375  TTTTTTTT       5.0

[87376 rows x 2 columns]
              0         1
0            AA  154278.0
1            TG  313604.0
2            CG  761159.0
3            GG  495250.0
4            TT  155526.0
...         ...       ...
87371  CGACGAAC     304.0
87372  CGACGAAA     132.0
87373  CGACCTTT      58.0
87374  CGACGATT     107.0
87375  TTTTTTTT       4.0

[87376 rows x 2 columns]
          

FileNotFoundError: [Errno 2] No such file or directory: ''

In [30]:
csv_data(dir_in, sub_dir, names[0])

'Methyloligella_chr_k2_8_chr.csv'

In [78]:
def get_names(filename):
    """
    Reads a text file and returns a list of names.
    
    Inputs:
        filename - a string representing a text file with one name
                   by row.
    Outputs:
        names - a list-like object representin a list of names.
    """
    names = []
    with open(filename, 'r') as fh:
        for line in fh:
            sp = line.strip()
            names.append(sp)
    return names

In [82]:
one_csv = defaultdict(int)

with open('number_files_genus2.txt', 'r') as fh:
    for line in fh:
        num, name = int(line.strip().split(',')[0]), line.strip().split(',')[1]
        if num == 1:
            one_csv[name] = one_csv.get(name, 0) + num

In [103]:
name_one_csvs[0]

'Acidothermus'

In [89]:
name_one_csvs = list(one_csv.keys())
dir_in = 'Results'
sub_dir = 'kmer_counts'
ext = 'csv'
csv_one_files = defaultdict(list, [(n, []) for n in name_one_csvs])
for name in name_one_csvs:
        csv_paths = glob.glob(f'{dir_in}/{sub_dir}/{name}/*.{ext}')
        csv_one_files[name] += csv_paths

In [91]:
dir_out = os.path.join(dir_in, sub_dir)

In [105]:
for name in name_one_csvs:
    for filename in csv_one_files[name]:
        df = pd.read_csv(filename, 
                         header=None, 
                         names=['kmer', 
                                'count']).sort_values(by="kmer",
                                                      key=lambda x: x.str.len()).reset_index(drop=True)
        # Results/kmer_counts/name
        dir_out = os.path.join(dir_in, sub_dir, name)
        #name_k2_8_chr.csv
        csv_name = f'{name}_k2_8_chr.csv'
        if os.path.exists(dir_out):
            pass
        else:
            os.makedirs(dir_out)
        df.to_csv(f'{dir_out}/{csv_name}', header=False, index=False)    

In [38]:
def get_files_paths(dir_name, sub_dir_name, text_file, ext):
    """Returns a dictionary like object using baterial genera
    as keys and a list of path to fasta fileas as values.
    Inputs:
        dir_name: directory name
        sub_dir_name : sub directory name
    Outputs:
        dictionary: with genera name and a list of 
                    files.
    
    Example:
    dir is like: Data/bacteria_splitted/Mycolicibacillus/chromosomes
                 Data/bacteria_splitted/Mycolicibacillus/plasmids (if the case)
    dirname: 'Data/bacteria_splitted'
    sub_dir: 'chromosomes'
    fasta_dicts = get_files(dir_name, 'chromosomes')
    fasta_dicts['Mycolicibacillus']
    ['NZ_AP022594.1_Mycolicibacillus_koreensis_strain_JCM_19956.fna.gz']
    """
    # create a list of file and sub directories
    # names in the given directory
    spc_names = get_species_name(text_file)
    csv_files = defaultdict(list, [(n, []) for n in spc_names])
    for name in spc_names:
        csv_paths = glob.glob(f'{dir_name}/{sub_dir_name}/{name}/*.{ext}')
        csv_files[name] += csv_paths
    return csv_files

In [40]:
csv_paths = get_files_paths('Results', 'kmer_counts', 'number_files_genus.txt', 'csv')

In [80]:
csv_paths

defaultdict(list,
            {'Acidiphilium': ['Results/kmer_counts/Acidiphilium/GCF_000016725.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidiphilium/GCF_000202835.1_chr_k2_8_chr.csv'],
             'Acidipropionibacterium': ['Results/kmer_counts/Acidipropionibacterium/GCF_001602115.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_004011075.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_004011055.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_001441165.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_000310065.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_900637925.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_005890155.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_003956085.1_chr_k2_8_chr.csv',
              'Results/kmer_c

In [71]:
def concatenation_dfs(spc_name, dict_csvs_paths):
    csvs = dict_csvs_paths[spc_name]
    concat_df = pd.DataFrame()
    for csv in csvs:
        temp_df = pd.read_csv(csv, header=None, names=['kmer', 'count']).set_index("kmer", drop=True)
        concat_df = pd.DataFrame(pd.concat([concat_df,
                                            temp_df],
                                           axis=1,
                                           join='outer',
                                           copy=False).sort_index().mean(axis=1))
        del temp_df
    concat_df[0] = concat_df[0].astype('float64').round()
    return concat_df.reset_index().rename(columns={'index': 'kmer', 0: 'count'})

In [72]:
df = concatenation_dfs('Acidiphilium', 
                       csv_paths).sort_values(by="kmer", 
                                              key=lambda x: x.str.len()).reset_index(drop=True)

In [73]:
df

Unnamed: 0,kmer,count
0,AA,101610.0
1,TG,191494.0
2,CG,522618.0
3,GG,326533.0
4,TT,102896.0
...,...,...
87371,CGACGAAC,196.0
87372,CGACGAAA,114.0
87373,CGACCTTT,36.0
87374,CGACGATT,104.0


In [74]:
df.to_csv('test.csv', header=None, index=False)

In [125]:
file_amb = 'ambiguous.txt'
names_ambigous = defaultdict(list)

with open(file_amb, 'r') as fh:
    for line in fh:
        p_name, num = line.strip().split(':')[0], int(line.strip().split(':')[1])
        name = line.strip().split('/')[0]
        if num > 0 and name in p_name:
            names_ambigous[name] = names_ambigous.get(name, []) + [os.path.join(dir_in, sub_dir, p_name)]

In [147]:
for name, filename in names_ambigous.items():
    for data in filename:
        print(data)

Results/kmer_counts/Acinetobacter/Acinetobacter_k2_8_chr.csv
Results/kmer_counts/Acinetobacter/GCF_000018445.1_chr_k2_8_chr.csv
Results/kmer_counts/Acinetobacter/GCF_006351765.1_chr_k2_8_chr.csv
Results/kmer_counts/Acinetobacter/GCF_009931315.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/Aeromonas_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_000633175.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_000635955.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902695.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902715.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902735.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902775.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902795.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902815.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902855.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902875.1_chr_k2_8_chr.csv
Results/kmer_counts/Aeromonas/GCF_016902915.1_chr_k2_8_

In [None]:
def get_kmer_frequency(csv_file):
    tot = 0
    freq = defaultdict(float)
    with open(csv_file, 'r') as fh:
        data = csv.reader(fh)
        for row in data:
            kmer, cnt = row[0], float(row[1])
            freq[kmer] = freq.get(kmer, 0.0) + cnt
            tot += cnt
    return {k: cnt / tot for k, cnt in freq.items()}

In [149]:
glob.glob('Results/kmer_counts/*/*_k2_8_chr.csv')

['Results/kmer_counts/Methyloligella/Methyloligella_k2_8_chr.csv',
 'Results/kmer_counts/Methyloligella/GCF_013341275.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_001721185.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_000968175.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_014295015.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_003575005.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_000027225.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_000973125.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_000968195.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_000953355.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_017743015.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/Xenorhabdus_k2_8_chr.csv',
 'Results/kmer_counts/Xenorhabdus/GCF_000252955.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Undibacterium/GCF_009937955.1_chr_k2_8_chr.csv',
 'Results/kmer_counts/Undibacterium/GCF

In [144]:
genus_names = list(names_ambigous.keys())
clean_data = defaultdict(int)
for name in genus_names:
    for filename in names_ambigous[name]:
        print(filename)
        with open(filename, 'r') as f:
            csv_file = csv.reader(f)
            for row in csv_file:
                kmer, cnt = row[0], int(row[1])
                if set(kmer).issubset(set('ACGT')):
                    clean_data[kmer] 
            print(row)

Results/kmer_counts/Acinetobacter/Acinetobacter_k2_8_chr.csv


ValueError: invalid literal for int() with base 10: '440897.0'

In [117]:
data = defaultdict(list, [(g, []) for g in genus])

for name in genus:
    if name in names_ambigous:
        data[name].append()
os.path.join('Results/kmer_counts'/p)

In [118]:
dict(zip(genus

['Acinetobacter',
 'Aeromonas',
 'Alteromonas',
 'Anaplasma',
 'Arthrobacter',
 'Bacillus',
 'Bacteroides',
 'Bdellovibrio',
 'Bifidobacterium',
 'Bordetella',
 'Borreliella',
 'Brevundimonas',
 'Brucella',
 'Burkholderia',
 'Campylobacter',
 'Candidatus_Endolissoclinum',
 'Carboxydothermus',
 'Chlamydia',
 'Chlorobaculum',
 'Chromobacterium',
 'Citrobacter',
 'Clostridium',
 'Collimonas',
 'Corynebacterium',
 'Cupriavidus',
 'Dehalobacter',
 'Dehalococcoides',
 'Deinococcus',
 'Dichelobacter',
 'Dickeya',
 'Ectothiorhodospira',
 'Edwardsiella',
 'Enterobacter',
 'Enterococcus',
 'Erwinia',
 'Escherichia',
 'Flavobacterium',
 'Francisella',
 'Gallibacterium',
 'Gardnerella',
 'Geobacillus',
 'Halomonas',
 'Helicobacter',
 'Herbaspirillum',
 'Histophilus',
 'Klebsiella',
 'Komagataeibacter',
 'Lacticaseibacillus',
 'Lactiplantibacillus',
 'Lactobacillus',
 'Lactococcus',
 'Lawsonella',
 'Leclercia',
 'Legionella',
 'Lentilitoribacter',
 'Magnetospirillum',
 'Martelella',
 'Megasphaera',

In [None]:
filenames = ['Downloads/Data/GCF_000016725.1_chr.fna']
for filename in filenames:
    name = filename.split('/')[2]
    print(filename)
    subprocess.call(f'wordcount -sequence {filename} -wordsize {sys.argv[1]} -outfile {name}.wordcount.txt')
#     subprocess.call(['wordcount', 
#                      '-sequence', 
#                      filename, 
#                      '-wordsize', 
#                      sys.argv[1]], 
#                     '-outfile', 
#                     name + '.wordcount.txt')


In [27]:
import os
import sys
import time
import argparse
import glob
import csv
from collections import defaultdict
from itertools import product

In [13]:
names = sorted([])
with open('names_mean.txt', 'r') as fh:
    for line in fh:
        num_file, name = int(line.strip().split(',')[0]), line.strip().split(',')[1]
        if num_file > 1:
            names.append(name)

In [24]:
csv_dic = defaultdict(list, [(n, []) for n in names])
ext = 'csv'
for name in names:
    files = glob.glob(f'Results/kmer_counts/{name}/*.{ext}')
    csv_dic[name] += files

In [30]:
csv_dic

defaultdict(list,
            {'Acidiphilium': ['Results/kmer_counts/Acidiphilium/GCF_000016725.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidiphilium/GCF_000202835.1_chr_k2_8_chr.csv'],
             'Acidipropionibacterium': ['Results/kmer_counts/Acidipropionibacterium/GCF_001602115.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_004011075.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_004011055.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_001441165.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_000310065.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_900637925.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_005890155.1_chr_k2_8_chr.csv',
              'Results/kmer_counts/Acidipropionibacterium/GCF_003956085.1_chr_k2_8_chr.csv',
              'Results/kmer_c

In [26]:
def get_all_possible_kmers(alphabet, kmin, kmax):
    """Returns a list of all possible combinations of k-mers of
    length k from a input alphabet.

    Inputs:

        alphabet - a alphabet (strings characters) that compound the string sequence
        kmin - minimum DNA kmer length (int)
        kmax - maximum DNA kmer length (int)

    Outputs:

        kmers - list of all possible combinations of k-mers of length k with length
                between kmin and kmax.

    """
    kmers = [''.join(letters) for n in range(kmin, kmax + 1)
             for letters in product(alphabet, repeat=n)]
    return kmers

In [28]:
km_list = get_all_possible_kmers('ACGT', 2, 8)

In [47]:
assert len(km_list) == (4**2+4**3+4**4+4**5+4**6+4**7+4**8)

In [33]:
len(csv_dic['Acidiphilium'])

2

In [45]:
def get_kmer_counts_from_csvs(dir_in, name, kmer_list, filenames_dict):
    num_files = len(filenames_dict[name])
    counts = defaultdict(int, [(k, 0) for k in kmer_list])
    for filename in filenames_dict[name]:
        with open(filename, 'r') as fh:
            data = csv.reader(fh)
            for row in data:
                kmer, cnt = row[0], int(row[1])
                if kmer in kmer_list:
                    counts[kmer] += cnt
    return {k: round(cnt / num_files) for k, cnt in counts.items()}

In [48]:
acid = get_kmer_counts_from_csvs('Results/kmer_counts', 'Acidiphilium', km_list, csv_dic)

2


In [51]:
assert len(acid) == (4**2+4**3+4**4+4**5+4**6+4**7+4**8)

In [52]:
%timeit get_kmer_counts_from_csvs('Results/kmer_counts', 'Acidiphilium', km_list, csv_dic)

2
2
2
2
1 loop, best of 3: 58 s per loop


In [54]:
%timeit

!csvjoin -c 1  -H Results/kmer_counts/Acidiphilium/GCF_000202835.1_chr_k2_8_chr.csv Results/kmer_counts/Acidiphilium/GCF_000016725.1_chr_k2_8_chr.csv

a,b,b2
AA,108207,95013
AC,159733,142883
AG,169404,151989
AT,168204,150222
CA,202003,181260
CC,352513,322931
CG,545043,500192
CT,174748,156877
GA,250779,225218
GC,508724,466006
GG,342370,310696
GT,157641,141248
TA,44559,38616
TC,253337,229440
TG,202697,180291
TT,109448,96344
AAA,20523,17660
AAC,32594,28903
AAG,32136,28285
AAT,22953,20164
ACA,25047,22260
ACC,59887,54197
ACG,59826,53554
ACT,14973,12872
AGA,31837,27902
AGC,67993,61329
AGG,55037,50140
AGT,14537,12618
ATA,15038,13221
ATC,76923,69251
ATG,53124,47595
ATT,23119,20155
CAA,30469,26465
CAC,51361,46282
CAG,67213,60692
CAT,52960,47821
CCA,57649,51497
CCC,69974,64527
CCG,167377,154319
CCT,57513,52588
CGA,120720,110040
CGC,200954,185738
CGG,163668,150252
CGT,59701,54162
CTA,11528,10014
CTC,60849,55586
CTG,68811,61432
CTT,33560,29845
GAA,51665,46257
GAC,63006,56513
GAG,59157,53513
GAT,76951,68935
GCA,75911,68729
GCC,166678,153694
GCG,196850,181414
GCT,69285,62169
GGA,55161,49209

TCGCCC,4527,4234
TCGCCG,10124,9411
TCGCCT,2487,2340
TCGCGA,2783,2535
TCGCGC,5665,5423
TCGCGG,4465,4203
TCGCGT,1105,1012
TCGCTA,164,133
TCGCTC,1283,1149
TCGCTG,1504,1332
TCGCTT,456,390
TCGGAA,727,630
TCGGAC,662,628
TCGGAG,577,526
TCGGAT,728,635
TCGGCA,3656,3361
TCGGCC,5154,4849
TCGGCG,8514,7918
TCGGCT,1950,1741
TCGGGA,977,845
TCGGGC,2626,2373
TCGGGG,1206,1085
TCGGGT,800,715
TCGGTA,332,260
TCGGTC,1796,1603
TCGGTG,1742,1550
TCGGTT,461,413
TCGTAA,215,189
TCGTAC,181,153
TCGTAG,556,491
TCGTAT,262,224
TCGTCA,1540,1425
TCGTCC,1810,1648
TCGTCG,4164,3830
TCGTCT,1070,958
TCGTGA,876,802
TCGTGC,2093,1915
TCGTGG,1116,1035
TCGTGT,535,466
TCGTTA,133,106
TCGTTC,978,846
TCGTTG,801,703
TCGTTT,275,219
TCTAAA,51,45
TCTAAC,43,27
TCTAAG,28,23
TCTAAT,41,34
TCTACA,334,309
TCTACC,461,414
TCTACG,656,608
TCTACT,130,118
TCTAGA,37,31
TCTAGC,64,46
TCTAGG,30,21
TCTAGT,52,23
TCTATA,116,107
TCTATC,427,378
TCTATG,453,403
TCTATT,227,201
TCTCAA,282,240
TCTCAC,259

CTAGGGG,7,4
CTAGGGT,6,4
CTAGGTA,5,1
CTAGGTC,11,8
CTAGGTG,3,3
CTAGGTT,4,3
CTAGTAA,4,4
CTAGTAC,2,2
CTAGTAG,9,4
CTAGTAT,1,2
CTAGTCA,4,6
CTAGTCC,13,11
CTAGTCG,19,9
CTAGTCT,12,11
CTAGTGA,6,5
CTAGTGC,7,8
CTAGTGG,11,7
CTAGTGT,23,13
CTAGTTA,2,2
CTAGTTC,8,8
CTAGTTG,7,7
CTAGTTT,16,12
CTATAAA,10,9
CTATAAC,35,28
CTATAAG,8,8
CTATAAT,12,13
CTATACA,9,11
CTATACC,87,87
CTATACG,61,59
CTATACT,8,4
CTATAGA,9,9
CTATAGC,22,15
CTATAGG,18,11
CTATAGT,5,9
CTATATA,8,11
CTATATC,156,140
CTATATG,30,28
CTATATT,24,18
CTATCAA,29,20
CTATCAC,118,106
CTATCAG,89,75
CTATCAT,55,54
CTATCCA,21,17
CTATCCC,107,85
CTATCCG,219,195
CTATCCT,34,25
CTATCGA,39,29
CTATCGC,275,257
CTATCGG,90,75
CTATCGT,41,33
CTATCTA,11,6
CTATCTC,391,350
CTATCTG,180,155
CTATCTT,31,29
CTATGAA,50,52
CTATGAC,152,138
CTATGAG,68,61
CTATGAT,45,35
CTATGCA,33,27
CTATGCC,373,344
CTATGCG,216,188
CTATGCT,34,18
CTATGGA,28,23
CTATGGC,348,310
CTATGGG,41,33
CTATGGT,38,33
CTATGTA,6,7
CTATGTC,269,256
CTAT

TGTGGGT,71,54
TGTGGTA,38,35
TGTGGTC,134,107
TGTGGTG,137,122
TGTGGTT,78,72
TGTGTAA,8,8
TGTGTAC,5,7
TGTGTAG,15,6
TGTGTAT,18,7
TGTGTCA,32,28
TGTGTCC,37,29
TGTGTCG,73,72
TGTGTCT,29,28
TGTGTGA,17,22
TGTGTGC,25,18
TGTGTGG,56,42
TGTGTGT,16,10
TGTGTTA,12,10
TGTGTTC,84,66
TGTGTTG,36,33
TGTGTTT,28,17
TGTTAAA,9,8
TGTTAAC,11,13
TGTTAAG,19,14
TGTTAAT,12,10
TGTTACA,11,14
TGTTACC,22,17
TGTTACG,25,19
TGTTACT,6,9
TGTTAGA,8,9
TGTTAGC,11,12
TGTTAGG,7,1
TGTTAGT,10,8
TGTTATA,6,3
TGTTATC,29,29
TGTTATG,16,10
TGTTATT,15,11
TGTTCAA,161,151
TGTTCAC,238,215
TGTTCAG,179,158
TGTTCAT,247,221
TGTTCCA,233,211
TGTTCCC,139,127
TGTTCCG,337,292
TGTTCCT,248,229
TGTTCGA,547,484
TGTTCGC,486,465
TGTTCGG,540,481
TGTTCGT,227,212
TGTTCTA,49,41
TGTTCTC,173,157
TGTTCTG,203,180
TGTTCTT,172,145
TGTTGAA,118,98
TGTTGAC,119,100
TGTTGAG,107,95
TGTTGAT,162,137
TGTTGCA,90,90
TGTTGCC,261,237
TGTTGCG,242,211
TGTTGCT,97,91
TGTTGGA,54,43
TGTTGGC,153,137
TGTTGGG,66,55
TGTTGGT,111

ACTCTCGC,29,24
ACTCTCGG,35,36
ACTCTCGT,8,6
ACTCTCTA,2,4
ACTCTCTC,5,6
ACTCTCTG,5,5
ACTCTCTT,2,2
ACTCTGAA,8,6
ACTCTGAC,6,1
ACTCTGAG,4,0
ACTCTGAT,5,6
ACTCTGCA,9,11
ACTCTGCC,21,19
ACTCTGCG,12,14
ACTCTGCT,12,6
ACTCTGGA,18,12
ACTCTGGC,27,24
ACTCTGGG,6,7
ACTCTGGT,10,7
ACTCTGTA,0,0
ACTCTGTC,9,5
ACTCTGTG,3,2
ACTCTGTT,8,7
ACTCTTAA,3,2
ACTCTTAC,2,2
ACTCTTAG,1,1
ACTCTTAT,2,1
ACTCTTCA,10,11
ACTCTTCC,11,14
ACTCTTCG,21,20
ACTCTTCT,10,9
ACTCTTGA,5,7
ACTCTTGC,16,4
ACTCTTGG,8,4
ACTCTTGT,5,2
ACTCTTTA,5,5
ACTCTTTC,7,7
ACTCTTTG,3,1
ACTCTTTT,3,6
ACTGAAAA,11,9
ACTGAAAC,13,9
ACTGAAAG,8,3
ACTGAAAT,14,11
ACTGAACA,13,8
ACTGAACC,22,16
ACTGAACG,32,19
ACTGAACT,6,7
ACTGAAGA,19,13
ACTGAAGC,17,19
ACTGAAGG,20,15
ACTGAAGT,9,6
ACTGAATA,6,5
ACTGAATC,7,3
ACTGAATG,11,8
ACTGAATT,6,6
ACTGACAA,9,7
ACTGACAC,9,10
ACTGACAG,10,7
ACTGACAT,17,15
ACTGACCA,23,20
ACTGACCC,28,19
ACTGACCG,61,53
ACTGACCT,13,13
ACTGACGA,18,21
ACTGACGC,30,32
ACTGACGG,34,30
ACTGACGT,8,7
ACTGA

CAACATCA,65,64
CAACATCC,71,62
CAACATCG,123,115
CAACATCT,46,39
CAACATGA,59,52
CAACATGC,54,48
CAACATGG,56,56
CAACATGT,19,16
CAACATTA,5,7
CAACATTC,22,20
CAACATTG,14,11
CAACATTT,8,9
CAACCAAA,9,5
CAACCAAC,12,7
CAACCAAG,7,6
CAACCAAT,15,14
CAACCACA,36,33
CAACCACC,54,42
CAACCACG,56,49
CAACCACT,21,12
CAACCAGA,56,52
CAACCAGC,99,83
CAACCAGG,71,67
CAACCAGT,30,34
CAACCATA,16,9
CAACCATC,40,36
CAACCATG,36,34
CAACCATT,12,13
CAACCCAA,8,7
CAACCCAC,16,13
CAACCCAG,10,11
CAACCCAT,25,15
CAACCCCA,23,25
CAACCCCC,29,26
CAACCCCG,115,100
CAACCCCT,38,38
CAACCCGA,97,80
CAACCCGC,154,150
CAACCCGG,126,115
CAACCCGT,50,49
CAACCCTA,9,6
CAACCCTC,20,32
CAACCCTG,47,43
CAACCCTT,14,11
CAACCGAA,28,20
CAACCGAC,29,18
CAACCGAG,32,22
CAACCGAT,31,22
CAACCGCA,95,88
CAACCGCC,159,144
CAACCGCG,187,163
CAACCGCT,81,76
CAACCGGA,80,62
CAACCGGC,189,166
CAACCGGG,91,70
CAACCGGT,39,33
CAACCGTA,5,9
CAACCGTC,41,37
CAACCGTG,40,32
CAACCGTT,15,13
CAACCTAA,3,3
CAACCTAC,6,2
CAACCTAG,0,2


CGACGCGT,67,70
CGACGCTA,12,7
CGACGCTC,144,133
CGACGCTG,351,306
CGACGCTT,42,34
CGACGGAA,67,63
CGACGGAC,73,63
CGACGGAG,91,64
CGACGGAT,91,90
CGACGGCA,291,233
CGACGGCC,262,256
CGACGGCG,791,704
CGACGGCT,104,101
CGACGGGA,96,89
CGACGGGC,278,249
CGACGGGG,148,124
CGACGGGT,146,76
CGACGGTA,25,22
CGACGGTC,139,141
CGACGGTG,249,216
CGACGGTT,57,49
CGACGTAA,18,14
CGACGTAC,6,4
CGACGTAG,36,35
CGACGTAT,22,16
CGACGTCA,109,107
CGACGTCC,74,75
CGACGTCG,256,245
CGACGTCT,63,67
CGACGTGA,141,135
CGACGTGC,198,192
CGACGTGG,128,113
CGACGTGT,68,63
CGACGTTA,8,6
CGACGTTC,102,99
CGACGTTG,74,55
CGACGTTT,30,20
CGACTAAA,6,2
CGACTAAC,1,1
CGACTAAG,1,0
CGACTAAT,1,0
CGACTACA,51,53
CGACTACC,39,39
CGACTACG,83,69
CGACTACT,9,8
CGACTAGA,2,2
CGACTAGC,4,1
CGACTAGG,4,6
CGACTAGT,2,3
CGACTATA,13,20
CGACTATC,59,53
CGACTATG,59,59
CGACTATT,33,29
CGACTCAA,10,9
CGACTCAC,7,9
CGACTCAG,13,10
CGACTCAT,14,11
CGACTCCA,18,17
CGACTCCC,39,31
CGACTCCG,53,53
CGACTCCT,30,30
CGACTCGA,61,48
CG

GACCAGGG,142,126
GACCAGGT,109,101
GACCAGTA,28,21
GACCAGTC,48,47
GACCAGTG,41,37
GACCAGTT,89,73
GACCATAA,6,5
GACCATAC,23,17
GACCATAG,11,10
GACCATAT,19,15
GACCATCA,110,105
GACCATCC,103,96
GACCATCG,234,204
GACCATCT,92,96
GACCATGA,113,93
GACCATGC,185,179
GACCATGG,87,80
GACCATGT,88,81
GACCATTA,9,8
GACCATTC,40,34
GACCATTG,27,24
GACCATTT,31,32
GACCCAAA,12,4
GACCCAAC,11,14
GACCCAAG,12,9
GACCCAAT,15,7
GACCCACA,17,19
GACCCACC,28,27
GACCCACG,30,32
GACCCACT,7,3
GACCCAGA,60,43
GACCCAGC,69,55
GACCCAGG,79,82
GACCCAGT,26,19
GACCCATA,13,16
GACCCATC,43,42
GACCCATG,51,44
GACCCATT,25,21
GACCCCAA,8,13
GACCCCAC,31,24
GACCCCAG,26,19
GACCCCAT,22,15
GACCCCCA,22,17
GACCCCCC,12,10
GACCCCCG,42,39
GACCCCCT,32,24
GACCCCGA,150,141
GACCCCGC,180,181
GACCCCGG,106,82
GACCCCGT,43,38
GACCCCTA,12,10
GACCCCTC,18,12
GACCCCTG,30,19
GACCCCTT,24,21
GACCCGAA,57,46
GACCCGAC,70,55
GACCCGAG,34,30
GACCCGAT,90,80
GACCCGCA,151,132
GACCCGCC,202,185
GACCCGCG,210,206
GACCCGCT,13

GGACGATT,46,36
GGACGCAA,32,23
GGACGCAC,30,25
GGACGCAG,68,54
GGACGCAT,36,33
GGACGCCA,94,69
GGACGCCC,72,60
GGACGCCG,276,237
GGACGCCT,71,61
GGACGCGA,99,85
GGACGCGC,221,210
GGACGCGG,245,214
GGACGCGT,43,30
GGACGCTA,23,15
GGACGCTC,47,46
GGACGCTG,71,69
GGACGCTT,25,23
GGACGGAA,62,45
GGACGGAC,49,39
GGACGGAG,41,42
GGACGGAT,71,50
GGACGGCA,171,167
GGACGGCC,155,127
GGACGGCG,355,313
GGACGGCT,92,84
GGACGGGA,75,63
GGACGGGC,200,186
GGACGGGG,121,96
GGACGGGT,79,79
GGACGGTA,16,12
GGACGGTC,64,56
GGACGGTG,111,88
GGACGGTT,43,34
GGACGTAA,12,12
GGACGTAC,5,2
GGACGTAG,22,22
GGACGTAT,16,16
GGACGTCA,45,39
GGACGTCC,32,26
GGACGTCG,94,83
GGACGTCT,20,22
GGACGTGA,60,71
GGACGTGC,82,74
GGACGTGG,66,55
GGACGTGT,21,15
GGACGTTA,6,5
GGACGTTC,42,40
GGACGTTG,38,28
GGACGTTT,20,19
GGACTAAA,2,2
GGACTAAC,1,2
GGACTAAG,3,3
GGACTAAT,1,2
GGACTACA,13,9
GGACTACC,18,11
GGACTACG,23,23
GGACTACT,5,4
GGACTAGA,2,2
GGACTAGC,8,4
GGACTAGG,3,3
GGACTAGT,0,0
GGACTATA,9,7
GGACTATC,20,18
G

GTGCCTGG,46,36
GTGCCTGT,27,22
GTGCCTTA,5,3
GTGCCTTC,50,48
GTGCCTTG,37,25
GTGCCTTT,15,12
GTGCGAAA,29,22
GTGCGAAC,30,22
GTGCGAAG,30,20
GTGCGAAT,34,33
GTGCGACA,40,38
GTGCGACC,63,52
GTGCGACG,91,59
GTGCGACT,20,17
GTGCGAGA,47,41
GTGCGAGC,49,37
GTGCGAGG,75,66
GTGCGAGT,12,11
GTGCGATA,23,21
GTGCGATC,72,58
GTGCGATG,60,46
GTGCGATT,14,11
GTGCGCAA,90,90
GTGCGCAC,45,41
GTGCGCAG,148,122
GTGCGCAT,87,66
GTGCGCCA,119,98
GTGCGCCC,88,74
GTGCGCCG,243,214
GTGCGCCT,83,75
GTGCGCGA,290,260
GTGCGCGC,270,242
GTGCGCGG,268,262
GTGCGCGT,81,81
GTGCGCTA,24,23
GTGCGCTC,86,85
GTGCGCTG,93,69
GTGCGCTT,71,69
GTGCGGAA,101,97
GTGCGGAC,86,76
GTGCGGAG,50,48
GTGCGGAT,224,224
GTGCGGCA,160,143
GTGCGGCC,232,215
GTGCGGCG,421,394
GTGCGGCT,168,152
GTGCGGGA,103,93
GTGCGGGC,228,219
GTGCGGGG,74,74
GTGCGGGT,121,114
GTGCGGTA,38,33
GTGCGGTC,73,72
GTGCGGTG,67,71
GTGCGGTT,58,48
GTGCGTAA,4,3
GTGCGTAC,6,3
GTGCGTAG,9,4
GTGCGTAT,12,5
GTGCGTCA,37,30
GTGCGTCC,36,35
GTGCGTCG,72,54
GTGCGT

TCACCTGT,20,19
TCACCTTA,5,6
TCACCTTC,187,168
TCACCTTG,90,89
TCACCTTT,19,20
TCACGAAA,48,54
TCACGAAC,69,66
TCACGAAG,67,60
TCACGAAT,30,31
TCACGACA,39,37
TCACGACC,102,83
TCACGACG,84,73
TCACGACT,15,14
TCACGAGA,45,39
TCACGAGC,39,34
TCACGAGG,73,71
TCACGAGT,4,7
TCACGATA,26,26
TCACGATC,196,180
TCACGATG,101,93
TCACGATT,26,16
TCACGCAA,29,24
TCACGCAC,31,22
TCACGCAG,62,54
TCACGCAT,28,20
TCACGCCA,65,53
TCACGCCC,132,119
TCACGCCG,333,337
TCACGCCT,34,26
TCACGCGA,54,46
TCACGCGC,124,125
TCACGCGG,115,104
TCACGCGT,19,14
TCACGCTA,11,6
TCACGCTC,145,145
TCACGCTG,189,174
TCACGCTT,32,29
TCACGGAA,33,24
TCACGGAC,15,16
TCACGGAG,31,20
TCACGGAT,31,33
TCACGGCA,96,80
TCACGGCC,86,81
TCACGGCG,221,188
TCACGGCT,29,29
TCACGGGA,19,17
TCACGGGC,62,48
TCACGGGG,34,22
TCACGGGT,20,15
TCACGGTA,21,18
TCACGGTC,102,87
TCACGGTG,136,120
TCACGGTT,20,19
TCACGTAA,18,19
TCACGTAC,14,7
TCACGTAG,27,31
TCACGTAT,5,4
TCACGTCA,35,27
TCACGTCC,73,56
TCACGTCG,155,137
TCACGTCT,15,15
TCACGTG

TGGCGATC,427,350
TGGCGATG,392,351
TGGCGATT,64,60
TGGCGCAA,82,69
TGGCGCAC,148,120
TGGCGCAG,307,309
TGGCGCAT,135,121
TGGCGCCA,115,107
TGGCGCCC,147,147
TGGCGCCG,705,631
TGGCGCCT,98,89
TGGCGCGA,225,215
TGGCGCGC,486,444
TGGCGCGG,656,571
TGGCGCGT,97,79
TGGCGCTA,24,24
TGGCGCTC,194,191
TGGCGCTG,479,429
TGGCGCTT,79,76
TGGCGGAA,152,143
TGGCGGAC,171,162
TGGCGGAG,197,186
TGGCGGAT,206,179
TGGCGGCA,294,258
TGGCGGCC,351,329
TGGCGGCG,1055,962
TGGCGGCT,178,151
TGGCGGGA,163,133
TGGCGGGC,360,324
TGGCGGGG,192,181
TGGCGGGT,105,95
TGGCGGTA,36,30
TGGCGGTC,152,153
TGGCGGTG,302,291
TGGCGGTT,77,65
TGGCGTAA,14,17
TGGCGTAC,7,5
TGGCGTAG,59,52
TGGCGTAT,22,18
TGGCGTCA,55,44
TGGCGTCC,66,58
TGGCGTCG,220,194
TGGCGTCT,24,28
TGGCGTGA,67,63
TGGCGTGC,131,107
TGGCGTGG,126,99
TGGCGTGT,31,26
TGGCGTTA,13,8
TGGCGTTC,142,124
TGGCGTTG,103,105
TGGCGTTT,38,26
TGGCTAAA,3,5
TGGCTAAC,4,3
TGGCTAAG,1,0
TGGCTAAT,2,2
TGGCTACA,28,13
TGGCTACC,19,11
TGGCTACG,18,15
TGGCTACT,4,3
TGGCTA

In [3]:
def get_pur_pyr_counts(sequence):
    """
    Function to calculate the frequency of purines (A or G) and
    pyrimidines (T, C or U) in a sequence.

    Inputs:
        sequence - a stringt object representing a sequence (DNA,
        RNA).

    Outputs:
        pur - a integer number representing the total number of
              purines in a sequence.
        pyr - a integer number representing the total number of
              pyrimidines in a sequence.
    """
    pur = sequence.count('A') + sequence.count('G')
    pyr = sequence.count('T') + sequence.count('C') + sequence.count('U')
    return pur, pyr

In [5]:
LDICT = dict(zip('ACGTacgt',  range(8)))

In [6]:
def  is_nucleotide(letter): 
    return  letter  in  LDICT 

In [7]:
is_nucleotide('A')

True

In [12]:
{(a,  b):  (LDICT[a],  LDICT[b], LDICT[c]) for  a,  b, c  in  product(LDICT,  LDICT, LDICT)}

{('A', 'A'): (0, 0, 7),
 ('A', 'C'): (0, 1, 7),
 ('A', 'G'): (0, 2, 7),
 ('A', 'T'): (0, 3, 7),
 ('A', 'a'): (0, 4, 7),
 ('A', 'c'): (0, 5, 7),
 ('A', 'g'): (0, 6, 7),
 ('A', 't'): (0, 7, 7),
 ('C', 'A'): (1, 0, 7),
 ('C', 'C'): (1, 1, 7),
 ('C', 'G'): (1, 2, 7),
 ('C', 'T'): (1, 3, 7),
 ('C', 'a'): (1, 4, 7),
 ('C', 'c'): (1, 5, 7),
 ('C', 'g'): (1, 6, 7),
 ('C', 't'): (1, 7, 7),
 ('G', 'A'): (2, 0, 7),
 ('G', 'C'): (2, 1, 7),
 ('G', 'G'): (2, 2, 7),
 ('G', 'T'): (2, 3, 7),
 ('G', 'a'): (2, 4, 7),
 ('G', 'c'): (2, 5, 7),
 ('G', 'g'): (2, 6, 7),
 ('G', 't'): (2, 7, 7),
 ('T', 'A'): (3, 0, 7),
 ('T', 'C'): (3, 1, 7),
 ('T', 'G'): (3, 2, 7),
 ('T', 'T'): (3, 3, 7),
 ('T', 'a'): (3, 4, 7),
 ('T', 'c'): (3, 5, 7),
 ('T', 'g'): (3, 6, 7),
 ('T', 't'): (3, 7, 7),
 ('a', 'A'): (4, 0, 7),
 ('a', 'C'): (4, 1, 7),
 ('a', 'G'): (4, 2, 7),
 ('a', 'T'): (4, 3, 7),
 ('a', 'a'): (4, 4, 7),
 ('a', 'c'): (4, 5, 7),
 ('a', 'g'): (4, 6, 7),
 ('a', 't'): (4, 7, 7),
 ('c', 'A'): (5, 0, 7),
 ('c', 'C'): (5,

In [9]:
PDICT  =  {(a,  b):  (LDICT[a],  LDICT[b]) for  a,  b  in  product(LDICT,  LDICT)}

In [10]:
PDICT

{('A', 'A'): (0, 0),
 ('A', 'C'): (0, 1),
 ('A', 'G'): (0, 2),
 ('A', 'T'): (0, 3),
 ('A', 'a'): (0, 4),
 ('A', 'c'): (0, 5),
 ('A', 'g'): (0, 6),
 ('A', 't'): (0, 7),
 ('C', 'A'): (1, 0),
 ('C', 'C'): (1, 1),
 ('C', 'G'): (1, 2),
 ('C', 'T'): (1, 3),
 ('C', 'a'): (1, 4),
 ('C', 'c'): (1, 5),
 ('C', 'g'): (1, 6),
 ('C', 't'): (1, 7),
 ('G', 'A'): (2, 0),
 ('G', 'C'): (2, 1),
 ('G', 'G'): (2, 2),
 ('G', 'T'): (2, 3),
 ('G', 'a'): (2, 4),
 ('G', 'c'): (2, 5),
 ('G', 'g'): (2, 6),
 ('G', 't'): (2, 7),
 ('T', 'A'): (3, 0),
 ('T', 'C'): (3, 1),
 ('T', 'G'): (3, 2),
 ('T', 'T'): (3, 3),
 ('T', 'a'): (3, 4),
 ('T', 'c'): (3, 5),
 ('T', 'g'): (3, 6),
 ('T', 't'): (3, 7),
 ('a', 'A'): (4, 0),
 ('a', 'C'): (4, 1),
 ('a', 'G'): (4, 2),
 ('a', 'T'): (4, 3),
 ('a', 'a'): (4, 4),
 ('a', 'c'): (4, 5),
 ('a', 'g'): (4, 6),
 ('a', 't'): (4, 7),
 ('c', 'A'): (5, 0),
 ('c', 'C'): (5, 1),
 ('c', 'G'): (5, 2),
 ('c', 'T'): (5, 3),
 ('c', 'a'): (5, 4),
 ('c', 'c'): (5, 5),
 ('c', 'g'): (5, 6),
 ('c', 't'): 

In [13]:
  np.zeros((8,  8))

array([[0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.]])

In [21]:
list(concat(sliding_window(2, 'acagggtatag')))

['a',
 'c',
 'c',
 'a',
 'a',
 'g',
 'g',
 'g',
 'g',
 'g',
 'g',
 't',
 't',
 'a',
 'a',
 't',
 't',
 'a',
 'a',
 'g']

In [39]:
import gzip
import  toolz  as  tz 
from  toolz  import  curried  as  c

In [44]:
dm = 'Data/Genomes_splitted/Thermodesulfatator/Chromosomes/GCF_000217795.1_chr.fna.gz'


# for name, sequence in fasta_parser.parse_fasta(genome):
#     seq = sequence.upper()

In [34]:
def  is_sequence(line): 
    return  not  line.startswith('>') 

def  is_nucleotide(letter): 
    return  letter  in  LDICT   # ignore 'N'

@tz.curry 
def  increment_model(model,  index): 
    model[index]  +=  1

In [25]:
LDICT  =  dict(zip('ACGTacgt',  range(8))) 
PDICT  =  {(a,  b):  (LDICT[a],  LDICT[b]) for  a,  b  in  product(LDICT,  LDICT)}

In [38]:
# def  genome(file_pattern): """Stream a genome, letter by letter, from a list of FASTA filenames.""" 
#     return  tz.pipe(file_pattern,  glob,  sorted,   # Filenames 
#                     c.map(open),   # lines # concatenate lines from all files: 
#                     tz.concat, # drop header from each sequence 
#                     c.filter(is_sequence), # concatenate characters from all lines 
#                     tz.concat, # discard newlines and 'N' 
#                     c.filter(is_nucleotide))

In [41]:
gzopen  =  tz.curry(gzip.open) 

def  genome_gz(file_pattern): 
    """Stream a genome, letter by letter, from a list of FASTA filenames.""" 
    return  tz.pipe(file_pattern,  glob,  sorted,   # Filenames 
                    c.map(gzopen(mode='rt')),   # lines # concatenate lines from all files: 
                    tz.concat, # drop header from each sequence 
                    c.filter(is_sequence),
                    # concatenate characters from all lines 
                    tz.concat, # discard newlines and 'N' 
                    c.filter(is_nucleotide))

In [32]:
def  markov(seq): 
    """Get a 1st-order Markov model from a sequence of nucleotides.""" 
    model  =  np.zeros((8,  8)) 
    # each successive tuple
    tz.last(tz.pipe(seq, c.sliding_window(2),
                    # location in matrix of tuple
                    c.map(PDICT.__getitem__), 
                    # increment matrix
                    c.map(increment_model(model))))    
    # convert counts to transition probability matrix 
    model  /=  np.sum(model,  axis=1)[:,  np.newaxis] 
    return  model

In [45]:
model  =  tz.pipe(dm, genome_gz,  markov)

TypeError: 'module' object is not callable

In [36]:
plot_model(model,  labels='ACGTacgt')

array([[0.38539286, 0.15431858, 0.23059655, 0.22969201, 0.        ,
        0.        , 0.        , 0.        ],
       [0.24694473, 0.27095431, 0.16634877, 0.3157522 , 0.        ,
        0.        , 0.        , 0.        ],
       [0.26647467, 0.25852512, 0.2678409 , 0.2071593 , 0.        ,
        0.        , 0.        , 0.        ],
       [0.23543817, 0.19885188, 0.18047655, 0.3852334 , 0.        ,
        0.        , 0.        , 0.        ],
       [       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan,
               nan,        nan,        nan]])

In [26]:
def  plot_model(model,  labels,  figure=None): 
    fig  =  figure  or  plt.figure() 
    ax  =  fig.add_axes([0.1,  0.1,  0.8,  0.8]) 
    im  =  ax.imshow(model,  cmap='magma'); 
    axcolor  =  fig.add_axes([0.91,  0.1,  0.02,  0.8]) 
    plt.colorbar(im,  cax=axcolor) 
    for  axis  in  [ax.xaxis,  ax.yaxis]: 
        axis.set_ticks(range(8)) 
        axis.set_ticks_position('none') 
        axis.set_ticklabels(labels) 
    return  ax

In [None]:
plot_model(model,  labels='ACGTacgt');

In [22]:
def palindrome_middle_check(palindrome_list, check_middle=1):
    grouped = []
    for pal in palindrome_list:
        # one base in the middle
        if check_middle == 1:
            half = len(pal) // 2
            pref = pal[:half]
            print('p', pref)
            suf = pal[half+1:]
            print('s', suf)
            mid = pal[half]
            print('m', mid)
            grouped.append(pal)
        elif check_middle == 2:
            half = len(pal)
            pref = pal[:half]
            print('p', pref)
            suf = pal[half:]
            print('s', suf)
            mid = pal[half-1:half+2]
            print('m', mid)
    return grouped 

In [4]:
csv_files = glob.glob('Results/kmer_counts/Acidiphilium/Acidiphilium_k2_8_chr.csv')

In [5]:
csv_files

['Results/kmer_counts/Acidiphilium/Acidiphilium_k2_8_chr.csv']

In [7]:
kmer_counts = get_kmer_cnts_from_csv(csv_files[0])

In [27]:
kmer_counts

defaultdict(int,
            {'AA': 101610,
             'AC': 151308,
             'AG': 160696,
             'AT': 159213,
             'CA': 191631,
             'CC': 337722,
             'CG': 522617,
             'CT': 165812,
             'GA': 237998,
             'GC': 487365,
             'GG': 326533,
             'GT': 149444,
             'TA': 41587,
             'TC': 241388,
             'TG': 191494,
             'TT': 102896,
             'AAA': 19091,
             'AAC': 30748,
             'AAG': 30210,
             'AAT': 21558,
             'ACA': 23653,
             'ACC': 57042,
             'ACG': 56690,
             'ACT': 13922,
             'AGA': 29869,
             'AGC': 64661,
             'AGG': 52588,
             'AGT': 13577,
             'ATA': 14129,
             'ATC': 73087,
             'ATG': 50359,
             'ATT': 21637,
             'CAA': 28467,
             'CAC': 48821,
             'CAG': 63952,
             'CAT': 50390,
            

In [28]:
def kmer_frequencies_from_dict(kmer_counts, k):
    kmer_list = groupby(len, grouped)[k]
    km_cnt = defaultdict(int, [(k, 0) for k in kmer_list])
    for kmer in kmer_list:
        km_cnt[kmer] += kmer_counts[kmer]
    total = sum(km_cnt.values())
    return {k: cnt / total for k, cnt in km_cnt.items()}

In [32]:
sum(kmer_frequencies_from_dict(kmer_counts, 4).values())

0.9999999999999998

In [35]:
kmer_frequencies_from_dict(kmer_counts, 4)

{'AAAA': 0.0011520592077743825,
 'AAAC': 0.0013773159205785175,
 'AAAG': 0.0011282447543062835,
 'AAAT': 0.0016911063662758202,
 'AACA': 0.0018905874353262481,
 'AACC': 0.0025456249907193675,
 'AACG': 0.002828876901969343,
 'AACT': 0.0013495790865392023,
 'AAGA': 0.0017650712570473272,
 'AAGC': 0.003081310108731191,
 'AAGG': 0.0028602559465390736,
 'AAGT': 0.0007570194502447426,
 'AATA': 0.0009178370536646101,
 'AATC': 0.0021203268687832017,
 'AATG': 0.0016485205200740434,
 'AATT': 0.0013532212970696175,
 'ACAA': 0.0012697306249108708,
 'ACAC': 0.0014818193457973514,
 'ACAG': 0.001742657653783234,
 'ACAT': 0.002132374180537652,
 'ACCA': 0.0034029453155709263,
 'ACCC': 0.003070663647180747,
 'ACCG': 0.006765265975225684,
 'ACCT': 0.002742304359361784,
 'ACGA': 0.004418561713475142,
 'ACGC': 0.005288769860203555,
 'ACGG': 0.004360846685070103,
 'ACGT': 0.001814381184228332,
 'ACTA': 0.00037766921499996777,
 'ACTC': 0.0011456152968359558,
 'ACTG': 0.001586042600975384,
 'ACTT': 0.00079120

In [34]:
import math

def signif(x, digits=6):
    if x == 0 or not math.isfinite(x):
        return x
    digits -= math.ceil(math.log10(abs(x)))
    return round(x, digits)

In [37]:
signif(0.001742657653783234)

0.00174266

In [None]:
def get_kmer_frequency(csv_file):
    tot = 0
    freq = defaultdict(float)
    with open(csv_file, 'r') as fh:
        data = csv.reader(fh)
        for row in data:
            kmer, cnt = row[0], float(row[1])
            freq[kmer] = freq.get(kmer, 0.0) + cnt
            tot += cnt
    return {k: cnt / tot for k, cnt in freq.items()}

In [None]:
pd.read_csv('Results/kmer_model/Thermovirga/Thermovirga_k4_chr.model.csv', header=0)

In [None]:
for na, seq in fasta_parser.parse_fasta('Data/Genomes_splitted/Acidiphilium/Chromosomes/GCF_000202835.1_chr.fna.gz'):
    seq = seq

In [None]:
file_names = glob.glob(f'Results/kmer_counts/*/*_k2_8_chr.csv')

In [None]:
# def kmer_count(seq, alphabet, k):
#     '''
#     Return a list of the number of times each possible k-mer appears
#     in seq, including overlapping occurrences.
#     '''
#     rv = {}
#     for i in range(0, len(seq) - k + 1):
#         kmer = seq[i:i+k]
#         v = rv.get(kmer, 0)
#         if set(kmer).issubset(alphabet):
#             rv[kmer] = v + 1
#     return rv

In [None]:
d_l = []
cnt = 0
for i, filename in enumerate(fasta_files['Acidiphilium']):
    for name, seq in fasta_parser.parse_fasta(filename):
        data = count_kmers.count_kmers(seq.upper(), 2, 8)
        d_l.append(data)
        cnt += 1

In [None]:
c = Counter()

for d in d_l:
    c.update(d)
c

In [None]:
def get_kmer_counts_from_sequences(filenames, name, kmin, kmax):
    tot = len(filenames[name])
    kmer_dic = Counter()
    for i, filename in enumerate(filenames[name]):
        for Id, seq in fasta_parser.parse_fasta(filename):
            counts = count_kmers.count_kmers(seq.upper(), kmin, kmax)
            kmer_dic.update(counts)
    return {k:int(v/tot) for k, v in kmer_dic.items()}

In [None]:
kmc = get_kmer_counts_from_sequences(fasta_files, 'Methyloligella', 2, 4)

In [None]:
df = pd.DataFrame(kmc.items())

In [None]:
df

In [None]:
new_dict

In [None]:
new_dict2 = {k:int(v/cnt) for d in d_l for (k,v) in d.items()}

In [None]:
201934/5

In [None]:
new_dict2

In [None]:
# from functools import reduce
# result_dict = reduce(lambda a, b: {**a, **b}, d_l)
# result_dict

In [None]:
def get_kmer_counts_from_genomes(filenames, name, kmin, kmax):
    dic_list = []
    IDS = []
    for i, filename in enumerate(filenames[name]):
        for Id, seq in fasta_parser.parse_fasta(filename):
            counts = count_kmers.count_kmers(seq.upper(), kmin, kmax)
            dic_list.append(counts)
            IDS.append(Id)
    df = pd.DataFrame(dic_list).T.reset_index()
    df['mean'] = df.mean(axis=1).astype(int)
    df = df[['index', 'mean']]
    return df.rename(columns={'index': 'kmer', 'mean': 'obs'})

In [None]:
filenames = {'influenzae': ['Data/Test/Chromosomes/H.influenzae.fa.gz']}
kc = get_kmer_counts_from_genomes(filenames, 'influenzae', 4, 6)

In [None]:
km = kc['kmer'].to_list()
cnts = kc['obs'].to_list()
cnt_kmers = dict(zip(km, cnts))

In [None]:
cnt_kmers['TTTTT']

In [None]:
from markov_models import expected_kmer_by_zom, get_expected_higher_markov, get_variance, \
get_standard_deviation, get_z_scores, get_p_values, get_e_values

In [13]:
def get_all_possible_kmers(alphabet, kmin, kmax):
    """Returns a list of all possible combinations of k-mers of
    length k from a input alphabet.

    Inputs:

        alphabet - a alphabet (strings characters) that compound the string sequence
        kmin - minimum DNA kmer length (int)
        kmax - maximum DNA kmer length (int)

    Outputs:

        kmers - list of all possible combinations of k-mers of length k with length
                between kmin and kmax.

    """
    kmers = [''.join(letters) for n in range(kmin, kmax + 1)
             for letters in product(alphabet, repeat=n)]
    return kmers

In [None]:
kmer_list_6 = get_all_possible_kmers('ACGT', 6, 6)

In [None]:
def base_stats(sequence, alphabet, as_count=False, as_dict=False):
    """Calculates de frequency or the number of bases in a sequence.
    
    Inputs:
    
        sequence - string representing the sequence
        alphabet - a alphabet (strings characters) that compound the string sequence
        as_count - boolean set as False
        as_dict - boolean set as False
    
    Output:
    
        counts - as default returns a numpy array as frequencies (floats) or
                 as a dictionary-like object
    
    Examples:
    
    > baseFreqs(seq, 'ACGT', asCounts = False, asDict = False)
    array([0.25, 0.25, 0.25, 0.25])

    as_count - True, returns a numpy array of counts (integer)
    > baseFreqs('ACGTACGT', 'ACGT', asCounts = True, asDict = False)
    array([2, 2, 2, 2])

    as_dict - True and as_count as default (False) returns a dictionary as bases frequencies (float)
    > baseFreqs('ACGTACGT', 'ACGT', asCounts = False, asDict = True)
    {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

    as_count True and as_dict True, returns a dictionary as base counts (integer)
    > baseFreqs('ACGTACGT', 'ACGT', asCounts = True, asDict = True)
    {'A': 2, 'C': 2, 'G': 2, 'T': 2}
    """
    # make the sequence upper case
    seq = sequence.upper()
    # count all bases in sequence and collect as an array
    counts = np.array([seq.count(i) for i in alphabet])
    # if is onle the counts
    if as_count:
        freqs = counts
    # other wise as frequencies
    else:
        freqs = counts / sum(counts * 1.0)
    # or as a dictionary like object
    if as_dict:
        return dict(zip(alphabet, freqs))
    else:
        return freqs

In [None]:
def get_base_frequencies(filenames, name, alphabet):
    seqs = ''
    for filename in filenames[name]:
        for n, seq in fasta_parser.parse_fasta(filename):
            seqs += seq
    freqs = base_stats(seq, alphabet, as_dict=True)
    return freqs

In [None]:
bases = get_base_frequencies(filenames, 'influenzae', 'ACGT')

In [None]:
bases

In [None]:
sum(bases.values())

In [None]:
hinf_len = 1830140

In [None]:
def get_length_from_csv(filename):
    length = defaultdict(int)
    with open(filename, 'r') as fh:
        csv_reader = csv.reader(fh)
        for row in csv_reader:
            genus, gen_len = row[0], row[1]
            length[genus] = length.get(genus, 0) + int(gen_len)
    return length

In [None]:
len_seq = get_length_from_csv('Results/Length/All_Chromosomes_length.csv')

In [None]:
def z_scores(kmer_exp, kmer_counts, std):
    """
    Calculates the z scores to under/over represented kmers from a sequence.
    The score is calculaated as:
    
    Z(W) = (C(W) – E(C(W))) / sigma(W), where 
    C(w) - observed values
    E(C(w)) - represents the expected value from a kmer
    sigma - represents the standard deviation
    
    Inputs:
        kmer_exp - dictionary-like object mapping kmer of length k to their 
                   calculated expected values.
        kmer_counts - dictionary-like object mapping kmer to their counts. The
                      kmer lengths must be between kmin (kmx-2) and kmax.
        std - a dictionary-like object mapping kmer to their calculated
                   expectd std.
    
    Outputs:
        z_scores - dictionary-like object mapping kmer to their z_scores.
    """
    # initialize the container
    z_scores = defaultdict(float)
    # iterates through the kmer keys
    for kmer in kmer_exp:
        # gets the kmer std value
        sd = std[kmer]
        # deals with zero error division
        if sd == 0.0:
            z_scores[kmer] = z_scores.get(kmer, 0.0)
        else:
            # calculates the z score and add 
            # the kmer and the z score values to the container
            z = (kmer_counts[kmer] - kmer_exp[kmer]) / sd
            z_scores[kmer] = z
    return z_scores   

In [None]:
def get_scipy_p_values(z_scores_kmers):
    """
    Calculates the p value for all kmers.
    The calculation is done as:
    over represented: P(z > t) = erfc(t/sqrt(2))/2
    under represented: P(z > t) = erfc(-t/sqrt(2))/2
    t: thresholder
    
    Inputs:
        z_scores_kmers - dictionary-like object mapping kmer to their z_scores.
        
    Outputs:
        p_vals - dictionary-like object mapping kmer to their p values.
    """
    # initialize the container
    p_vals = defaultdict(float)
    # iterates through the kmer keys
    for kmer in z_scores_kmers:
        # calculates the p values to under represented
        # kmers (negative z scores)
        # add the kmer and p values to the container
        if z_scores_kmers[kmer] < 0.0:
            p_vals[kmer] = p_vals.get(kmer, 0.0) + norm.sf(abs(-z_scores_kmers[kmer]))
        else:
            # add the kmer and p values to the container to over represented
            # and all other kmers
            p_vals[kmer] = p_vals.get(kmer, 0.0) + norm.sf(abs(z_scores_kmers[kmer]))
    return p_vals

In [None]:
expected_kmer_by_zom('AAAAA', bases, len_seq['Acidiphilium'])

In [None]:
exp = get_expected_higher_markov(kmer_list_6, cnt_kmers)

In [None]:
zscr = get_z_scores(kmer_list_6, cnt_kmers, exp, hinf_len)

In [None]:
pval = get_p_values(zscr)

In [None]:
e_val = get_e_values(kmer_list_6, pval)

In [None]:
def get_kmer_stats(kmer_list, kmer_count, expected, z_scores, e_vals, p_vals):
    data = []
    for kmer in kmer_list:
        data.append((km, cnt_kmers[km], exp[km], zscr[km], e_val[km], pval[km]))
    df = pd.DataFrame(data, 
                      columns=['kmer', 
                               'observed',
                               'expected',
                               'z_score',
                               'e_value',
                               'p_value']).sort_values(by='z_score').reset_index(drop=True)
    return df

In [None]:
data = []
for km in kmer_list_6:
    data.append((km, cnt_kmers[km], exp[km], zscr[km], e_val[km], pval[km]))

In [None]:
hinf = pd.DataFrame(data, columns=['kmer', 
                            'observed',
                            'expected',
                            'z_score',
                            'e_value',
                            'p_value']).sort_values(by='z_score').reset_index(drop=True)

In [None]:
hinf.loc[hinf['kmer'] == 'TTATAA']

In [None]:
dir_in = 'Results'
sub_dir = 'kmer_counts'

In [None]:
    length = defaultdict(int)
    with open(filename, 'r') as fh:
        csv_reader = csv.reader(fh)
        for row in csv_reader:
            genus, gen_len = row[0], row[1]
            length[genus] = length.get(genus, 0) + int(gen_len)
    return length

In [None]:
filenames = glob.glob(f'{dir_in}/{sub_dir}/*/*_count_chr.csv')

In [None]:
filenames[0]

In [None]:
def get_counts_csv(filename):
    kmers = defaultdict(int)
    with open(filename) as fh:
        cvs_data = csv.reader(fh)
        for row in cvs_data:
            kmer, count = row[0], int(row[1])
            kmers[kmer] = kmers.get(kmer, 0) + count
    return kmers

In [None]:
data = get_counts_csv(filenames[0])

In [None]:
def write_csv(dir_out, sub_dir, name , data, kmin, kmax):
    csv_name = f'{name}_k{kmax}_chr.csv'
    full_path = os.path.join(dir_out, name, sub_dir)
    print(full_path)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    csv_data = open(f'{full_path}/{csv_name}', "w")
    writer = csv.writer(csv_data)
    for key, value in data.items():
        writer.writerow([key, value])
    csv_data.close()

In [None]:
 write_csv('Data', 'Test', 'Acidiphilium' , data, 6, 6)

### Checking palindromes

In [15]:

import glob
from itertools import product
import pandas as pd

In [19]:
def get_strand_complement(sequence):
    """Returns the complement strand of the genome.
     
     Inputs:
        sequence - string representing the sequence   

    Outputs:
    
        sequence - string representing the complement of 
                   the string.    
    """
    # make the sequence upper case
    seq = sequence.upper()
    # table to change the complement characters
    change = str.maketrans('ACGT', 'TGCA')
    return seq.translate(change)


def get_reverse_complement(sequence):
    """
    Returns the reverse complement strand of the genome.

    Inputs:

        sequence - string representing the sequence.

    Outputs:

        reversed_complement_sequence - string representing the reversed
                                       sequence complement.
    """
    return get_strand_complement(sequence)[::-1]

In [18]:
def is_palindrome(string):
    """
     Function to check if a strings is palindromic or not.

     Inputs:

         string - a string of characters (a word, kmer, n-gram...)

     Outputs:

        boolean value - True if the string is palindromic other wise False.

     """
    k, mid = len(string), len(string) // 2
    # checking even palindromes
    if k % 2 == 0:
        return string[:mid] == get_reverse_complement(string[mid:])
    # checking odd palindromes
    else:
        return string[:mid] == get_reverse_complement(string[mid + 1:])

In [None]:
# len([kmer for kmer in kmer_list_6 if is_palindrome(kmer)])

In [3]:
def get_palindromes(kmer_list, dataframe, pal=True):
    palindromes = [kmer for kmer in kmer_list if is_palindrome(kmer)]
    # selecting palindromes
    if pal:
        df = dataframe[dataframe['kmer'].isin(palindromes)]
        return df.reset_index(drop=True)
    else:
        df = dataframe[ ~ dataframe['kmer'].isin(palindromes)]
        return df.reset_index(drop=True)

In [49]:
dir_in = 'Results'
sub_dir = 'kmer_model'
k = 4
type_seq = 'chr'

glob.glob(f'{dir_in}/{sub_dir}/*/*_k{k}_{type_seq}.model.csv')

['Results/kmer_model/Methyloligella/Methyloligella_k4_chr.model.csv',
 'Results/kmer_model/Xenorhabdus/Xenorhabdus_k4_chr.model.csv',
 'Results/kmer_model/Undibacterium/Undibacterium_k4_chr.model.csv',
 'Results/kmer_model/Tenuifilum/Tenuifilum_k4_chr.model.csv',
 'Results/kmer_model/Dyadobacter/Dyadobacter_k4_chr.model.csv',
 'Results/kmer_model/Alloactinosynnema/Alloactinosynnema_k4_chr.model.csv',
 'Results/kmer_model/Parolsenella/Parolsenella_k4_chr.model.csv',
 'Results/kmer_model/Rhodobacter/Rhodobacter_k4_chr.model.csv',
 'Results/kmer_model/Limnobaculum/Limnobaculum_k4_chr.model.csv',
 'Results/kmer_model/Caldithrix/Caldithrix_k4_chr.model.csv',
 'Results/kmer_model/Thermanaerovibrio/Thermanaerovibrio_k4_chr.model.csv',
 'Results/kmer_model/Chromohalobacter/Chromohalobacter_k4_chr.model.csv',
 'Results/kmer_model/Croceibacter/Croceibacter_k4_chr.model.csv',
 'Results/kmer_model/Paenarthrobacter/Paenarthrobacter_k4_chr.model.csv',
 'Results/kmer_model/Herminiimonas/Herminiimonas

In [16]:
kmer_list_6 = get_all_possible_kmers('ACGT', 6, 6)

In [9]:
filename = 'Results/kmer_model/Acidiphilium/Acidiphilium_k6_chr.model.csv'

In [10]:
df = pd.read_csv(filename, header=0)

In [11]:
df

Unnamed: 0,kmer,observed,expected,z_score,e_value,p_value
0,CTCGAG,1100.0,2815,-32.336739,4.364793e-226,1.065623e-229
1,CCGCGG,4425.0,6377,-24.465818,6.995340e-129,1.707847e-132
2,GTCGAC,1583.0,2683,-21.244456,7.597629e-97,1.854890e-100
3,GGATCC,677.0,1404,-19.406011,1.406039e-80,3.432711e-84
4,CCCGGG,987.0,1797,-19.112624,4.060056e-78,9.912245e-82
...,...,...,...,...,...,...
4091,AGATAG,554.0,303,14.420185,7.911505e-44,1.931520e-47
4092,GCGCGG,9640.0,8308,14.630586,3.670788e-45,8.961884e-49
4093,ATCGAG,3040.0,2306,15.289973,1.826549e-49,4.459348e-53
4094,CCGCGC,10148.0,8712,15.403728,3.164143e-50,7.724959e-54


In [20]:
get_palindromes(kmer_list_6, df, pal=True)

Unnamed: 0,kmer,observed,expected,z_score,e_value,p_value
0,CTCGAG,1100.0,2815,-32.336739,4.364793e-226,1.065623e-229
1,CCGCGG,4425.0,6377,-24.465818,6.995340e-129,1.707847e-132
2,GTCGAC,1583.0,2683,-21.244456,7.597629e-97,1.854890e-100
3,GGATCC,677.0,1404,-19.406011,1.406039e-80,3.432711e-84
4,CCCGGG,987.0,1797,-19.112624,4.060056e-78,9.912245e-82
...,...,...,...,...,...,...
59,GTTAAC,72.0,59,1.692470,1.854596e+02,4.527821e-02
60,GAATTC,1187.0,1085,3.097072,4.002659e+00,9.772117e-04
61,TGCGCA,1600.0,1476,3.228259,2.550709e+00,6.227317e-04
62,TTGCAA,213.0,166,3.647992,5.412814e-01,1.321488e-04


In [33]:
pal = get_palindromes(kmer_list_6, df, pal=True)

In [34]:
pal

Unnamed: 0,kmer,observed,expected,z_score,e_value,p_value
0,CTCGAG,1100.0,2815,-32.336739,4.364793e-226,1.065623e-229
1,CCGCGG,4425.0,6377,-24.465818,6.995340e-129,1.707847e-132
2,GTCGAC,1583.0,2683,-21.244456,7.597629e-97,1.854890e-100
3,GGATCC,677.0,1404,-19.406011,1.406039e-80,3.432711e-84
4,CCCGGG,987.0,1797,-19.112624,4.060056e-78,9.912245e-82
...,...,...,...,...,...,...
59,GTTAAC,72.0,59,1.692470,1.854596e+02,4.527821e-02
60,GAATTC,1187.0,1085,3.097072,4.002659e+00,9.772117e-04
61,TGCGCA,1600.0,1476,3.228259,2.550709e+00,6.227317e-04
62,TTGCAA,213.0,166,3.647992,5.412814e-01,1.321488e-04


In [None]:
# def get_non_palindromic_kmers(kmer_list, dataframe):
#     palindromes = [kmer for kmer in kmer_list if is_palindrome(kmer)]
#     # selecting palindromes
#     return dataframe[ ~ dataframe['kmer'].isin(palindromes)]    

In [22]:
df

Unnamed: 0,kmer,observed,expected,z_score,e_value,p_value
0,CTCGAG,1100.0,2815,-32.336739,4.364793e-226,1.065623e-229
1,CCGCGG,4425.0,6377,-24.465818,6.995340e-129,1.707847e-132
2,GTCGAC,1583.0,2683,-21.244456,7.597629e-97,1.854890e-100
3,GGATCC,677.0,1404,-19.406011,1.406039e-80,3.432711e-84
4,CCCGGG,987.0,1797,-19.112624,4.060056e-78,9.912245e-82
...,...,...,...,...,...,...
4091,AGATAG,554.0,303,14.420185,7.911505e-44,1.931520e-47
4092,GCGCGG,9640.0,8308,14.630586,3.670788e-45,8.961884e-49
4093,ATCGAG,3040.0,2306,15.289973,1.826549e-49,4.459348e-53
4094,CCGCGC,10148.0,8712,15.403728,3.164143e-50,7.724959e-54


In [21]:
get_palindromes(kmer_list_6, df, False)

Unnamed: 0,kmer,observed,expected,z_score,e_value,p_value
0,CCGCTG,2434.0,3273,-14.671960,1.996561e-45,4.874416e-49
1,CAGCGG,2416.0,3218,-14.144160,4.155897e-42,1.014623e-45
2,GCGCCC,4046.0,5010,-13.628971,5.516305e-39,1.346754e-42
3,CTGTTG,323.0,639,-12.501886,1.493014e-32,3.645054e-36
4,CTGCGG,2230.0,2876,-12.050722,3.937437e-30,9.612883e-34
...,...,...,...,...,...,...
4027,AGATAG,554.0,303,14.420185,7.911505e-44,1.931520e-47
4028,GCGCGG,9640.0,8308,14.630586,3.670788e-45,8.961884e-49
4029,ATCGAG,3040.0,2306,15.289973,1.826549e-49,4.459348e-53
4030,CCGCGC,10148.0,8712,15.403728,3.164143e-50,7.724959e-54


In [23]:
4032 + 64

4096

In [None]:
# glob.glob(f'Results/kmer_splitted/*/*_4_{type_seq}.pal.csv')

In [None]:
# cut_enz['cut_len'] = cut_enz['kmer'].str.len()

In [None]:
# cut_enz = pd.DataFrame(cut_d.items(), columns=['kmer', 'enz_name'])

In [None]:
# cut_enz_grouped = cut_enz.groupby('cut_len')

In [None]:
# cut_enz_len6 = cut_enz_grouped.get_group(6)

In [None]:
df2 = pd.read_csv('enz_cut_sites.csv', header=None, names=['enz_name', 'kmer'])
df2['cut_len'] = df2['kmer'].str.len()
df2_grouped = df2.groupby('cut_len')
cut_enz_pal = df2_grouped.get_group(6)

In [None]:
# cut_enz_pal = df_grouped.get_group(8)
cut_enz_pal

In [29]:
def get_palindromes_cut_sites(filename, k):
    df = pd.read_csv(filename, header=None, names=['enz_name', 'kmer'])
    df['cut_len'] = df['kmer'].str.len()
    df_grouped = df.groupby('cut_len')
    cut_enz_pal = df_grouped.get_group(k)
    return cut_enz_pal.reset_index(drop=True).drop(columns=['cut_len'])

In [30]:
cut_sites = get_palindromes_cut_sites('Data/enz_cut_sites.csv', 6)

In [31]:
cut_sites

Unnamed: 0,enz_name,kmer
0,ApoI,AAATTC
1,ApoI,AAATTT
2,Sma325I,AACCCT
3,Rho5650I,AACGAG
4,AclI,AACGTT
...,...,...
669,AsuII,TTCGAA
670,Cco11366VI,TTCTTC
671,Fna13121I,TTGACC
672,Fna13121I,TTGATC


In [28]:
list(cut_sites['enz_name'].unique())

['ApoI',
 'Sma325I',
 'Rho5650I',
 'AclI',
 'Dpi3090II',
 'PaePA99III',
 'DrdVIII',
 'HindIII',
 'CstMI',
 'SspI',
 'Lde4408II',
 'RlaII',
 'SenSARA26III',
 'AflIII',
 'SurP32aII',
 'NspI',
 'BspLU11I',
 'TpyTP2I',
 'Nal45188II',
 'AspAMDIV',
 'EcoE1140I',
 'RdeGBII',
 'Nbr128II',
 'BetI',
 'Cfr10I',
 'AgeI',
 'BspMI',
 'MspI7II',
 'Lpn11417II',
 'Rsp008IV',
 'MluI',
 'SpeI',
 'BfiI',
 'Fba202Z8II',
 'XhoII',
 'BglII',
 'Lsp48III',
 'AchA6III',
 'Mlu211III',
 'HaeII',
 'Eco47III',
 'Bco11035III',
 'Lpl1004II',
 'Yru12986I',
 'Cba13II',
 'HaeI',
 'StuI',
 'Mba11I',
 'TatI',
 'ScaI',
 'Pin17FIII',
 'BanLI',
 'Cfa8380I',
 'ApyPI',
 'Kor51II',
 'ClaI',
 'BspHII',
 'Sen6480IV',
 'EcoNIH6II',
 'Sep11964I',
 'Gba708II',
 'AvaIII',
 'Bsp3004IV',
 'Rsp008V',
 'Cau10061II',
 'VspI',
 'ScoDS2II',
 'LsaDS4I',
 'Sag901I',
 'Tth111II',
 'Sma10259II',
 'BsbI',
 'MfeI',
 'WviI',
 'Rsp531II',
 'Pdu1735I',
 'TaqIII',
 'AbaPBA3II',
 'BsiI',
 'BsaAI',
 'BtrI',
 'PmaCI',
 'BtsI',
 'Esp3007I',
 'SdeAI',
 'E

In [35]:
pal_cut_sites = pal.merge(cut_sites, 
                            on='kmer', 
                            how = "right").dropna().reset_index(drop=True)

In [36]:
pal_cut_sites

Unnamed: 0,kmer,observed,expected,z_score,e_value,p_value,enz_name
0,AAATTT,223.0,293.0,-4.089612,8.849332e-02,2.160481e-05,ApoI
1,AACGTT,161.0,255.0,-5.886719,8.067862e-06,1.969693e-09,AclI
2,AAGCTT,204.0,530.0,-14.161587,3.243576e-42,7.918886e-46,HindIII
3,AATATT,232.0,228.0,0.264915,1.620122e+03,3.955375e-01,SspI
4,ACATGT,300.0,405.0,-5.217788,3.708366e-04,9.053629e-08,AflIII
...,...,...,...,...,...,...,...
112,TGTACA,50.0,54.0,-0.544335,1.200560e+03,2.931054e-01,Cin11811I
113,TGTACA,50.0,54.0,-0.544335,1.200560e+03,2.931054e-01,TatI
114,TTATAA,40.0,38.0,0.324445,1.526992e+03,3.728007e-01,PsiI
115,TTCGAA,613.0,748.0,-4.936606,1.628033e-03,3.974689e-07,AsuII


In [None]:
# pal_cut_sites.groupby('enz_name').groups

In [None]:
# pal_cut_sites.groupby('enz_name').agg(lambda x:x.value_counts())

In [None]:
# source.groupby(['Country','City'])['Short name'].agg(pd.Series.mode)

In [None]:
# enz_g = pal_cut_sites.groupby('enz_name')

In [None]:
# enz_g.count()

In [37]:
pal_cut_sites.groupby('enz_name')['kmer'].agg(pd.Series.mode)

enz_name
AatII               GACGTC
AccI      [GTATAC, GTCGAC]
AclI                AACGTT
AcyI      [GACGTC, GGCGCC]
AflII               CTTAAG
                ...       
VspI                ATTAAT
XbaI                TCTAGA
XhoI                CTCGAG
XhoII     [AGATCT, GGATCC]
XmaIII              CGGCCG
Name: kmer, Length: 86, dtype: object

In [38]:
pal_cut_sites.groupby('enz_name')[['kmer', 
                                  'observed',
                                  'expected',
                                  'z_score',
                                  'e_value',
                                  'p_value']].agg(pd.Series.mode)

Unnamed: 0_level_0,kmer,observed,expected,z_score,e_value,p_value
enz_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AatII,GACGTC,964,1103,-4.18595,0.058158,1.41987e-05
AccI,"[GTATAC, GTCGAC]","[58.0, 1583.0]","[84.0, 2683.0]","[-21.244455554043245, -2.836865954329168]","[7.597629312336298e-97, 9.330428881994084]","[1.854889968832104e-100, 0.002277936738768086]"
AclI,AACGTT,161,255,-5.88672,8.06786e-06,1.96969e-09
AcyI,"[GACGTC, GGCGCC]","[964.0, 7179.0]","[1103.0, 8095.0]","[-10.192478485189335, -4.185951119044572]","[4.3874630515681575e-21, 0.058157969599999436]","[1.0711579715742574e-24, 1.4198723046874862e-05]"
AflII,CTTAAG,42,37,0.821999,841.886,0.205539
...,...,...,...,...,...,...
VspI,ATTAAT,58,58,0,2048,0.5
XbaI,TCTAGA,35,38,-0.486667,1283.06,0.313247
XhoI,CTCGAG,1100,2815,-32.3367,4.36479e-226,1.06562e-229
XhoII,"[AGATCT, GGATCC]","[511.0, 677.0]","[813.0, 1404.0]","[-19.40601133608633, -10.592808805948033]","[1.4060385630742912e-80, 6.590231721962517e-23]","[3.4327113356305946e-84, 1.608943291494755e-26]"


In [47]:
pal_cut_sites[pal_cut_sites.enz_name == 'AccI']

Unnamed: 0,kmer,observed,expected,z_score,e_value,p_value,enz_name
85,GTATAC,58.0,84.0,-2.836866,9.330429,0.002277937,AccI
87,GTCGAC,1583.0,2683.0,-21.244456,7.597629e-97,1.8548899999999999e-100,AccI


In [None]:
pal_cut_sites.groupby('enz_name')['kmer'].agg(pd.Series.mode).to_frame()

In [None]:
# selecting rows based in a column value
pal_cut_sites.loc[pal_cut_sites['enz_name'] == 'ApoI']

In [None]:
# selecting rows based in a list or iterable
pal_cut_sites.loc[pal_cut_sites['enz_name'].isin(list(cut_sites['enz_name'].unique()))]

In [None]:
pal_cut_sites.info()

In [None]:
pal_cut_sites.shape[0], pal_cut_sites.size

In [None]:
len(pal_cut_sites['kmer'].unique())

In [None]:
# to get all the rows in dataframe(df1) which are not available in dataframe(df2)
# pal.merge(cut_enz_len6, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']

In [None]:
# selecting palindromes that are not restriction cut sites
pal_not_cut_site = pal.merge(
    pal_cut_sites, 
    how = 'outer', 
    indicator = True).loc[lambda x : x['_merge'] == 'left_only'].reset_index(drop=True)

In [None]:
pal_not_cut_site.drop(columns = ['enz_name', '_merge'], axis=1, inplace=True)

In [None]:
pal_not_cut_site

In [None]:
pal_not_cut_site.shape

In [None]:
list(pal_not_cut_site['kmer'].values)

In [None]:
set_pal = set(list(pal['kmer'].values))
set_cut = set(list(cut_enz_len6['kmer'].values))

In [None]:
len(set_pal.intersection(set_cut)), len(set_pal)

In [None]:
both = set_pal.intersection(set_cut)

In [None]:
not_pal = list(set_pal.difference(both))

In [None]:
len(not_pal)

In [None]:
# selecting non-cute sites palindromes
pal[pal['kmer'].isin(list(pal_not_cut_site['kmer'].values))]

In [None]:
# rows common for both datframes
pal_cut_enz = pal.merge(cut_enz_len6, how = 'inner', indicator=False)

In [None]:
pal_cut_enz.shape