In [1]:
import os
import re
import operator
import gzip
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from fasta_parser import parse_fasta, fasta_reader
from system_utils import get_files2, data_generator, get_all_fasta
from sequence_utils import get_all_possible_kmers
from markov_models import get_expected_higher_markov, get_variance, get_standard_deviation
from markov_models import z_scores, get_p_values, get_e_values, gets_selected_kmers
from alphabet import iupac_dna

In [2]:
# Collect the chromosome file paths under Data/Genomes_splitted
# (presumably grouped by genus -- TODO confirm against system_utils.get_files2).
# NOTE(review): the recorded run of this cell was interrupted and
# list_file_paths is not used by any later visible cell.
list_file_paths = get_files2('Data/Genomes_splitted', 'Chromosomes')

KeyboardInterrupt: 

In [None]:
# Gather the gzipped chromosome FASTA files under Data/Genomes_splitted
# (presumably a genus -> file paths mapping -- TODO confirm against
# system_utils.get_all_fasta).
fasta_paths = get_all_fasta('Data/Genomes_splitted', 'Chromosomes', 'gz')

In [None]:
def get_mean_sequence_length(fasta_dict):
    """
    Calculates the mean sequence length per genus from all fasta files.

    Inputs:
        fasta_dict - a dictionary-like object mapping the bacterial genus to
                     the complete pathway to the fasta files.

    Outputs:
        mean_lengths - a dictionary mapping the bacterial genus to the integer
                       (floor) mean of the lengths of all its sequences.
                       A genus whose files yield no sequence is mapped to 0
                       instead of raising ZeroDivisionError.
    """
    # running [total_length, number_of_sequences] for each genus
    totals = defaultdict(lambda: [0, 0])
    # data_generator yields (genus name, fasta file path) pairs
    for name, filename in data_generator(fasta_dict):
        # looking the genus up registers it even when the file has no records
        running = totals[name]
        # iterate through the fasta records; only the sequence is needed
        for _, seq in parse_fasta(filename):
            running[0] += len(seq)
            running[1] += 1
    # floor mean per genus, guarding against genera without any sequence
    return {name: (total // count if count else 0)
            for name, (total, count) in totals.items()}

In [None]:
# Disabled cell: recomputing the mean lengths goes through every FASTA file.
# A later cell reloads seq_len_dict from Results/Lengths/chr_lengths.csv instead.
# seq_len_dict = get_mean_sequence_length(fasta_paths)

In [None]:
def save_chr_lengths(seq_len_dict):
    """
    Saves the genus -> chromosome length mapping as a header-less csv file.

    Inputs:
        seq_len_dict - a dictionary-like object mapping the genus name to its
                       (mean) chromosome length.

    Outputs:
        None. Writes 'Results/Lengths/chr_lengths.csv' (relative to the current
        working directory) with one 'name,length' row per genus and no header.
        An empty mapping produces an empty file.
    """
    path = os.path.join('Results', 'Lengths')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(path, exist_ok=True)
    # the whole mapping is written once; no need to loop over the names
    df = pd.DataFrame(list(seq_len_dict.items()), columns=['name', 'length'])
    df.to_csv(os.path.join(path, 'chr_lengths.csv'), header=False, index=False)

In [None]:
# NOTE(review): at this point seq_len_dict is only assigned by the
# commented-out cell above (or by the later get_len_csv cell), so this call
# raises NameError on a fresh top-to-bottom run.
save_chr_lengths(seq_len_dict)

In [2]:
def get_len_csv(path):
    """
    Reads a header-less 'name,length' csv file into a dictionary.

    Inputs:
        path - a string representing the complete pathway to the csv file.

    Outputs:
        dict_len - a defaultdict(int) mapping each name to the sum of all the
                   lengths listed for it in the file.

    Raises:
        ValueError - if a non-empty row does not have exactly two fields or
                     the length is not an integer.
    """
    # local import so the function does not depend on the notebook import cell
    import csv

    dict_len = defaultdict(int)
    # newline='' is what the csv module expects for files it parses itself
    with open(path, 'r', newline='') as fh:
        # csv.reader copes with quoted names that contain commas
        for row in csv.reader(fh):
            # blank lines come back as empty rows; skip them
            if not row:
                continue
            name, length = row
            dict_len[name] += int(length)
    return dict_len

In [3]:
def get_paths_to_csv_counts(dir_name, sub_dir, sub_sub_dir, names_list):
    """
    Function to get the full paths to kmer count csv files.

    Inputs:
        dir_name - string representing the root directory.
        sub_dir - string representing a sub directory inside each genus/species
                  directory.
        sub_sub_dir - string representing a sub directory inside sub_dir.
        names_list - list-like object with all genus/species names.

    Outputs:
        csv_dict - a dictionary like object mapping the genus/species names to
                   the full path to the csv file.

    Raises:
        FileNotFoundError - if the directory of a name does not exist or holds
                            no candidate file.
        ValueError - if the directory holds more than one csv file, so the
                     file to use is ambiguous.

    ex: get_paths_to_csv_counts('Results/Kmers_from_splitted', 'Chromosomes', 'kmers', ['Acidiphilium'])
        {'Acidiphilium':
        'Results/Kmers_from_splitted/Acidiphilium/Chromosomes/kmers/Acidiphilium_Chromosomes10.csv.gz'}
    """
    # initialize the container
    csv_dict = {}
    # iterates through names
    for name in names_list:
        # a repeated name resolves to the same path; keep the first entry
        if name in csv_dict:
            continue
        # dir_name/name/sub_dir/sub_sub_dir
        folder = os.path.join(dir_name, name, sub_dir, sub_sub_dir)
        # sorted so the result does not depend on the file system order
        entries = sorted(os.listdir(folder))
        # stray entries (hidden files, checkpoints) must not be glued onto the
        # file name, so with several entries only the csv files are kept
        if len(entries) > 1:
            entries = [entry for entry in entries
                       if entry.endswith(('.csv', '.csv.gz'))]
        if not entries:
            raise FileNotFoundError(f'no kmer count csv file found in {folder!r}')
        if len(entries) > 1:
            raise ValueError(f'expected one csv file in {folder!r}, found {entries}')
        # add the name and full path to the container
        csv_dict[name] = os.path.join(folder, entries[0])
    return csv_dict

In [4]:
# Reload the name -> chromosome length mapping from the csv file that
# save_chr_lengths writes, instead of re-parsing the FASTA files.
seq_len_dict = get_len_csv('Results/Lengths/chr_lengths.csv')

In [5]:
 # Sanity check: resolve the k-mer count csv path for a single genus.
 get_paths_to_csv_counts('Results/Kmers_from_splitted', 'Chromosomes', 'kmers', ['Acidiphilium'])

{'Acidiphilium': 'Results/Kmers_from_splitted/Acidiphilium/Chromosomes/kmers/Acidiphilium_Chromosomes10.csv.gz'}

In [6]:
 # Names that have a stored chromosome length; this is a live dict_keys view
 # of seq_len_dict, not a copy.
 names = seq_len_dict.keys()

In [7]:
# Map every name in seq_len_dict to the path of its k-mer count csv file.
csv_files = get_paths_to_csv_counts('Results/Kmers_from_splitted', 'Chromosomes', 'kmers', names)

In [8]:
# Spot-check one resolved path.
csv_files['Acidiphilium']

'Results/Kmers_from_splitted/Acidiphilium/Chromosomes/kmers/Acidiphilium_Chromosomes10.csv.gz'

In [9]:
def get_kmer_count_from_csv(filename):
    """
    Loads the kmer counts stored in a (optionally gzip compressed) csv file.

    Inputs:
        filename - a string with the complete pathway to the csv file. A file
                   whose extension is '.gz' is read through gzip in text mode.

    Outputs:
        kmer_counts - a dictionary mapping each kmer (first column) to its
                      count as an integer (second column). The first line of
                      the file is treated as a header and ignored.
    """
    # only the extension decides how the file is opened
    _, suffix = os.path.splitext(filename)
    handle = gzip.open(filename, 'rt') if suffix == '.gz' else open(filename, 'r')
    with handle as rows:
        # drop the header line
        rows.readline()
        # every remaining line must be exactly 'kmer,count'
        pairs = (row.strip('\n').split(",") for row in rows)
        return {kmer: int(count) for kmer, count in pairs}

In [14]:
# Spot-check the resolved path of a second genus.
csv_files['Haemophilus']

'Results/Kmers_from_splitted/Haemophilus/Chromosomes/kmers/Haemophilus_Chromosomes10.csv.gz'

In [11]:
# Candidate k-mers to test; presumably every DNA word with minimum and maximum
# length 6 -- TODO confirm the argument order of get_all_possible_kmers.
kmer_list = get_all_possible_kmers(iupac_dna, 6, 6)

In [30]:
def get_kmer_stats(seq_len_dict, 
                   csv_files, 
                   kmer_list,
                   kmax, 
                   eval_cutoff):
    """
    Runs the kmer statistics pipeline for every name in seq_len_dict.

    Inputs:
        seq_len_dict - a dictionary-like object mapping the genus/species names
                       to the sequence length used in the variance step.
        csv_files - a dictionary-like object mapping the same names to the full
                    path of their kmer count csv file.
        kmer_list - the kmers to evaluate.
        kmax - currently unused; kept so existing calls keep working.
        eval_cutoff - cutoff handed to gets_selected_kmers (presumably an
                      e-value threshold -- TODO confirm in markov_models).

    Outputs:
        (selected_by_name, to_check_by_name) - two dictionaries keyed by name
        holding, respectively, the first and the second object returned by
        gets_selected_kmers for that name. Both are empty when seq_len_dict
        is empty.

    Raises:
        KeyError - if a name of seq_len_dict is missing from csv_files.
    """
    # results were previously computed and thrown away; collect them per name
    selected_by_name = {}
    to_check_by_name = {}
    for name in sorted(seq_len_dict):
        kmer_counts = get_kmer_count_from_csv(csv_files[name])
        seq_len = int(seq_len_dict[name])
        expected_mers = get_expected_higher_markov(kmer_list, kmer_counts)
        variance = get_variance(kmer_list, seq_len, expected_mers)
        std = get_standard_deviation(variance)
        z_scr = z_scores(expected_mers, kmer_counts, std)
        p_val = get_p_values(z_scr)
        e_val = get_e_values(kmer_list, p_val)
        df_selected, df_to_check = gets_selected_kmers(kmer_list,
                                                      kmer_counts,
                                                      expected_mers,
                                                      z_scr,
                                                      p_val,
                                                      e_val,
                                                      eval_cutoff)
        selected_by_name[name] = df_selected
        to_check_by_name[name] = df_to_check
    return selected_by_name, to_check_by_name

In [31]:
# Run the k-mer statistics for every name in seq_len_dict with cutoff 0.01.
# NOTE(review): kmax is passed as 4 while kmer_list was built with 6, and
# get_kmer_stats does not use kmax; the recorded run was interrupted.
df_selected, df_to_check = get_kmer_stats(seq_len_dict, csv_files, kmer_list, 4, 0.01)

KeyboardInterrupt: 

In [29]:
# NOTE(review): this cell's execution count (29) is lower than that of the
# get_kmer_stats definition cell (30), so the table below comes from an
# earlier kernel state and may not be reproducible with Run All.
df_selected

Unnamed: 0,kmer,count,expected,z_score,e_value,p_value
0,CTCGAG,1085,2788,-32.265466,1.067537e-228,4.372632e-225
1,CCGCGG,4372,6312,-24.440078,3.208191e-132,1.314075e-128
2,GTCGAC,1561,2651,-21.177904,7.633817e-100,3.126811e-96
3,GGATCC,665,1380,-19.250875,6.939963e-83,2.842609e-79
4,CCCGGG,969,1770,-19.043799,3.698146e-81,1.514761e-77
...,...,...,...,...,...,...
615,AGATAG,543,295,14.439715,1.455216e-47,5.960565e-44
616,GCGCGG,9543,8212,14.704618,3.010472e-49,1.233089e-45
617,ATCGAG,2999,2276,15.159702,3.268238e-52,1.338670e-48
618,CCGCGC,10055,8636,15.288050,4.592955e-53,1.881275e-49


In [36]:
def save_data_frame(df, dir_out, name, sub_dir, sub_sub_dir, kmax, df_type):
    """
    Saves a data frame as a header-less, index-less csv file.

    Inputs:
        df - the pandas data frame to be saved.
        dir_out - string representing the root output directory.
        name - string with the genus/species name; used as a directory level
               and as the file name prefix.
        sub_dir - string representing a sub directory below name.
        sub_sub_dir - string representing a sub directory below sub_dir.
        kmax - value appended to the file name.
        df_type - label placed in the middle of the file name (ex: 'selected').

    Outputs:
        None. Writes dir_out/name/sub_dir/sub_sub_dir/{name}_{df_type}_{kmax}.csv,
        creating the directories when they are missing.
    """
    path = os.path.join(dir_out, name, sub_dir, sub_sub_dir)
    csv_name = f'{name}_{df_type}_{kmax}.csv'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(path, exist_ok=True)
    # header and index are left out of the file on purpose (original behaviour)
    df.to_csv(os.path.join(path, csv_name), header=False, index=False)

In [37]:
# Persist the selected k-mers table.
# NOTE(review): name='Avocado' and sub_dir='Chromossome' do not match the
# genus names and the 'Chromosomes' directory used in the other cells --
# confirm these are the intended output locations.
save_data_frame(df_selected, dir_out='Results/Kmer_statistics', name='Avocado', sub_dir='Chromossome', sub_sub_dir='kmers', kmax=6, df_type= 'selected')

In [None]:
# Display the names taken from seq_len_dict (cell has not been executed).
names

In [None]:
# NOTE(review): `filename` is not assigned in any visible cell, so this
# raises NameError unless it is defined elsewhere -- confirm or remove.
kmer_counts = get_kmer_count_from_csv(filename)

In [None]:
# Display the candidate k-mers. NOTE(review): this prints the whole
# collection; consider showing only a slice.
kmer_list