In [2]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import Bio
from Bio import SeqIO,AlignIO
from sklearn.feature_extraction.text import CountVectorizer

# Read in files, generate k-mer table, calculate column interval 

In [105]:
def get_kmer_table(genes,gene_len):
    
    count_vect = CountVectorizer(analyzer='char',ngram_range=(2,5))
    X = count_vect.fit_transform(genes)
    chars = count_vect.get_feature_names()
    chars
    kmers = X.toarray()
    kmer_freq = []
    for i in range(len(genes)):
        kmer_freq.append(kmers[i]/gene_len[i])
    input = pd.DataFrame(kmer_freq,columns=chars)
    return input

def get_ids(filename):
    ids = []
    for record in SeqIO.parse(filename, "fasta"):
        ids.append(record.id)

def get_gene_sequences(filename):
    genes = []
    for record in SeqIO.parse(filename, "fasta"):
        genes.append(str(record.seq))
    return genes

# genes: a list of gene sequences, which can directly be generated from get_gene_sequences().
def get_gene_len(genes):
    gene_len = []
    
    for i in range(len(genes)):
        gene_len.append(len(genes[i]))
    return gene_len
    
#data: a dataframe with kmer frequencies as columns
def get_interval(data):
    max = []
    min = []
    mean = []
    
    for column in data:
        columnSeriesObj = data[column]
        max.append(columnSeriesObj.max())
        min.append(columnSeriesObj.min())
        mean.append(columnSeriesObj.mean())
        
    interval = pd.DataFrame({'max': max, 'min': min, 'mean' : mean},index=data.columns)
    return interval

In [61]:
genes_0 = get_gene_sequences("label0.fasta")
genes_1 = get_gene_sequences("label1.fasta")
gene_len_0 = get_gene_len(genes_0)
gene_len_1 = get_gene_len(genes_1)
all_genes = genes_0 + genes_1
all_gene_len = gene_len_0 + gene_len_1
kmer_table = get_kmer_table(all_genes,all_gene_len)
kmer_table

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaac,aaaag,aaaat,aaac,aaaca,aaacc,...,ysc,yscc,ysccg,yt,ytc,ytca,ytcag,ytt,ytta,yttac
0,0.077253,0.024594,0.007960,0.002347,0.001735,0.001735,0.002143,0.005409,0.001531,0.000918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.029176,0.005106,0.000486,0.000000,0.000243,0.000000,0.000243,0.001702,0.000243,0.000243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.068164,0.016962,0.004241,0.001649,0.001021,0.000707,0.000785,0.004476,0.000628,0.001492,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.074582,0.023697,0.004989,0.000748,0.001746,0.000748,0.001746,0.004989,0.001247,0.000499,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.087324,0.024485,0.007151,0.001517,0.001733,0.002384,0.001517,0.006284,0.001083,0.001300,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,0.072492,0.018723,0.005349,0.002400,0.001029,0.000892,0.000960,0.004321,0.001372,0.001234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353,0.086561,0.021344,0.005534,0.001186,0.001976,0.001186,0.001186,0.004743,0.001581,0.001186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
354,0.074803,0.018275,0.005111,0.001084,0.001704,0.000929,0.001394,0.004336,0.001704,0.001084,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
355,0.077117,0.017770,0.004694,0.001509,0.001509,0.000671,0.001006,0.005197,0.002012,0.001509,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
interval_0 = get_interval(kmer_table[0:len(genes_0)])
interval_0

Unnamed: 0,max,min,mean
aa,0.159256,0.023429,7.189462e-02
aaa,0.085508,0.003405,2.029994e-02
aaaa,0.069961,0.000000,6.320738e-03
aaaaa,0.062188,0.000000,2.424284e-03
aaaac,0.005023,0.000000,1.193453e-03
...,...,...,...
ytca,0.000076,0.000000,2.171741e-07
ytcag,0.000076,0.000000,2.171741e-07
ytt,0.000387,0.000000,1.104423e-06
ytta,0.000387,0.000000,1.104423e-06


In [116]:
interval_1 = get_interval(kmer_table[len(genes_0)+1:len(all_genes)])
interval_1

Unnamed: 0,max,min,mean
aa,0.102380,0.072492,0.085402
aaa,0.034585,0.017770,0.023499
aaaa,0.011496,0.004694,0.006897
aaaaa,0.005173,0.001084,0.002309
aaaac,0.002400,0.001029,0.001803
...,...,...,...
ytca,0.000000,0.000000,0.000000
ytcag,0.000000,0.000000,0.000000
ytt,0.000000,0.000000,0.000000
ytta,0.000000,0.000000,0.000000


#### notice some intervals are [0,0] in label1, but non-zero in label0

## Check if any interval overlaps:

In [108]:
def check_overlap(interval_0,interval_1):
    list = []
    max_0 = interval_0['max']
    min_0 = interval_0['min']
    max_1 = interval_1['max']
    min_1 = interval_1['min']
    
    for i in range(len(max_0)):
        if(min_0[i] >= max_1[i] or min_1[i] >= max_0[i]):
            list.append(interval_0.index.values[i])
    
    return list

K-mers that have distinct intervals:

In [109]:
chars_distinct = check_overlap(interval_0, interval_1)
chars_distinct # 456 k-mers

['aaan',
 'aaann',
 'aaar',
 'aaarg',
 'aack',
 'aackc',
 'aacn',
 'aacng',
 'aan',
 'aann',
 'aannt',
 'aar',
 'aarg',
 'aargc',
 'aay',
 'aayc',
 'aaycg',
 'aays',
 'aaysc',
 'acak',
 'acakg',
 'ack',
 'ackc',
 'ackct',
 'acn',
 'acng',
 'acngg',
 'acr',
 'acra',
 'acraa',
 'acttr',
 'agatn',
 'agaw',
 'agawg',
 'agcgk',
 'aggr',
 'aggra',
 'agr',
 'agrs',
 'agrss',
 'agrt',
 'agrtc',
 'agtgr',
 'agtk',
 'agtkg',
 'ak',
 'akg',
 'akgg',
 'akggt',
 'an',
 'ana',
 'anaa',
 'anaaa',
 'ann',
 'annt',
 'anntg',
 'ar',
 'ara',
 'arag',
 'araga',
 'arg',
 'argc',
 'argct',
 'atatr',
 'ataty',
 'atcgr',
 'atn',
 'atna',
 'atnaa',
 'atnn',
 'atnnc',
 'atr',
 'atra',
 'atrat',
 'atttw',
 'aty',
 'atyt',
 'atytt',
 'aw',
 'awg',
 'awga',
 'awgaa',
 'ay',
 'ayc',
 'aycg',
 'aycgt',
 'ayg',
 'ayga',
 'aygac',
 'ays',
 'aysc',
 'ayscc',
 'caay',
 'caayc',
 'cacr',
 'cacra',
 'caggr',
 'cagr',
 'cagrt',
 'cak',
 'cakg',
 'cakgg',
 'can',
 'cana',
 'canaa',
 'car',
 'cara',
 'carag',
 'catn',
 'catn

In [120]:
# save the distinct intervals for future use
distinct_interval_0 = interval_0.loc[chars_distinct, : ]
distinct_interval_0

Unnamed: 0,max,min,mean
aaan,0.000052,0.0,1.480922e-07
aaann,0.000052,0.0,1.480922e-07
aaar,0.000104,0.0,2.964149e-07
aaarg,0.000104,0.0,2.964149e-07
aack,0.000349,0.0,9.958672e-07
...,...,...,...
ytca,0.000076,0.0,2.171741e-07
ytcag,0.000076,0.0,2.171741e-07
ytt,0.000387,0.0,1.104423e-06
ytta,0.000387,0.0,1.104423e-06


In [121]:
distinct_interval_1 = interval_1.loc[chars_distinct, : ]
distinct_interval_1

Unnamed: 0,max,min,mean
aaan,0.0,0.0,0.0
aaann,0.0,0.0,0.0
aaar,0.0,0.0,0.0
aaarg,0.0,0.0,0.0
aack,0.0,0.0,0.0
...,...,...,...
ytca,0.0,0.0,0.0
ytcag,0.0,0.0,0.0
ytt,0.0,0.0,0.0
ytta,0.0,0.0,0.0


In [110]:
# select the overlapping interval
overlap_interval_0 = interval_0.drop(chars_distinct,axis = 0)

In [111]:
overlap_interval_0

Unnamed: 0,max,min,mean
aa,0.159256,0.023429,0.071895
aaa,0.085508,0.003405,0.020300
aaaa,0.069961,0.000000,0.006321
aaaaa,0.062188,0.000000,0.002424
aaaac,0.005023,0.000000,0.001193
...,...,...,...
tttt,0.017760,0.000000,0.004134
tttta,0.005267,0.000000,0.000985
ttttc,0.004692,0.000000,0.001076
ttttg,0.008197,0.000000,0.001223


In [113]:
overlap_interval_1 = interval_1.drop(chars_distinct,axis = 0)
overlap_interval_1

Unnamed: 0,max,min,mean
aa,0.102380,0.072492,0.085402
aaa,0.034585,0.017770,0.023499
aaaa,0.011496,0.004694,0.006897
aaaaa,0.005173,0.001084,0.002309
aaaac,0.002400,0.001029,0.001803
...,...,...,...
tttt,0.006994,0.003407,0.005131
tttta,0.003162,0.000465,0.001288
ttttc,0.002012,0.000791,0.001398
ttttg,0.002299,0.000395,0.001295


# Algorithm to solve overlapping (Incomplete)

In [None]:
def gen_dist_interval(interval_0, interval_1):
    max_0 = interval_0['max']
    min_0 = interval_0['min']
    max_1 = interval_1['max']
    min_1 = interval_1['min']
    
    if(min_0 >= max_1 or min_1 >= max_0):
        return 
    elseif(min_0 < min_1 and max_1<max_0):
        

In [None]:
for i in range(len(overlap_interval_0)):
    gen_dist_interval(overlap_interval_0[i],overlap_interval_1[i])

# Generate virual samples from distinct intervals

### First try: create 100 samples

In [132]:
def listmaker(x,n):
    lis = [x] * n
    return lis

def sample_generator(distinct_interval):
    samples = []
    for i in range(len(distinct_interval)):
        mu = distinct_interval['min'][i] + (distinct_interval['max'][i] - distinct_interval['min'][i])/2
        sigma = (distinct_interval['max'][i] - distinct_interval['min'][i])/6
        if (distinct_interval['max'][i] - distinct_interval['min'][i] == 0):
            samples.append(listmaker(distinct_interval['max'][i],100))
        else:
            samples.append(np.random.normal(mu, sigma, 100))
        
    return samples

data2_0 = sample_generator(distinct_interval_0)

In [133]:
len(data2_0)

456

In [134]:
len(data2_0[1])

100

# Model & Test

# Next steps:

1. Complete the algorithm for solving overlapping intervals
2. Build a preliminary model(linear regression,etc) so that we can test with or without virual samples.
3. Improve the method for generating new samples.
4. Add some data visualization plots to help understanding.