# Create k-mer matrix

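This notebook creates the k-mer count matrix for:

- Organism: *N. gonorrhoeae*
- Drug: azithromycin

Note: the paths used in the cells below point at the TB isoniazid data (`kmersDataTB/isoniazid`, `TBmetadata/isoniazid.csv`), while the saved file names still say `gonorrhoeae`. Update either this header or the paths so they agree.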

In [1]:
# Load the autoreload extension and enable mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import pylab as plt
import glob
import os

from scipy.sparse import coo_matrix

### Some useful functions

In [4]:
def load_df(data_dir):
    """Load every ``*.txt`` k-mer count file in ``data_dir`` into one long DataFrame.

    Each file is expected to be dsk2ascii output: headerless, space-separated
    rows of ``<kmer> <count>``. The strain name is taken from the file name
    with its final extension removed (``000304555.1.txt`` -> ``000304555.1``).
    Progress is printed for every file.

    Parameters
    ----------
    data_dir : str
        Folder that directly contains the ``.txt`` files (not searched
        recursively).

    Returns
    -------
    pandas.DataFrame
        Columns ``strain``, ``kmer``, ``count`` with one row per
        (strain, k-mer) pair and a fresh 0..n-1 index. When no input file is
        found, an empty frame with the same three columns is returned.
    """
    cols = ['strain', 'kmer', 'count']

    # sorted() makes the row order independent of the filesystem listing order
    filenames = sorted(glob.glob(os.path.join(data_dir, '*.txt')))
    print('Found %d input files' % (len(filenames)))

    if not filenames:
        # Return an empty frame with the expected schema instead of an implicit
        # None, so callers fail (or no-op) in an obvious way.
        return pd.DataFrame(columns=cols)

    per_strain = []
    for i, filename in enumerate(filenames, start=1):
        # load the output from dsk2ascii
        print('%d/%d %s' % (i, len(filenames), filename))
        strain_df = pd.read_csv(filename, header=None, sep=' ')
        strain_df = strain_df.rename(columns={0: 'kmer', 1: 'count'})

        # get strain information from filename: drop only the last extension,
        # since strain identifiers may themselves contain dots
        basename = os.path.basename(filename)
        strain_df['strain'] = basename.rsplit('.', 1)[0]

        # reorder columns (also drops any unexpected extra columns)
        per_strain.append(strain_df[cols])

    # ignore_index avoids repeating each file's 0..n-1 labels in the result
    combined = pd.concat(per_strain, ignore_index=True)

    unique_kmer_count = combined['kmer'].nunique(dropna=False)
    unique_strain_count = combined['strain'].nunique(dropna=False)
    print('Found %d unique kmers in %d strains (nnz=%d)' % (unique_kmer_count, unique_strain_count, combined.shape[0]))
    return combined

### Load k-mers from DSK output

Set `data_dir` to the folder containing the `.txt` files produced by `dsk2ascii` (one file per strain; the file name is used as the strain name).

In [5]:
# Folder that load_df scans for '*.txt' k-mer count files.
# NOTE(review): absolute, machine-specific path — change it to match your environment.
data_dir = '/home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid'

In [6]:
# Read every k-mer count file under data_dir into one long frame
# with columns strain, kmer, count (prints one progress line per file).
df = load_df(data_dir)

Found 1428 input files
1/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000304555.1.txt
2/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649215.1.txt
3/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649235.1.txt
4/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649255.1.txt
5/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649275.1.txt
6/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649295.1.txt
7/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649315.1.txt
8/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649335.1.txt
9/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649355.1.txt
10/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649375.1.txt
11/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649395.1.txt
12/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000649

111/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000651935.1.txt
112/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000651955.1.txt
113/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000651975.1.txt
114/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000651995.1.txt
115/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000652015.1.txt
116/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000652035.1.txt
117/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000652055.1.txt
118/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000652075.1.txt
119/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000652095.1.txt
120/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000652115.1.txt
121/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000652135.1.txt
122/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00065215

218/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654455.1.txt
219/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654475.1.txt
220/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654495.1.txt
221/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654515.1.txt
222/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654535.1.txt
223/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654555.1.txt
224/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654575.1.txt
225/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654595.1.txt
226/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654615.1.txt
227/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654635.1.txt
228/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000654655.1.txt
229/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00065467

319/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656575.1.txt
320/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656595.1.txt
321/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656615.1.txt
322/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656635.1.txt
323/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656655.1.txt
324/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656675.1.txt
325/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656695.1.txt
326/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656715.1.txt
327/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656735.1.txt
328/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656755.1.txt
329/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000656775.1.txt
330/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00065679

420/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659045.1.txt
421/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659085.1.txt
422/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659105.1.txt
423/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659125.1.txt
424/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659145.1.txt
425/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659165.1.txt
426/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659185.1.txt
427/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659205.1.txt
428/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659225.1.txt
429/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659245.1.txt
430/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000659265.1.txt
431/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00065928

528/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661525.1.txt
529/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661555.1.txt
530/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661575.1.txt
531/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661595.1.txt
532/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661615.1.txt
533/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661635.1.txt
534/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661655.1.txt
535/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661675.1.txt
536/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661695.1.txt
537/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661715.1.txt
538/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000661735.1.txt
539/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00066175

626/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664085.1.txt
627/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664105.1.txt
628/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664125.1.txt
629/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664145.1.txt
630/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664165.1.txt
631/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664185.1.txt
632/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664205.1.txt
633/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664225.1.txt
634/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664245.1.txt
635/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664265.1.txt
636/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000664285.1.txt
637/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00066430

724/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000666665.1.txt
725/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000666785.1.txt
726/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000666805.1.txt
727/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000666885.1.txt
728/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000666905.1.txt
729/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000666945.1.txt
730/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000666965.1.txt
731/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000667025.1.txt
732/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000667045.1.txt
733/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000667065.1.txt
734/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000667085.1.txt
735/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00066710

827/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669155.1.txt
828/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669175.1.txt
829/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669195.1.txt
830/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669215.1.txt
831/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669235.1.txt
832/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669255.1.txt
833/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669295.1.txt
834/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669315.1.txt
835/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669335.1.txt
836/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669375.1.txt
837/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000669395.1.txt
838/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00066941

935/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000671915.1.txt
936/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000671935.1.txt
937/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000671955.1.txt
938/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000671975.1.txt
939/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000671995.1.txt
940/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000672015.1.txt
941/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000672035.1.txt
942/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000672055.1.txt
943/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000672075.1.txt
944/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000672095.1.txt
945/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000672115.1.txt
946/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/00067213

1033/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674035.1.txt
1034/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674055.1.txt
1035/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674075.1.txt
1036/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674095.1.txt
1037/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674115.1.txt
1038/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674135.1.txt
1039/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674155.1.txt
1040/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674175.1.txt
1041/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674195.1.txt
1042/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674215.1.txt
1043/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000674235.1.txt
1044/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isonia

1130/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676135.1.txt
1131/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676155.1.txt
1132/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676175.1.txt
1133/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676195.1.txt
1134/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676215.1.txt
1135/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676235.1.txt
1136/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676255.1.txt
1137/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676275.1.txt
1138/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676295.1.txt
1139/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676315.1.txt
1140/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000676335.1.txt
1141/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isonia

1228/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678155.1.txt
1229/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678175.1.txt
1230/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678195.1.txt
1231/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678215.1.txt
1232/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678235.1.txt
1233/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678255.1.txt
1234/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678275.1.txt
1235/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678295.1.txt
1236/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678315.1.txt
1237/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678335.1.txt
1238/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000678355.1.txt
1239/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isonia

1332/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680315.1.txt
1333/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680335.1.txt
1334/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680355.1.txt
1335/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680375.1.txt
1336/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680395.1.txt
1337/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680415.1.txt
1338/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680435.1.txt
1339/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680455.1.txt
1340/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680475.1.txt
1341/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680495.1.txt
1342/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isoniazid/000680515.1.txt
1343/1428 /home/joewandy/Downloads/kmersDataTB/kmersDataTB/isonia

Found 2398119 unique kmers in 1428 strains (nnz=50537573)


### Construct a feature matrix

- build a sparse COO matrix by passing it lists of row indices, column indices and values
- filter k-mers so that each appears at least N times in a strain and M times across all strains, e.g. N = 1, M = 20
- go through the FASTA files and find the start and end positions of the k-mers
- load the BGC clustering and find the start and end of each BGC
- compare the overlap and the vicinity overlap

do the following drugs:
- isoniazid (INH)
- rifampin (RIF)
- ethambutol (EMB)
- pyrazinamide (PZA)

Using COO Matrix

In [10]:
# extracts unique column values from a dataframe and gives it an index
def get_unique(df, col):
    """Return the distinct values of ``df[col]`` and a value -> position lookup.

    The values keep their order of first appearance in the column; the
    dictionary maps each value to its position in that array.
    """
    values = df[col].unique()
    position_of = dict(zip(values, range(len(values))))
    return values, position_of

In [49]:
def get_feature_matrix(df):
    """Build the sparse strain x k-mer count matrix from a long-format frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``strain``, ``kmer`` and ``count``; one row
        per (strain, k-mer) observation.

    Returns
    -------
    mat : scipy.sparse.csr_matrix
        Shape ``(n_strains, n_kmers)``; entry ``[i, j]`` is the count of
        k-mer ``j`` in strain ``i``. Repeated (strain, k-mer) rows are summed
        by the COO -> CSR conversion.
    unique_strains : array-like
        Row labels, in order of first appearance in ``df``.
    unique_kmers : array-like
        Column labels, in order of first appearance in ``df``.

    Raises
    ------
    ValueError
        If ``strain`` or ``kmer`` contains missing values, which cannot be
        mapped to a row/column.
    """
    # factorize assigns 0..n-1 codes in order of first appearance in a single
    # vectorised pass; this avoids a Python-level dict lookup for every row.
    strain_idx, unique_strains = pd.factorize(df['strain'].values)
    kmer_idx, unique_kmers = pd.factorize(df['kmer'].values)
    if (strain_idx < 0).any() or (kmer_idx < 0).any():
        # factorize marks missing values with -1
        raise ValueError("'strain' and 'kmer' must not contain missing values")
    data = df['count'].values # count of kmer in strain

    # create sparse matrix
    shape = (len(unique_strains), len(unique_kmers))
    mat = coo_matrix((data, (strain_idx, kmer_idx)), shape=shape).tocsr()

    nnz_count = mat.count_nonzero()
    n_cells = shape[0] * shape[1]
    # fraction of cells that are non-zero, i.e. the density (not the sparsity);
    # guarded so an empty input does not divide by zero
    density = nnz_count / n_cells if n_cells else 0.0
    print('nnz_count=%d density=%.4f' % (nnz_count, density))
    return mat, unique_strains, unique_kmers

In [50]:
# Build the sparse strain x k-mer count matrix together with its
# row labels (strains) and column labels (kmers).
mat, strains, kmers = get_feature_matrix(df)

nnz_count=50537573 sparsity=0.0148


### Load target prediction

In [None]:
# Load the resistance labels: a headerless CSV whose first two columns are
# named 'strain' and 'resistance' here.
predicted_df = (
    pd.read_csv('/home/joewandy/git/amr/TBmetadata/isoniazid.csv', sep=',', header=None)
    .rename(columns={0: 'strain', 1: 'resistance'})
)
# NOTE(review): set_index returns a new frame that is only displayed as the
# cell output; predicted_df itself keeps 'strain' as an ordinary column.
predicted_df.set_index('strain')

### Save the extracted features

In [None]:
# NOTE(review): feature_df is not assigned in any cell shown before this one
# (the matrix cell produces mat / strains / kmers) — presumably it was built
# elsewhere; confirm, otherwise this raises NameError on a fresh kernel.
# NOTE(review): the file name says "gonorrhoeae" while the data loaded above
# comes from the isoniazid folder — confirm the intended name.
feature_df.to_pickle('/home/joewandy/Downloads/gonorrhoeae_feature_df.zip')

In [None]:
# Persist the resistance labels next to the feature pickle.
# NOTE(review): the file name says "gonorrhoeae" while the labels were read
# from isoniazid.csv — confirm the intended name.
predicted_df.to_pickle('/home/joewandy/Downloads/gonorrhoeae_predicted_df.zip')

In [None]:
# pandas is imported again here — presumably so this cell can be run on its
# own in a fresh kernel to reload the saved frames; confirm.
import pandas as pd

# Read back the two pickles written by the save cells.
feature_df = pd.read_pickle('/home/joewandy/Downloads/gonorrhoeae_feature_df.zip')
predicted_df = pd.read_pickle('/home/joewandy/Downloads/gonorrhoeae_predicted_df.zip')

# Cell result: the shapes of both frames.
feature_df.shape, predicted_df.shape