In [1]:
import itertools
import os
import numpy as np
import sys
from pprint import pprint
import pandas as pd
import random
from collections import Counter
from Bio import SeqIO

In [1]:
def get_sequences(input_file):
    """Read the sequences of a FASTA-style file, one string per record.

    A record starts at a line beginning with '>' and its sequence is the
    concatenation of the following lines (trailing whitespace stripped)
    up to the next '>' line or the end of the file. Any text before the
    first '>' line is ignored.

    Parameters
    ----------
    input_file : str or path-like
        Path of the file to read.

    Returns
    -------
    list of str
        One sequence per header line, in file order. Empty if the file
        contains no header line.
    """
    sequences_list = []
    chunks = None  # stays None until the first header line has been seen
    # The context manager guarantees the handle is closed, even on error.
    with open(input_file) as handle:
        for line in handle:
            if line.startswith('>'):
                if chunks is not None:
                    sequences_list.append(''.join(chunks))
                chunks = []
            elif chunks is not None:
                chunks.append(line.rstrip())
    if chunks is not None:
        # Flush the record that was still open when the file ended.
        sequences_list.append(''.join(chunks))
    return sequences_list

def get_motifs(length,sequences_list):
    """Map every distinct word of the given length to a running index.

    Each sequence is scanned with a sliding window of `length`
    characters. A word receives the next free integer (starting at 0)
    the first time it is encountered.

    Parameters
    ----------
    length : int
        Window (word) size.
    sequences_list : iterable of str
        Sequences to scan.

    Returns
    -------
    dict
        word -> index, in order of first appearance.
    """
    motif_index = {}
    for sequence in sequences_list:
        window_count = len(sequence) - length + 1
        for start in range(window_count):
            kmer = sequence[start:start + length]
            # setdefault leaves known words untouched; for a new word the
            # current dict size is exactly the next unused index.
            motif_index.setdefault(kmer, len(motif_index))
    return motif_index

def get_headers(input_file):
    """Collect the record identifiers of a FASTA-style file.

    For every line starting with '>', all '>' characters are removed and
    the first whitespace-separated token is taken as the identifier.

    Parameters
    ----------
    input_file : str or path-like
        Path of the file to read.

    Returns
    -------
    list of str
        One identifier per header line, in file order. A header with no
        token (for example a bare '>') yields an empty string, so the list
        keeps one entry per header line instead of raising IndexError.
    """
    list_of_ids = []
    # The context manager guarantees the handle is closed, even on error.
    with open(input_file) as handle:
        for line in handle:
            if line.startswith('>'):
                fields = line.replace('>', '').split()
                list_of_ids.append(fields[0] if fields else '')
    return list_of_ids

def calculate_occurrences(length, input_file):
    """Count word occurrences for every sequence stored in a CSV file.

    The file is read with pandas and its "Sequence" column supplies the
    sequences. `get_motifs` assigns a column index to each distinct word
    of `length` characters; each sequence is then scanned with a sliding
    window and the matching cell is incremented.

    Parameters
    ----------
    length : int
        Word (window) size.
    input_file : str or path-like
        CSV file containing a "Sequence" column.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of sequences, number of distinct words)
        holding the counts (created by np.zeros, so floating point).
    """
    sequences_list = pd.read_csv(input_file)["Sequence"]
    motif_columns = get_motifs(length, sequences_list)
    print("Done with motif finding")
    counts = np.zeros(shape=(len(sequences_list), len(motif_columns)))
    for row, sequence in enumerate(sequences_list):
        last_start = len(sequence) - length
        for start in range(last_start + 1):
            column = motif_columns[sequence[start:start + length]]
            counts[row, column] += 1
    return counts

def calculate_frequencies(occurrences_list,seqs_number):
    """Turn per-sequence occurrence counts into relative frequencies.

    Each of the first `seqs_number` rows is divided by its own sum.

    Parameters
    ----------
    occurrences_list : array-like, 2-D
        Occurrence counts, one row per sequence.
    seqs_number : int
        Number of leading rows to normalise.

    Returns
    -------
    numpy.ndarray
        Float array of shape (seqs_number, n_columns). A row whose counts
        sum to zero is returned as all zeros rather than NaN, and
        `seqs_number <= 0` returns an empty (0, n_columns) array instead
        of failing.

    Raises
    ------
    IndexError
        If `seqs_number` exceeds the number of available rows.
    """
    counts = np.asarray(occurrences_list, dtype=float)
    if seqs_number > counts.shape[0]:
        raise IndexError(
            "seqs_number (%d) exceeds the number of rows (%d)"
            % (seqs_number, counts.shape[0])
        )
    counts = counts[:max(seqs_number, 0)]
    totals = counts.sum(axis=1, keepdims=True)
    # Divide only where the row total is non-zero; the pre-filled `out`
    # keeps zero-total rows at 0 and avoids a 0/0 -> NaN warning.
    return np.divide(counts, totals, out=np.zeros_like(counts), where=totals != 0)

In [7]:
def minkowski(list_,seqs_number,exponent):
    """Build the symmetric matrix of pairwise Minkowski distances.

    The distance between rows i and j of `list_` is
    (sum(|list_[i] - list_[j]| ** exponent)) ** (1 / exponent),
    evaluated with np.linalg.norm(..., ord=exponent).

    Parameters
    ----------
    list_ : numpy.ndarray, 2-D
        One feature vector per row.
    seqs_number : int
        Number of leading rows to compare.
    exponent : int or float
        Order of the norm.

    Returns
    -------
    numpy.ndarray
        (seqs_number, seqs_number) matrix with a zero diagonal.
    """
    distances = np.zeros([seqs_number, seqs_number])
    for first in range(seqs_number):
        # Only the upper triangle is computed; each value is mirrored.
        for second in range(first + 1, seqs_number):
            gap = np.linalg.norm(list_[first, :] - list_[second, :], ord=exponent)
            distances[first, second] = gap
            distances[second, first] = gap
    return distances
def euclidean(list_,seqs_number):
    """Pairwise Euclidean distance matrix: `minkowski` with exponent 2."""
    return minkowski(list_, seqs_number, exponent=2)

In [10]:
# For every per-country CSV: build the 3-mer occurrence ("spectral") vectors,
# save them, then compute and save the pairwise Euclidean distance matrix.
main_dir = 'All_Countries_Splitted'
direc_1 = "All_Countries_Spectral_Vects"
direc_2 = "All_Countries_Distance_Matrix"

# exist_ok=True makes the cell safe to re-run.
os.makedirs(direc_1, exist_ok=True)
os.makedirs(direc_2, exist_ok=True)

# os.listdir order is arbitrary; sort so every run processes files in the same order.
files = sorted(os.listdir(main_dir))

for file_ in files:
    inp_file = os.path.join(main_dir, file_)
    if not os.path.isfile(inp_file):
        # Skip sub-directories (e.g. notebook checkpoint folders).
        continue
    c_name = file_.split(".")[0]
    df = pd.read_csv(inp_file)

    print("Started working with ->",file_)
    sequences = df["Sequence"]
    print("n_seq->",sequences.shape)

    # Arguments are passed positionally: the previous call placed a positional
    # argument after a keyword argument, which is a SyntaxError.
    final_list = calculate_occurrences(3, inp_file)

    acc_vects = cont_ = np.array(final_list)
    print(acc_vects.shape)

    ID_arr = df["Accession ID"]
    ID_col = pd.Series(ID_arr)
    Vector_col = pd.Series(cont_.tolist())
    frame = {'Accession ID': ID_col,'Spectral Vector':Vector_col}
    df_final = pd.DataFrame(frame)
    df_final.to_csv(os.path.join(direc_1, c_name+"_Euclidean.csv"),index=False)

    matrix = euclidean(cont_,len(cont_))
    print(matrix.shape)
    final_df = pd.DataFrame(matrix,columns=ID_arr)
    final_df.to_csv(os.path.join(direc_2, c_name+"_accumulated_distance_matrix.csv"),index=False)


Started working with -> Finland.csv
n_seq-> (40,)
0
(40, 18)
(40, 40)
Started working with -> Luxembourg.csv
n_seq-> (86,)
0
(86, 18)
(86, 86)
Started working with -> Algeria.csv
n_seq-> (3,)
0
(3, 18)
(3, 3)
Started working with -> Argentina.csv
n_seq-> (3,)
0
(3, 18)
(3, 3)
Started working with -> Australia.csv
n_seq-> (1045,)
0
100
200
300
400
500
600
700
800
900
1000
(1045, 18)
(1045, 1045)
Started working with -> Austria.csv
n_seq-> (21,)
0
(21, 18)
(21, 21)
Started working with -> Bangladesh.csv
n_seq-> (7,)
0
(7, 18)
(7, 7)
Started working with -> Belarus.csv
n_seq-> (2,)
0
(2, 18)
(2, 2)
Started working with -> Belgium.csv
n_seq-> (386,)
0
100
200
300
(386, 18)
(386, 386)
Started working with -> Brazil.csv
n_seq-> (34,)
0
(34, 18)
(34, 34)
Started working with -> Cambodia.csv
n_seq-> (1,)
0
(1, 18)
(1, 1)
Started working with -> Canada.csv
n_seq-> (130,)
0
100
(130, 18)
(130, 130)
Started working with -> Chile.csv
n_seq-> (7,)
0
(7, 18)
(7, 7)
Started working with -> China.csv


In [28]:
# Input for the cells below. Alternative inputs used with this cell:
# Test_gisaid_LS.csv / Train_gisaid_LS.csv
inp_file = "Others_Af_CA_SA.csv"
df = pd.read_csv(filepath_or_buffer=inp_file)

# Raw sequences, one per row of the CSV.
sequences = df.loc[:, "Sequence"]

# Prefix used for the output file names written by later cells.
c_name = "Others"

In [31]:
# NOTE(review): `final_list` is not produced by any cell visible before this
# one for the current `df`; it appears to come from earlier kernel state —
# confirm before relying on a fresh Restart & Run All.
cont_ = np.array(final_list)
print(cont_.shape)

ID_arr = df["Accession ID"]
print(ID_arr.shape)

# Pair each accession ID with its vector (stored as a plain Python list).
ID_col = pd.Series(ID_arr)
Vector_col = pd.Series(cont_.tolist())
frame = {
    'Accession ID': ID_col,
    'Fast Vector': Vector_col,
}
df_final = pd.DataFrame(frame)

df_final.to_csv(f"{c_name}_Fast_vector.csv", index=False)

(137, 18)
(137,)


In [19]:
def minkowski(list_,seqs_number,exponent):
    """Build the symmetric matrix of pairwise Minkowski distances.

    NOTE(review): this cell repeats a definition that already appears in an
    earlier cell of the notebook; when executed, it shadows the earlier one.

    The distance between rows i and j of `list_` is
    (sum(|list_[i] - list_[j]| ** exponent)) ** (1 / exponent),
    evaluated with np.linalg.norm(..., ord=exponent).

    Parameters
    ----------
    list_ : numpy.ndarray, 2-D
        One feature vector per row.
    seqs_number : int
        Number of leading rows to compare.
    exponent : int or float
        Order of the norm.

    Returns
    -------
    numpy.ndarray
        (seqs_number, seqs_number) matrix with a zero diagonal.
    """
    distances = np.zeros([seqs_number, seqs_number])
    for first in range(seqs_number):
        # Only the upper triangle is computed; each value is mirrored.
        for second in range(first + 1, seqs_number):
            gap = np.linalg.norm(list_[first, :] - list_[second, :], ord=exponent)
            distances[first, second] = gap
            distances[second, first] = gap
    return distances
def euclidean(list_,seqs_number):
    """Pairwise Euclidean distance matrix: `minkowski` with exponent 2."""
    return minkowski(list_, seqs_number, exponent=2)

In [32]:
# Pairwise Euclidean distances between all rows of `cont_`.
matrix = euclidean(cont_, seqs_number=len(cont_))
print(matrix.shape)

(137, 137)


In [33]:
# Label the columns with the accession IDs; rows are written without an index.
final_df = pd.DataFrame(data=matrix, columns=ID_arr)
final_df.to_csv(f"{c_name}_fast_distance_matrix.csv", index=False)

**TODO (Puja):** The behaviour observed here is unexpected — please investigate and document the cause.

#

In [35]:
# Snapshot the current `final_list` as the test-set vectors.
# NOTE(review): assumes `final_list` currently holds the test-set vectors — confirm.
Test_list = np.array(final_list.copy())
print(Test_list.shape)

(2069, 18)


In [51]:
# Snapshot the current `final_list` as the training-set vectors.
# NOTE(review): assumes `final_list` was recomputed for the training file
# since the test snapshot was taken — confirm.
Train_list = np.array(final_list.copy())
print(Train_list.shape)

(8110, 18)


In [53]:
# Stack training vectors on top of test vectors (row-wise).
final = np.concatenate([Train_list, Test_list], axis=0)

# NOTE(review): presumably `df` holds the training CSV at this point — confirm.
Train_Id = df["Accession ID"]

print(final.shape)



(10179, 18)


In [55]:
# NOTE(review): presumably `df` was reloaded with the test CSV before this
# cell ran; otherwise Test_Id is identical to Train_Id — confirm.
Test_Id = df["Accession ID"]

# IDs in the same order as the stacked vectors: training first, then test.
ID_Final = np.concatenate([Train_Id, Test_Id], axis=0)
print(ID_Final.shape)


(10179,)


In [59]:
# Pair every accession ID with its vector (stored as a plain Python list).
ID_col = pd.Series(ID_Final)
Vector_col = pd.Series(final.tolist())

frame = {
    'Accession ID': ID_col,
    'Fast Vector': Vector_col,
}
df_final = pd.DataFrame(frame)

In [61]:
# Write the combined ID/vector table, without the index column.
df_final.to_csv(path_or_buf="Fast_Vector_GisAID.csv", index=False)