In [1]:
import itertools
import os
import numpy as np
import sys
from pprint import pprint
import pandas as pd
import random
from collections import Counter
from Bio import SeqIO

In [2]:
def encode(seq,change_map):
    n_seq = []
    for base in seq:
        if base in change_map:
            n_seq.append(change_map[base])
        else:
            print("Problem!!!")
            print(base)
            print(len(seq),len(n_seq))
    
    assert len(seq) == len(n_seq)
    return n_seq

In [3]:
def R_Y_coding(seq):
    change_map = {'A':'R','G':'R','C':'Y','T':'Y'}
    # pprint(change_map)
    
    n_seq = encode(seq,change_map)
    unique,count = np.unique(n_seq,return_counts=True)
    n_r = count[0]
    n_y = count[1]
    # print(unique,count)
    ry_encode_seq = ''.join(n_seq)
    # print(ry_encode_seq)
    return (ry_encode_seq,n_r,n_y)

def M_K_coding(seq):
    change_map = {'A':'M','G':'K','C':'M','T':'K'}
    # pprint(change_map)

    n_seq = encode(seq,change_map)
    counts = Counter(n_seq)
    # print(counts)
    n_m = counts['M']
    n_k = counts['K']
    mk_encode_seq = ''.join(n_seq)
    return (mk_encode_seq,n_m,n_k)

def S_W_coding(seq):
    change_map = {'A':'W','G':'S','C':'S','T':'W'}
    # pprint(change_map)

    n_seq = encode(seq,change_map)
    counts = Counter(n_seq)
    # print(counts)
    n_s = counts['S']
    n_w = counts['W']
    sw_encode_seq = ''.join(n_seq)
    return (sw_encode_seq,n_s,n_w)


# print(mk_encode_seq,n_m,n_k)

In [4]:
def get_mean_position(seq,n_1,c):
    length = len(seq)
    sum = 0
    for idx in range(length):
        if(seq[idx] == c):
            sum+= (idx*(1.0/n_1))
    return sum

In [5]:
def get_variance(seq,n_1,meu,c):
    length = len(seq)
    sum = 0
    for i in range(length):
        if(seq[i] == c):
            sum += (((i-meu)**2)*1.0)/(n_1*length)
    return sum

In [6]:
def get_NFV(seq):
    (ry_encode_seq,n_r,n_y)  = R_Y_coding(seq)
    (mk_encode_seq,n_m,n_k) = M_K_coding(seq)
    (sw_encode_seq,n_s,n_w) = S_W_coding(seq)

    meu_r = get_mean_position(ry_encode_seq,n_r,'R')
    meu_y = get_mean_position(ry_encode_seq,n_y,'Y')

    meu_m = get_mean_position(mk_encode_seq,n_m,'M')
    meu_k = get_mean_position(mk_encode_seq,n_k,'K')

    meu_s = get_mean_position(sw_encode_seq,n_s,'S')
    meu_w = get_mean_position(sw_encode_seq,n_w,'W')

    D_r = get_variance(ry_encode_seq,n_r,meu_r,'R')
    D_y = get_variance(ry_encode_seq,n_y,meu_y,'Y')

    D_m = get_variance(mk_encode_seq,n_m,meu_m,'M')
    D_k = get_variance(mk_encode_seq,n_k,meu_k,'K')

    D_s = get_variance(sw_encode_seq,n_s,meu_s,'S')
    D_w = get_variance(sw_encode_seq,n_w,meu_w,'W')
    
    Fast_vector = [n_r,meu_r,D_r,n_y,meu_y,D_y,n_m,meu_m,D_m,n_k,meu_k,D_k,n_s,meu_s,D_s,n_w,meu_w,D_w]
    assert len(Fast_vector) == 18
    return Fast_vector

In [7]:
def minkowski(list_,seqs_number,exponent): 
    matrix = np.zeros([seqs_number, seqs_number])
    for i, j in itertools.combinations(range(0,seqs_number),2):
         matrix[i][j]= matrix [j][i] = np.linalg.norm((list_[i,:]-list_[j,:]),ord=exponent)
        #  (np.sum((np.absolute(list_[i,:] - list_[j,:]))**exponent))**(1.0/float(exponent))
    return matrix
def euclidean(list_,seqs_number):
    return minkowski(list_,seqs_number,2)

In [9]:
seq_fl = SeqIO.parse("sequences_2.fasta","fasta")

map_ = {}
for sequ in seq_fl:
    ref_seq = str(sequ.seq)
    seq_id = sequ.id 
    print(sequ.description)
    map_["Accession ID"]= seq_id
    map_["Virus_name"] = sequ.description
    map_["Location"] = "Reference/China/Wuhan"
    map_["Collection Date"] = "2019-12-18"
    map_["Sequence"] = ref_seq

pprint(map_)
ref_seq_df = pd.DataFrame(map_,index=[0])
ref_seq_df.to_csv("All_Countries_Splitted/Ref_seq.csv",index=False)
print(len(ref_seq))

NC_045512 |Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1| complete genome
{'Accession ID': 'NC_045512',
 'Collection Date': '2019-12-18',
 'Location': 'Reference/China/Wuhan',
 'Sequence': 'ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCG

In [10]:
files = os.listdir('All_Countries_Splitted')
main_dir = 'All_Countries_Splitted'
if not os.path.exists("All_Countries_NFV_Vects"):
    os.mkdir("All_Countries_NFV_Vects")
if not os.path.exists("All_Countries_Distance_Matrix"):
    os.mkdir("All_Countries_Distance_Matrix")

NFV_Vects = os.listdir("All_Countries_NFV_Vects")
# contries_ = []
# for fl in NFV_Vects:
#     contries_.append(fl.split("_")[0])

for file_ in files:
    
    inp_file = main_dir + "/"+file_
    c_name = file_.split(".")[0]
    # if(c_name in contries_):
    #     continue
    df = pd.read_csv(inp_file)
    
    
    print("Started working with ->",file_)
    sequences = df["Sequence"]
    print("n_seq->",sequences.shape)
    final_list = []
    count = 0
    # for seq in sequences:

    #     n_seq = encode()
    for seq in sequences:
        Fast_vector = get_NFV(seq)
        final_list.append(Fast_vector)
        if(count%100 == 0):
            print(count)
        count +=1
        # print(Fast_vector)
    
    acc_vects = np.array(final_list)
    print(acc_vects.shape)
    cont_ = np.array(acc_vects)
    # print(cont_.shape)
    ID_arr = df["Accession ID"]
    # print(ID_arr.shape)
    ID_col = pd.Series(ID_arr)
    Vector_col = pd.Series(cont_.tolist())
    frame = {'Accession ID': ID_col,'Fast Vector':Vector_col}
    df_final = pd.DataFrame(frame)
    direc_1 = "All_Countries_NFV_Vects"
    df_final.to_csv(direc_1+"/"+c_name+"_Novel_Fast_vector.csv",index=False)
    direc_2 = "All_Countries_Distance_Matrix"
    matrix = euclidean(cont_,len(cont_))
    print(matrix.shape)
    final_df = pd.DataFrame(matrix,columns=ID_arr)
    final_df.to_csv(direc_2+"/"+c_name+"_accumulated_distance_matrix.csv",index=False)


Started working with -> Finland.csv
n_seq-> (40,)
0
(40, 18)
(40, 40)
Started working with -> Luxembourg.csv
n_seq-> (86,)
0
(86, 18)
(86, 86)
Started working with -> Algeria.csv
n_seq-> (3,)
0
(3, 18)
(3, 3)
Started working with -> Argentina.csv
n_seq-> (3,)
0
(3, 18)
(3, 3)
Started working with -> Australia.csv
n_seq-> (1045,)
0
100
200
300
400
500
600
700
800
900
1000
(1045, 18)
(1045, 1045)
Started working with -> Austria.csv
n_seq-> (21,)
0
(21, 18)
(21, 21)
Started working with -> Bangladesh.csv
n_seq-> (7,)
0
(7, 18)
(7, 7)
Started working with -> Belarus.csv
n_seq-> (2,)
0
(2, 18)
(2, 2)
Started working with -> Belgium.csv
n_seq-> (386,)
0
100
200
300
(386, 18)
(386, 386)
Started working with -> Brazil.csv
n_seq-> (34,)
0
(34, 18)
(34, 34)
Started working with -> Cambodia.csv
n_seq-> (1,)
0
(1, 18)
(1, 1)
Started working with -> Canada.csv
n_seq-> (130,)
0
100
(130, 18)
(130, 130)
Started working with -> Chile.csv
n_seq-> (7,)
0
(7, 18)
(7, 7)
Started working with -> China.csv


In [28]:
inp_file = "Others_Af_CA_SA.csv" #Test_gisaid_LS.csv/Train_gisaid_LS.csv
# length = sys.argv[2]
df = pd.read_csv(inp_file)


sequences = df["Sequence"]
# seq = sequences[0]
c_name = "Others"

In [31]:
cont_ = np.array(final_list)
print(cont_.shape)
ID_arr = df["Accession ID"]
print(ID_arr.shape)

ID_col = pd.Series(ID_arr)
Vector_col = pd.Series(cont_.tolist())

frame = {'Accession ID': ID_col,'Fast Vector':Vector_col}
df_final = pd.DataFrame(frame)

df_final.to_csv(c_name+"_Fast_vector.csv",index=False)

(137, 18)
(137,)


In [19]:
def minkowski(list_,seqs_number,exponent): 
    matrix = np.zeros([seqs_number, seqs_number])
    for i, j in itertools.combinations(range(0,seqs_number),2):
         matrix[i][j]= matrix [j][i] = np.linalg.norm((list_[i,:]-list_[j,:]),ord=exponent)
        #  (np.sum((np.absolute(list_[i,:] - list_[j,:]))**exponent))**(1.0/float(exponent))
    return matrix
def euclidean(list_,seqs_number):
    return minkowski(list_,seqs_number,2)

In [32]:
matrix = euclidean(cont_,len(cont_))
print(matrix.shape)

(137, 137)


In [33]:
final_df = pd.DataFrame(matrix,columns=ID_arr)
final_df.to_csv(c_name+"_fast_distance_matrix.csv",index=False)

What the hell is this behaviour Puja?

#

In [35]:
Test_list = final_list.copy()
Test_list = np.array(Test_list)
print(Test_list.shape)

(2069, 18)


In [51]:
Train_list = final_list.copy()
Train_list = np.array(Train_list)
print(Train_list.shape)

(8110, 18)


In [53]:
final = np.concatenate((Train_list,Test_list))
Train_Id = df["Accession ID"]

print(final.shape)



(10179, 18)


In [55]:
Test_Id = df["Accession ID"]

ID_Final = np.concatenate((Train_Id,Test_Id))
print(ID_Final.shape)


(10179,)


In [59]:
ID_col = pd.Series(ID_Final)
Vector_col = pd.Series(final.tolist())

frame = {'Accession ID': ID_col,'Fast Vector':Vector_col}
df_final = pd.DataFrame(frame)

In [61]:
df_final.to_csv("Fast_Vector_GisAID.csv",index=False)