In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from itertools import permutations

import networkx as nx

import os

In [2]:
def print_edges(edges):
    """Print the given edge ids on one line, separated by commas."""
    print(','.join(map(str, edges)))

# Табличка с референсами

Я переименовала контиги одного референса таким образом, чтобы они были в формате

*имяРеференса_номерКонтига*

In [3]:
# Preview the first five lines of refs_edges.txt.
!head -5 refs_edges.txt

5515248	s7_3	s9_8	s5_2
5607954	s7_3	s5_2
5601674	s7_10	s9_27	s5_1
5427068	s7_10	s5_1
5546564	s9_27


Считываем файл ответа sequence-threader, как он есть

In [4]:
# Read every line of refs_edges.txt into a single raw text column "e".
# dtype=str keeps lines that consist of an edge id only as strings, so the
# .str accessor below works on every row.
df_ref = pd.read_csv("refs_edges.txt", header=None, names=["e"], dtype=str)

# Split at the first tab only: edge id on the left, the rest of the line
# (still tab-separated contig names) on the right. `n` and `expand` are
# passed by keyword because pandas >= 2.0 no longer accepts them positionally.
df_ref = df_ref["e"].str.split('\t', n=1, expand=True)
df_ref.columns = ["e_id", "strains"]
df_ref = df_ref.set_index("e_id")
df_ref.index = df_ref.index.astype("int")
# Lines without any contig get a placeholder in the same name_number format.
df_ref.loc[df_ref["strains"].isnull(), "strains"] = "nobody_0"
df_ref.head()

Unnamed: 0_level_0,strains
e_id,Unnamed: 1_level_1
5515248,s7_3\ts9_8\ts5_2
5607954,s7_3\ts5_2
5601674,s7_10\ts9_27\ts5_1
5427068,s7_10\ts5_1
5546564,s9_27


Разбиваем список референсов на отдельные имена:

In [5]:
# Turn the tab-separated contig names into a Counter keyed by reference name:
# everything before the last '_' of each contig name is the reference.
df_ref["strains"] = df_ref["strains"].apply(
    lambda contigs: Counter(name.rpartition('_')[0] for name in contigs.split('\t'))
)
df_ref.head()

Unnamed: 0_level_0,strains
e_id,Unnamed: 1_level_1
5515248,"{'s7': 1, 's9': 1, 's5': 1}"
5607954,"{'s7': 1, 's5': 1}"
5601674,"{'s7': 1, 's9': 1, 's5': 1}"
5427068,"{'s7': 1, 's5': 1}"
5546564,{'s9': 1}


Считаем копийность каждого ребра:

In [6]:
# An edge is single-copy when no reference contributes more than one contig,
# i.e. the largest count in its Counter is exactly 1.
df_ref["single_copy"] = df_ref["strains"].apply(
    lambda counts: max(counts.values()) == 1
)
df_ref.head()

Unnamed: 0_level_0,strains,single_copy
e_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5515248,"{'s7': 1, 's9': 1, 's5': 1}",True
5607954,"{'s7': 1, 's5': 1}",True
5601674,"{'s7': 1, 's9': 1, 's5': 1}",True
5427068,"{'s7': 1, 's5': 1}",True
5546564,{'s9': 1},True


# Считываем профили

In [7]:
# Reference abundance profile: rows are indexed by the first CSV column,
# the remaining columns hold the values to normalise.
ref_profile = pd.read_csv("profile.csv", header=None, index_col=0)
# Scale every column so it sums to 1. Dividing by the per-column sums handles
# any number of columns instead of assuming exactly ten.
ref_profile = ref_profile / ref_profile.sum()
ref_profile

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7,0.44,0.13,0.1,0.42,0.06,0.55,0.76,0.3,0.22,0.23
9,0.55,0.84,0.11,0.53,0.12,0.32,0.14,0.2,0.78,0.1
5,0.01,0.03,0.79,0.05,0.82,0.13,0.1,0.5,0.0,0.67


In [8]:
# Every column, including the index column, is parsed as float; the row
# labels are then cast to integers.
desman_freqs_path = "desman_freqs.csv"
desman_profile = pd.read_csv(desman_freqs_path, index_col=0, header=None, dtype=float)
desman_row_labels = desman_profile.index.astype(int)
desman_profile.index = desman_row_labels
desman_profile

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.494,0.164,0.121,0.475,0.074,0.582,0.773,0.32,0.261,0.252
1,0.494,0.802,0.098,0.472,0.108,0.286,0.122,0.179,0.736,0.091
2,0.012,0.034,0.781,0.053,0.818,0.132,0.105,0.501,0.002,0.658


Ищем соответствие между профилями:

In [9]:
# Brute-force search for the ordering of DESMAN rows that best matches the
# reference profile, scored by the sum of squared differences.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
ref_freqs = ref_profile.to_numpy()
ans_error = float("Inf")
ans_permut = None
for cur_permut in permutations(desman_profile.index):
    # Select rows with a list: a tuple passed to .loc can be read as a
    # multi-level key rather than a list of labels.
    desman_freqs = desman_profile.loc[list(cur_permut), :].to_numpy()
    cur_error = ((ref_freqs - desman_freqs) ** 2).sum()
    if cur_error < ans_error:
        ans_error = cur_error
        ans_permut = cur_permut
print("Error:", ans_error)

Error: 0.023954


In [10]:
def invert_permutation(permutation):
    """Return the positions of `permutation` ordered by the value found there.

    For a permutation of 0..n-1 this is the inverse permutation: element j of
    the result is the position i with permutation[i] == j.
    """
    values = list(permutation)
    # Stable argsort: indices ordered by the value stored at each index.
    return sorted(range(len(values)), key=values.__getitem__)

In [11]:
# Reorder the reference row labels with the inverted best permutation and
# prefix each label with 's' to get the strain names.
ref_positions = invert_permutation(ans_permut)
strains = ['s' + str(ref_label) for ref_label in ref_profile.index[ref_positions]]
strains

['s7', 's9', 's5']

# Табличка ответов DESMAN

In [12]:
# Preview the first five lines of gene_assignment_etaS_df.csv.
!head -5 gene_assignment_etaS_df.csv

,0,1,2
e5515248,1.0,1.0,1.0
e5607954,0.0,0.0,1.0
e5601674,1.0,1.0,1.0
e5427068,1.0,0.0,1.0


In [13]:
# Skip the file's own header row and name the value columns after the
# matched strains instead.
desman_raw = pd.read_csv("gene_assignment_etaS_df.csv", skiprows=1, names=["e_id"] + strains)
# Edge ids carry a one-character prefix (e.g. "e5515248"); drop it and cast to int.
desman_edge_ids = desman_raw['e_id'].str.slice(1).astype("int")
df_desman = desman_raw[strains].astype('int').set_index(desman_edge_ids)

df_desman.head()

Unnamed: 0_level_0,s7,s9,s5
e_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5515248,1,1,1
5607954,0,0,1
5601674,1,1,1
5427068,1,0,1
5546564,0,1,0


In [14]:
# One 0/1 column per strain: 1 when the edge's Counter has that strain as a key.
for cur_s in strains:
    df_ref[cur_s] = [int(cur_s in counts) for counts in df_ref['strains']]

df_ref.head()

Unnamed: 0_level_0,strains,single_copy,s7,s9,s5
e_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5515248,"{'s7': 1, 's9': 1, 's5': 1}",True,1,1,1
5607954,"{'s7': 1, 's5': 1}",True,1,0,1
5601674,"{'s7': 1, 's9': 1, 's5': 1}",True,1,1,1
5427068,"{'s7': 1, 's5': 1}",True,1,0,1
5546564,{'s9': 1},True,0,1,0


# Точность DESMAN

In [15]:
# Put both tables in the same edge-id order so they can be compared row by row.
df_ref = df_ref.sort_index()
df_desman = df_desman.sort_index()

In [16]:
# An edge counts as correct only when DESMAN agrees with the reference for
# every strain; `==` is kept so mismatched labels still raise.
per_strain_match = df_ref[strains] == df_desman[strains]
right_answers = per_strain_match.all(axis=1)
print("Accuracy on all edges: %.2f" % right_answers.mean())

Accuracy on all edges: 0.80


# Раскрашиваем граф для каждого штамма

In [17]:
# Output directory for the per-strain colour tables. exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
os.makedirs("bandage_colors", exist_ok=True)

# Human-readable label per edge, e.g. "s7, s9(2)": a count is shown only when
# it is not 1. It does not depend on the strain, so it is built once here
# rather than on every loop iteration.
df_ref['strains_print'] = df_ref['strains'].apply(
    lambda x: ", ".join('{}({})'.format(k, v) for k, v in x.items()))
df_ref['strains_print'] = df_ref['strains_print'].apply(lambda x: x.replace('(1)', ''))

for cur_s in strains:

    print('\n\n_______________', cur_s)

    # Default colour; it stays on edges that are neither in the reference
    # strain nor assigned to it by DESMAN.
    df_ref['color'] = "#b0b0b0"  # grey

    single = df_ref['single_copy']
    real_true = df_ref[cur_s] == 1
    desman_true = df_desman[cur_s] == 1

    # In the reference and found by DESMAN (single-copy / multi-copy).
    df_ref.loc[ single  &  real_true  &  desman_true, 'color'] = 'Lime'
    df_ref.loc[~single  &  real_true  &  desman_true, 'color'] = 'Green'

    # In the reference but missed by DESMAN (false negatives).
    df_ref.loc[ single  &  real_true  & ~desman_true, 'color'] = 'Teal'
    df_ref.loc[~single  &  real_true  & ~desman_true, 'color'] = 'Navy'

    # Not in the reference but reported by DESMAN (false positives).
    df_ref.loc[ single  & ~real_true  &  desman_true, 'color'] = 'Yellow'
    df_ref.loc[~single  & ~real_true  &  desman_true, 'color'] = 'Orange'

    df_ref[['strains_print', 'color']].to_csv("bandage_colors/{}.csv".format(cur_s), index_label='name')

    print("\nFN")
    print_edges(df_ref[real_true & ~desman_true].index)

    print("\nFP")
    print_edges(df_ref[~real_true & desman_true].index)



_______________ s7

FN
46552,325316,362196,482888,555284,648310,884642,902088,1156174,1219216,1233698,1253944,4691195,4711570,4809726,4865236,4879836,4899716,5047786,5056686,5161862,5190986,5210786,5221422,5229452,5239610,5243762,5260758,5318210,5321426,5328452,5336482,5337232,5338210,5338838,5343482,5346520,5347056,5347476,5354274,5354634,5355352,5365868,5375578,5388898,5399188,5402840,5409120,5414356,5417722,5419506,5419602,5426606,5442114,5453454,5453634,5460092,5462924,5466008,5483818,5483820,5486206,5488202,5490028,5490178,5491812,5491840,5498078,5499888,5500622,5502848,5506160,5510576,5514920,5516648,5522364,5527192,5535846,5536118,5536138,5536720,5536766,5537190,5539754,5541886,5541970,5542390,5542842,5543492,5544658,5545752,5547076,5559598,5570250,5570426,5570504,5572448,5574208,5574404,5575730,5578572,5580860,5580908,5582100,5582320,5582366,5584076,5585704,5586430,5586432,5586526,5586548,5589210,5589592,5589926,5590268,5590852,5591328,5592242,5592254,5592948,5593026,5593740,

Теперь в папке bandage_colors лежит по одному файлу раскраски для каждого штамма.

In [18]:
# List the files in the bandage_colors directory.
!ls bandage_colors

s5.csv	s7.csv	s9.csv


In [20]:
# Preview the beginning of the colour table for strain s5.
!head bandage_colors/s5.csv

name,strains_print,color
1924,nobody,#b0b0b0
1926,nobody,#b0b0b0
6308,"s7, s9",#b0b0b0
18054,nobody,Yellow
19100,nobody,#b0b0b0
19102,nobody,#b0b0b0
19556,nobody,#b0b0b0
19572,nobody,#b0b0b0
20152,nobody,Yellow
