# Analysis of the results of the NNI+DeCoSTAR experiments of 2018_07_15-16-25
Cedric Chauve, July 26, 2018

In [1]:
from IPython.display import HTML
# Emit an HTML snippet containing a script that defines code_toggle(), which
# hides or shows every `div.input` element depending on the `code_show` flag
# and then flips the flag. code_toggle is registered to run when the document
# is ready, so with code_show starting at true the inputs are hidden first.
# The snippet also includes a link that calls code_toggle() on click.
# NOTE(review): the script uses `$`, so it assumes the page provides jQuery.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Overview

This notebook contains an analysis of the results of experiments consisting of
* simulating the evolution of genomes (gene trees and gene orders) using <a href="https://www.biorxiv.org/content/early/2018/06/07/339473">Zombi</a>;
* adding noise by performing NNI tree rearrangements, with various levels of noise;
* reconstructing ancestral adjacencies using <a href="https://doi.org/10.1093/gbe/evx069">DeCoSTAR</a>;
* recording statistics about the scores of reconciliation, DeCoSTAR and linearity of ancestral gene orders
in order to assess if the linearity of ancestral gene orders can be used as a diagnostic measure of correctness of the gene trees.

The experiments were run on three datasets of simulated gene trees: one simulated with no HGT, one simulated with HGT, and one simulated with HGT but no duplication, respectively called the noHGT, HGT and noDUP datasets.

## Data and results

The data and results of the experiments for the noHGT dataset are available in the directory ../../exp/2018_07_15_D10T0L20I10T10_NNI_DECOSTAR. The data and results of the experiments for the HGT dataset are available in the directory ../../exp/2018_07_16_D10T10L20I10T10_NNI_DECOSTAR. The data and results of the experiments for the noDUP dataset are available in the directory ../../exp/2018_07_25_D0T10L20I10T10_NNI_DECOSTAR.

**TO DO**: describe in more detail the experiments.

In each of these directories, a summary of the results is available in the file results/summary_1. For the noDUP dataset, a second summary file, results/summary_2, is also read below.

## Analysis of the results with the noHGT dataset.

In [3]:
# Reading a summary file (path to file, F) and creating a dataframe

# Correspondence between dataset names and the lambda parameter of the Poisson
# law used to determine the average number of NNI per gene tree
LAMBDA = {
    'lambda_025': 0.25,
    'lambda_05': 0.5,
    'lambda_1': 1,
    'lambda_2': 2,
    'lambda_3': 3,
    'lambda_5': 5,
    'lambda_7': 7,
    'lambda_10': 10,
    'lambda_20': 20,
    'lambda_30': 30,
    'lambda_50': 50,
}

def read_summary_file(F):
    """Parse a results summary file into a DataFrame indexed by dataset name.

    Each data line is whitespace-separated: the first field is a dataset name
    that must be a key of LAMBDA, followed by nine numeric fields that fill the
    columns 'RF' to 'linearity_score' in order. Blank lines and lines whose
    first non-blank character is '#' are skipped. If a dataset name appears
    several times, the last occurrence wins.

    Parameters
    ----------
    F : str
        Path to the summary file.

    Returns
    -------
    pandas.DataFrame
        One row per dataset, with float columns 'Lambda', 'RF', 'nb_dup',
        'nb_loss', 'nb_hgt', 'rec_score', 'nb_gain', 'nb_break',
        'DeCo_score' and 'linearity_score'. An input without any data line
        yields an empty frame that still carries these columns.

    Raises
    ------
    KeyError
        If a dataset name is not a key of LAMBDA.
    IndexError
        If a data line has fewer than ten fields.
    ValueError
        If one of the nine value fields is not a number.
    """
    columns = ['Lambda','RF','nb_dup','nb_loss','nb_hgt','rec_score','nb_gain','nb_break','DeCo_score','linearity_score']
    rows = {}
    # The context manager closes the file even if parsing a line fails.
    with open(F,'r') as summary_file:
        for line in summary_file:
            fields = line.split()
            # Skip blank lines and comments (also comments preceded by spaces).
            if not fields or fields[0].startswith('#'):
                continue
            name = fields[0]
            # Explicit indexing keeps the IndexError on truncated lines.
            values = [float(fields[i]) for i in range(1, 10)]
            rows[name] = [float(LAMBDA[name])] + values
    return pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)

In [4]:
# Parse summary_1 of the 2018_07_15 (D10T0) experiment directory into a DataFrame.
NOHGT_SUMMARY=read_summary_file('../../exp/2018_07_15_D10T0L20I10T10_NNI_DECOSTAR/results/summary_1')

In [5]:
# Bare last expression: renders the noHGT summary table as the cell output.
NOHGT_SUMMARY

Unnamed: 0,Lambda,RF,nb_dup,nb_loss,nb_hgt,rec_score,nb_gain,nb_break,DeCo_score,linearity_score
lambda_025,0.25,0.13,183.0,528.0,0.0,894.0,1782.0,294.0,5640.0,592.0
lambda_05,0.5,0.22,273.0,815.0,0.0,1361.0,1961.0,287.0,6170.0,980.0
lambda_1,1.0,0.47,503.0,1537.0,0.0,2543.0,2399.0,279.0,7476.0,1956.0
lambda_10,10.0,3.97,3599.0,11841.0,0.0,19039.0,7467.0,140.0,22541.0,13336.0
lambda_2,2.0,0.96,974.0,3027.0,0.0,4975.0,3259.0,249.0,10026.0,3834.0
lambda_20,20.0,6.88,5923.0,20537.0,0.0,32383.0,10404.0,111.0,31323.0,21152.0
lambda_3,3.0,1.36,1361.0,4238.0,0.0,6960.0,3932.0,232.0,12028.0,5234.0
lambda_30,30.0,8.88,7235.0,26320.0,0.0,40790.0,12122.0,88.0,36454.0,26198.0
lambda_5,5.0,2.16,2097.0,6643.0,0.0,10837.0,5206.0,207.0,15825.0,8014.0
lambda_50,50.0,11.9,9119.0,35959.0,0.0,54197.0,14232.0,53.0,42749.0,34026.0


In [6]:
# Functions to generate the plots

def plot_RF(df_summary,name):
    """Save a scatter plot of RF distance against lambda as '<name>_RF.png'.

    df_summary must provide the columns 'Lambda' (x axis) and 'RF' (y axis);
    name is the prefix of the PNG file that is written. The current pyplot
    figure is closed once the file has been saved.
    """
    # Draw on the current axes (pyplot creates figure and axes on demand).
    axes = plt.gca()
    axes.scatter(df_summary['Lambda'], df_summary['RF'])
    axes.set_xlabel("Lambda")
    axes.set_ylabel("RF distance")
    plt.savefig(name+'_RF.png')
    plt.close()

def plot_scores(df_summary,name,suffix):
    """Save three scatter plots of scores against the RF distance.

    The files written are, in this order:
      * name + '_scores' + suffix + '.png': reconciliation, DeCoSTAR and
        linearity scores together;
      * name + '_scores_rec_DeCo' + suffix + '.png': reconciliation and
        DeCoSTAR scores only;
      * name + '_linearity' + suffix + '.png': linearity score only.

    df_summary must provide the columns 'RF', 'rec_score', 'DeCo_score' and
    'linearity_score'. Each figure is closed after it has been saved.
    """
    # (column, colour, marker) of each score series
    reconciliation = ('rec_score', 'g', 'o')
    decostar = ('DeCo_score', 'b', 'x')
    linearity = ('linearity_score', 'r', '+')

    # One entry per figure: series drawn, y-axis label, legend labels
    # (None means no legend) and tag inserted in the file name.
    figures = [
        ((reconciliation, decostar, linearity), "Scores", ['reconciliation','DeCoSTAR','linearity'], '_scores'),
        ((reconciliation, decostar), "Scores", ['reconciliation','DeCoSTAR'], '_scores_rec_DeCo'),
        ((linearity,), "Linearity score", None, '_linearity'),
    ]

    for series, y_label, legend_labels, tag in figures:
        for column, colour, marker in series:
            plt.scatter(df_summary['RF'], df_summary[column], c=colour, marker=marker)
        plt.xlabel("RF distance")
        plt.ylabel(y_label)
        if legend_labels is not None:
            # Labels are matched to the series in drawing order.
            plt.legend(legend_labels,loc=2)
        plt.savefig(name+tag+suffix+'.png')
        plt.close()

In [7]:
# Write the noHGT PNG files (RF plot and the three score plots) using the
# experiment name as file prefix; the empty suffix leaves the score file
# names unsuffixed.
plot_RF(NOHGT_SUMMARY,'2018_07_15_D10T0L20I10T10_NNI_DECOSTAR')
plot_scores(NOHGT_SUMMARY,'2018_07_15_D10T0L20I10T10_NNI_DECOSTAR','')

<img src="2018_07_15_D10T0L20I10T10_NNI_DECOSTAR_RF.png">
<img src="2018_07_15_D10T0L20I10T10_NNI_DECOSTAR_scores.png">
<img src="2018_07_15_D10T0L20I10T10_NNI_DECOSTAR_scores_rec_DeCo.png">
<img src="2018_07_15_D10T0L20I10T10_NNI_DECOSTAR_linearity.png">

## Analysis of the results with the HGT dataset.

In [8]:
# Parse summary_1 of the 2018_07_16 (D10T10) experiment directory into a DataFrame.
HGT_SUMMARY=read_summary_file('../../exp/2018_07_16_D10T10L20I10T10_NNI_DECOSTAR/results/summary_1')

In [9]:
# Bare last expression: renders the HGT summary table as the cell output.
HGT_SUMMARY

Unnamed: 0,Lambda,RF,nb_dup,nb_loss,nb_hgt,rec_score,nb_gain,nb_break,DeCo_score,linearity_score
lambda_05,0.5,0.25,121.0,422.0,314.0,1606.0,2143.0,701.0,7130.0,182.0
lambda_1,1.0,0.45,120.0,599.0,513.0,2378.0,2530.0,875.0,8465.0,272.0
lambda_10,10.0,3.96,103.0,2925.0,3468.0,13535.0,7906.0,2586.0,26304.0,2420.0
lambda_2,2.0,0.93,115.0,1009.0,967.0,4140.0,3381.0,1203.0,11346.0,482.0
lambda_20,20.0,6.65,112.0,4111.0,5276.0,20163.0,10960.0,2785.0,35665.0,4700.0
lambda_3,3.0,1.38,112.0,1356.0,1377.0,5711.0,4139.0,1558.0,13975.0,682.0
lambda_30,30.0,8.93,130.0,4893.0,6637.0,25064.0,13173.0,2838.0,42357.0,6790.0
lambda_5,5.0,2.18,115.0,1867.0,2021.0,8160.0,5323.0,1905.0,17874.0,1084.0
lambda_50,50.0,11.88,141.0,5812.0,8140.0,30514.0,15542.0,2824.0,49450.0,9298.0
lambda_7,7.0,2.97,111.0,2359.0,2690.0,10651.0,6511.0,2302.0,21835.0,1636.0


In [10]:
# Write the HGT PNG files (RF plot and the three score plots) using the
# experiment name as file prefix; the empty suffix leaves the score file
# names unsuffixed.
plot_RF(HGT_SUMMARY,'2018_07_16_D10T10L20I10T10_NNI_DECOSTAR')
plot_scores(HGT_SUMMARY,'2018_07_16_D10T10L20I10T10_NNI_DECOSTAR','')

<img src="2018_07_16_D10T10L20I10T10_NNI_DECOSTAR_RF.png">
<img src="2018_07_16_D10T10L20I10T10_NNI_DECOSTAR_scores.png">
<img src="2018_07_16_D10T10L20I10T10_NNI_DECOSTAR_scores_rec_DeCo.png">
<img src="2018_07_16_D10T10L20I10T10_NNI_DECOSTAR_linearity.png">

## Analysis of the results with the noDUP dataset.

We ran two DeCoSTAR experiments: one with the usual weighting scheme for reconciliation (experiment 1) and one where both Dup and HGT have weight 2 (experiment 2).

In [15]:
# Parse both summary files of the 2018_07_25 (D0T10) experiment directory:
# summary_1 and summary_2 are loaded into separate DataFrames.
NODUP_SUMMARY_1=read_summary_file('../../exp/2018_07_25_D0T10L20I10T10_NNI_DECOSTAR/results/summary_1')
NODUP_SUMMARY_2=read_summary_file('../../exp/2018_07_25_D0T10L20I10T10_NNI_DECOSTAR/results/summary_2')

In [16]:
# Bare last expression: renders the table parsed from summary_1 as the cell output.
NODUP_SUMMARY_1

Unnamed: 0,Lambda,RF,nb_dup,nb_loss,nb_hgt,rec_score,nb_gain,nb_break,DeCo_score,linearity_score
lambda_025,0.25,0.12,0.0,298.0,193.0,877.0,1812.0,560.0,5996.0,156.0
lambda_05,0.5,0.23,0.0,403.0,298.0,1297.0,2008.0,628.0,6652.0,180.0
lambda_1,1.0,0.47,0.0,620.0,527.0,2201.0,2465.0,900.0,8295.0,228.0
lambda_10,10.0,3.8,10.0,2830.0,3298.0,12744.0,7470.0,2372.0,24782.0,2244.0
lambda_2,2.0,0.88,1.0,983.0,915.0,3730.0,3201.0,1222.0,10825.0,354.0
lambda_20,20.0,6.57,21.0,4078.0,5176.0,19648.0,10679.0,2791.0,34828.0,4692.0
lambda_3,3.0,1.35,2.0,1323.0,1321.0,5290.0,3947.0,1509.0,13350.0,614.0
lambda_30,30.0,8.56,47.0,4846.0,6342.0,23966.0,12633.0,2893.0,40792.0,6398.0
lambda_5,5.0,2.11,1.0,1860.0,1991.0,7835.0,5144.0,1915.0,17347.0,1024.0
lambda_50,50.0,11.51,64.0,5742.0,7845.0,29405.0,14929.0,2686.0,47473.0,8820.0


In [17]:
# Bare last expression: renders the table parsed from summary_2 as the cell output.
NODUP_SUMMARY_2

Unnamed: 0,Lambda,RF,nb_dup,nb_loss,nb_hgt,rec_score,nb_gain,nb_break,DeCo_score,linearity_score
lambda_025,0.25,0.12,0.0,294.0,195.0,684.0,1815.0,553.0,5998.0,160.0
lambda_05,0.5,0.23,0.0,399.0,300.0,999.0,2011.0,621.0,6654.0,200.0
lambda_1,1.0,0.47,0.0,604.0,535.0,1674.0,2477.0,879.0,8310.0,262.0
lambda_10,10.0,3.8,2.0,2696.0,3368.0,9436.0,7572.0,2281.0,24997.0,2376.0
lambda_2,2.0,0.88,0.0,973.0,921.0,2815.0,3218.0,1209.0,10863.0,366.0
lambda_20,20.0,6.57,0.0,3701.0,5372.0,14445.0,10894.0,2541.0,35223.0,4950.0
lambda_3,3.0,1.35,1.0,1306.0,1330.0,3968.0,3962.0,1491.0,13377.0,642.0
lambda_30,30.0,8.56,1.0,4136.0,6709.0,17556.0,12939.0,2403.0,41220.0,6972.0
lambda_5,5.0,2.11,1.0,1828.0,2007.0,5844.0,5178.0,1871.0,17405.0,1112.0
lambda_50,50.0,11.51,2.0,4500.0,8470.0,21444.0,15381.0,2045.0,48188.0,9368.0


In [19]:
# The RF plot is written once, from the summary_1 table only. The score plots
# are written for both tables; the '_1' / '_2' suffixes keep the two sets of
# PNG files from overwriting each other.
plot_RF(NODUP_SUMMARY_1,'2018_07_25_D0T10L20I10T10_NNI_DECOSTAR')
plot_scores(NODUP_SUMMARY_1,'2018_07_25_D0T10L20I10T10_NNI_DECOSTAR','_1')
plot_scores(NODUP_SUMMARY_2,'2018_07_25_D0T10L20I10T10_NNI_DECOSTAR','_2')

<img src="2018_07_25_D0T10L20I10T10_NNI_DECOSTAR_RF.png">

<table align="center">
<tr>
    <th>Experiment 1: HGT cost=3, Dup cost=2, Loss cost=1</th>
    <th>Experiment 2: HGT cost=2, Dup cost=2, Loss cost=1</th>
</tr>
<tr>
    <td><img src="2018_07_25_D0T10L20I10T10_NNI_DECOSTAR_scores_1.png"></td>
    <td><img src="2018_07_25_D0T10L20I10T10_NNI_DECOSTAR_scores_2.png"></td>
</tr>
<tr>
    <td><img src="2018_07_25_D0T10L20I10T10_NNI_DECOSTAR_scores_rec_DeCo_1.png"></td>
    <td><img src="2018_07_25_D0T10L20I10T10_NNI_DECOSTAR_scores_rec_DeCo_2.png"></td>
</tr>
<tr>
    <td><img src="2018_07_25_D0T10L20I10T10_NNI_DECOSTAR_linearity_1.png"></td>
    <td><img src="2018_07_25_D0T10L20I10T10_NNI_DECOSTAR_linearity_2.png"></td>
</tr>
 </table>