In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import re

In [9]:
# Root directory holding the assembly QC reports. Defaults to the original
# local location; set LEPTO_ASSEMBLIES_DIR to run the notebook elsewhere.
path = os.environ.get(
    "LEPTO_ASSEMBLIES_DIR",
    "/Users/rx32940/Dropbox/5.Rachel-projects/Phylogeography/Lepto_assemblies_V2",
)

# MultiQC-aggregated QUAST reports (tab-separated), one per dataset.
date_data = pd.read_csv(path + "/date/multiqc_quast.txt", sep="\t")
rest_data = pd.read_csv(path + "/rest/multiqc_quast.txt", sep="\t")
picardeau_data = pd.read_csv(path + "/picardeau/quast/multiqc_quast.txt", sep="\t")
data = pd.concat([date_data, rest_data, picardeau_data], ignore_index=True, sort=False)

# Keep only the part of "Sample" before the first " | " delimiter.
# pandas treats a multi-character pattern as a regular expression, so the pipe
# must be escaped; an unescaped " | " is an alternation that splits on any
# single space and would truncate sample names that themselves contain spaces.
data["Sample"] = data["Sample"].str.split(r" \| ").str[0]

# Assembly-quality columns used by the rest of the notebook.
data_subset = data[["Sample", "N50", "# genomic features", "# contigs", "Genome fraction (%)", "Total length"]]
data_subset



Unnamed: 0,Sample,N50,# genomic features,# contigs,Genome fraction (%),Total length
0,SAMN03944914,28648.0,6933.0,319.0,94.552,4544380.0
1,SAMN03996887,28134.0,6940.0,316.0,94.553,4542750.0
2,SAMN03996952,29686.0,6935.0,309.0,94.537,4542700.0
3,SAMN03996955,28786.0,6936.0,317.0,94.530,4542419.0
4,SAMN03997006,26234.0,6930.0,320.0,94.528,4542896.0
5,SAMN03997015,28648.0,6930.0,324.0,94.572,4543879.0
6,SAMN03998754,29300.0,6933.0,319.0,94.556,4543029.0
7,SAMN03998756,29311.0,6929.0,314.0,94.539,4542033.0
8,SAMN03998757,26060.0,6934.0,327.0,94.532,4542912.0
9,SAMN04002971,28135.0,6931.0,327.0,94.550,4543943.0


### Re-quast Scaffolds with Low Genome Fraction

- evaluate scaffolds with **genome fraction lower than 70** from the QUAST report
- low-genome-fraction isolates were checked on NCBI with the tool **STAT** for **mis-identified species**
  - keep those for which STAT classified **at least 70% of reads** to the genus _Leptospira_
- re-run QUAST on these isolates using the **reference genome of the STAT-identified species** (if identified as _Leptospira sp._, use _L. interrogans_)
- concatenate the re-QUASTed isolates with the high-genome-fraction isolates from the original dataframe

In [10]:
# Select the isolates whose QUAST genome fraction is above 70%.
high_fraction_mask = data_subset["Genome fraction (%)"].gt(70)
genome_frac = data_subset[high_fraction_mask]


In [11]:
# Low-mapping isolates were looked up on NCBI, and QUAST was re-run against the
# reference of the species STAT identified; load those re-run reports.
low_map_picardeau = pd.read_csv(path + "/picardeau/quast/low_map_multiqc_quast.txt", sep="\t")
low_map_rest = pd.read_csv(path + "/rest/low_map_multiqc_quast.txt", sep="\t")

# Concatenate the re-QUASTed isolates from both datasets and keep index labels
# 0 through 8 (inclusive); per the original note this excludes SAMN01919665.
# NOTE(review): the exclusion is positional, so it silently changes if the
# input files gain or reorder rows - confirm, or filter on "Sample" instead.
# .copy() makes the slice an independent frame so the column assignment below
# cannot raise SettingWithCopyWarning.
low_map = (
    pd.concat([low_map_picardeau, low_map_rest], sort=False, ignore_index=True)
    .loc[0:8, :]
    .copy()
)

# Keep only the part of "Sample" before the first " | " delimiter. The pipe is
# escaped because pandas treats multi-character patterns as regular expressions
# (an unescaped " | " would split on any single space).
low_map["Sample"] = low_map["Sample"].str.split(r" \| ").str[0]

# Select the same columns as the original data frame so the two can be
# concatenated. "Total length" is included when the re-run report has it, so
# that these rows do not end up with NaN in that column after the concat.
low_map_columns = ["Sample", "N50", "# genomic features", "# contigs", "Genome fraction (%)"]
if "Total length" in low_map.columns:
    low_map_columns.append("Total length")
low_map_subset = low_map[low_map_columns]


In [12]:
# Stack the high-genome-fraction isolates and the re-mapped, re-QUASTed
# isolates into a single frame with a fresh 0..n-1 index.
frames_to_combine = [genome_frac, low_map_subset]
all_self_assemblies = pd.concat(frames_to_combine, sort=False, ignore_index=True)
all_self_assemblies

Unnamed: 0,Sample,N50,# genomic features,# contigs,Genome fraction (%),Total length
0,SAMN03944914,28648.0,6933.0,319.0,94.552,4544380.0
1,SAMN03996887,28134.0,6940.0,316.0,94.553,4542750.0
2,SAMN03996952,29686.0,6935.0,309.0,94.537,4542700.0
3,SAMN03996955,28786.0,6936.0,317.0,94.530,4542419.0
4,SAMN03997006,26234.0,6930.0,320.0,94.528,4542896.0
5,SAMN03997015,28648.0,6930.0,324.0,94.572,4543879.0
6,SAMN03998754,29300.0,6933.0,319.0,94.556,4543029.0
7,SAMN03998756,29311.0,6929.0,314.0,94.539,4542033.0
8,SAMN03998757,26060.0,6934.0,327.0,94.532,4542912.0
9,SAMN04002971,28135.0,6931.0,327.0,94.550,4543943.0


### Coverage of Self Assembled Isolates

- **coverage** of each sequence was obtained by mapping its short reads to the corresponding reference genome with **BWA** and calculating coverage with **Qualimap**

- get coverage information from the Qualimap analysis (note that this still **includes the isolates with low genome fraction** that were filtered out in the previous section)

In [28]:
# MultiQC general-stats table of the Qualimap results for the picardeau dataset.
picardeau_qualimap = pd.read_csv(path + "/picardeau/multiqc_qualimap_general_stats_picardeau.txt", sep="\t")

# Criterion 1: median coverage of at least 30X.
median_cov_ok = picardeau_qualimap["QualiMap_mqc-generalstats-median_coverage"].ge(30)
# Criterion 2: at least 50% of the scaffold covered at >= 30X.
breadth_30x_ok = picardeau_qualimap["QualiMap_mqc-generalstats-30_x_pc"].ge(50)

# Keep the isolates that satisfy both criteria.
picardeau_cov_filtered = picardeau_qualimap[median_cov_ok & breadth_30x_ok]
picardeau_cov_filtered

Unnamed: 0,Sample,QualiMap_mqc-generalstats-general_error_rate,QualiMap_mqc-generalstats-percentage_aligned,QualiMap_mqc-generalstats-10_x_pc,QualiMap_mqc-generalstats-median_coverage,QualiMap_mqc-generalstats-30_x_pc,QualiMap_mqc-generalstats-median_insert_size,QualiMap_mqc-generalstats-mapped_reads,QualiMap_mqc-generalstats-50_x_pc,QualiMap_mqc-generalstats-1_x_pc,QualiMap_mqc-generalstats-total_reads,QualiMap_mqc-generalstats-avg_gc,QualiMap_mqc-generalstats-5_x_pc
0,SAMEA104369441,1.19,97.130549,93.273308,76,84.804307,378,2257888.0,74.276766,95.384636,2324591.0,36.782587,94.734739
1,SAMEA5168034,0.66,99.704422,99.995536,102,99.932548,365,3278749.0,98.632420,99.999715,3288469.0,41.350544,99.997792
2,SAMEA5168035,1.54,91.352358,95.916279,167,91.875211,407,4966275.0,87.656844,97.260021,5436395.0,42.158659,96.706989
3,SAMEA5168036,0.71,98.961612,95.792693,63,86.009193,451,1996791.0,66.917538,97.012331,2017743.0,37.932445,96.633234
4,SAMEA5168037,0.37,91.771396,99.973402,51,94.568599,456,1484629.0,54.684156,99.997824,1617747.0,39.820175,99.996963
5,SAMEA5168038,0.81,97.604009,99.992307,194,99.613079,342,5451301.0,98.271039,99.994888,5585120.0,40.122003,99.993521
6,SAMEA5168039,1.57,94.062897,91.851552,59,77.713513,404,2138139.0,59.455797,94.939970,2273095.0,37.548748,93.912562
7,SAMEA5168040,0.87,98.643940,96.444525,53,81.005480,428,1954969.0,54.498552,97.609115,1981844.0,37.368930,97.268745
8,SAMEA5168041,2.22,88.936562,92.198779,80,89.337659,415,2308931.0,80.533916,94.252097,2596155.0,41.441789,92.796167
10,SAMEA5168043,2.26,90.588706,92.298098,112,89.419912,386,3118932.0,84.180990,94.342362,3442959.0,41.674897,92.950754


In [13]:
# Write the combined, filtered assembly table to CSV under the project
# directory (the index is written as the first, unnamed column).
filtered_csv_path = path + "/all_self_assemblies_filtered.csv"
all_self_assemblies.to_csv(filtered_csv_path)