## LTEE-FBA-Jenga.ipynb by Rohan Maddamsetti

See documentation at: 
https://cobrapy.readthedocs.io/en/latest/  

In [1]:
import matplotlib as plt
import cobra.test
import cobra
import pandas as pd
import numpy as np
from scipy import stats
from os import path
import random

In [2]:
## use Gurobi. Necessary for constructing minimal genomes!
## The solver is set on cobra's Configuration object rather than on an
## individual model.
## NOTE(review): presumably this must run before the models below are loaded
## for them to pick up Gurobi -- confirm against the cobrapy docs.
cobra_config = cobra.Configuration()
cobra_config.solver = "gurobi"
## Bare expression: displays the resulting configuration table as the cell output.
cobra_config

Attribute,Description,Value
solver,Mathematical optimization solver,gurobi
tolerance,"General solver tolerance (feasibility, integrality, etc.)",1e-07
lower_bound,Default reaction lower bound,-1000.0
upper_bound,Default reaction upper bound,1000.0
processes,Number of parallel processes,3
cache_directory,Path for the model cache,/Users/Rohandinho/Library/Caches/cobrapy
max_cache_size,Maximum cache size in bytes,104857600
cache_expiration,Model cache expiration time in seconds (if any),


## Functions.

In [3]:
def make_dict_of_KO_strains(anc_model, gene_vec=None):
    """Build one single-gene knockout model for each gene of `anc_model`.

    Parameters
    ----------
    anc_model : cobra.Model
        Ancestral model. Each knockout is applied to a separate copy made
        with `anc_model.copy()`.
    gene_vec : iterable of str, optional
        Gene IDs to restrict the knockouts to. When None or empty, a
        knockout model is built for every gene in `anc_model`. IDs that do
        not occur in the model are ignored. A single string is treated as
        one gene ID.

    Returns
    -------
    dict
        Maps gene ID (str) to the knockout copy of `anc_model`; each copy's
        `id` is "<anc_model.id>_<gene ID>_knockout".
    """
    gene_str_list = [str(gene_obj) for gene_obj in anc_model.genes]

    ## Normalize the filter to a set of IDs. This avoids a mutable default
    ## argument, accepts None, gives O(1) lookups, and compares against the
    ## *values* of array-like inputs (e.g. `x in pandas.Series` would test
    ## the index labels instead).
    if gene_vec is None:
        wanted_genes = set()
    elif isinstance(gene_vec, str):
        wanted_genes = {gene_vec}
    else:
        wanted_genes = set(gene_vec)

    if wanted_genes: ## then filter for the genes (strings) in gene_vec.
        gene_str_list = [x for x in gene_str_list if x in wanted_genes]

    deletion_cobra_models = {}
    for genestr in gene_str_list:
        cur_KO_model = anc_model.copy()
        cur_KO_model.id = '_'.join([anc_model.id, genestr, "knockout"])
        ## Look the gene up in the copy so the knockout acts on the copy's
        ## own gene object rather than on the ancestor's.
        cur_gene = cur_KO_model.genes.get_by_id(genestr)
        cobra.manipulation.delete_model_genes(cur_KO_model, cur_gene)
        deletion_cobra_models[genestr] = cur_KO_model
    return deletion_cobra_models

In [4]:
def generate_LTEE_clone_cobra_models(KOed_genes_df, basic_model, using_locus_tag=False):
    """Build one knockout model per LTEE population from a table of inactivated genes.

    Parameters
    ----------
    KOed_genes_df : pandas.DataFrame
        Must have a "Population" column plus the gene-ID column that is
        selected by `using_locus_tag`.
    basic_model : cobra.Model
        Model that is copied once per population; the knockouts are applied
        to the copies.
    using_locus_tag : bool, default False
        If True, gene IDs are read from the "locus_tag" column; otherwise
        from the "blattner" column.

    Returns
    -------
    dict
        Maps each of the twelve population labels listed below to its model,
        whose `id` is "<population>_50K_A_clone". Gene IDs that are missing
        from `basic_model` are skipped, so a population with no matching
        genes gets an unmodified copy.
    """
    nonmutator_pops = ["Ara-5", "Ara-6", "Ara+1", "Ara+2", "Ara+4", "Ara+5"]
    mutator_pops = ["Ara-1", "Ara-2", "Ara-3", "Ara-4", "Ara+3", "Ara+6"]
    LTEE_pop_vec = nonmutator_pops + mutator_pops

    gene_id_column = "locus_tag" if using_locus_tag else "blattner"
    ## A set gives O(1) membership tests; the model can hold thousands of genes.
    model_gene_ids = {gene.id for gene in basic_model.genes}

    LTEE_cobra_models = {}
    for LTEE_pop in LTEE_pop_vec:
        cur_KO_model = basic_model.copy()
        cur_KO_model.id = LTEE_pop + "_50K_A_clone"

        is_cur_pop = (KOed_genes_df["Population"] == LTEE_pop)
        ## Missing IDs can never match a model gene, and a gene listed twice
        ## only needs to be knocked out once.
        KOed_genes = KOed_genes_df.loc[is_cur_pop, gene_id_column].dropna().drop_duplicates()

        ## Resolve IDs against the copy so the knockout acts on its own gene objects.
        genes_to_remove = [cur_KO_model.genes.get_by_id(x)
                           for x in KOed_genes if x in model_gene_ids]
        if genes_to_remove:
            cobra.manipulation.delete_model_genes(cur_KO_model, genes_to_remove)

        LTEE_cobra_models[LTEE_pop] = cur_KO_model

    return LTEE_cobra_models

## Basic metabolic models to play with. Default objective is to maximize biomass.

In [5]:
## Directory holding the BiGG model JSON files (path is relative to this notebook).
BiGG_model_dir = "../data/BiGG-models"

# the E. coli K-12 iJO1366 model: this is the best curated complete E. coli model.
K12_model = cobra.io.load_json_model(path.join(BiGG_model_dir, "iJO1366.json"))
## Replace the model id with a short strain label.
K12_model.id = "K12"

# the E. coli REL606 iECB_1328 model: most relevant to LTEE, but has stoichiometric inconsistencies
## in hydrogen/proton conservation.
REL606_model = cobra.io.load_json_model(path.join(BiGG_model_dir, "iECB_1328.json"))
REL606_model.id = "REL606"

## The simplest well-curated model: E. coli core metabolism.
core_model = cobra.io.load_json_model(path.join(BiGG_model_dir, "e_coli_core.json"))
core_model.id = "Ecoli-core"

Set parameter Username
Academic license - for non-commercial use only - expires 2022-12-16


In [6]:
## we are simulating media in which glucose import is limiting.
## Add an excess of thiamine to simulate DM25.
## Each medium is fetched, edited, and then assigned back to its model.
## NOTE(review): cobrapy's `medium` getter presumably returns a copy, so the
## assignment back is what actually applies the change -- confirm in the cobrapy docs.
K12_medium = K12_model.medium
K12_medium['EX_thm_e'] = 1000.0
K12_model.medium = K12_medium
REL606_medium = REL606_model.medium
REL606_medium['EX_thm_e'] = 1000.0
REL606_model.medium = REL606_medium

## Show the resulting media. core_model's medium is not modified above;
## it is printed only for comparison.
print(K12_model.medium)
print(REL606_model.medium)
print(core_model.medium)

{'EX_co2_e': 1000.0, 'EX_cobalt2_e': 1000.0, 'EX_glc__D_e': 10.0, 'EX_h_e': 1000.0, 'EX_h2o_e': 1000.0, 'EX_k_e': 1000.0, 'EX_cu2_e': 1000.0, 'EX_mg2_e': 1000.0, 'EX_mn2_e': 1000.0, 'EX_mobd_e': 1000.0, 'EX_na1_e': 1000.0, 'EX_nh4_e': 1000.0, 'EX_ca2_e': 1000.0, 'EX_cbl1_e': 0.01, 'EX_ni2_e': 1000.0, 'EX_o2_e': 1000.0, 'EX_cl_e': 1000.0, 'EX_pi_e': 1000.0, 'EX_zn2_e': 1000.0, 'EX_sel_e': 1000.0, 'EX_slnt_e': 1000.0, 'EX_so4_e': 1000.0, 'EX_thm_e': 1000.0, 'EX_tungs_e': 1000.0, 'EX_fe2_e': 1000.0, 'EX_fe3_e': 1000.0}
{'EX_ca2_e': 1000.0, 'EX_cbl1_e': 0.01, 'EX_cl_e': 1000.0, 'EX_co2_e': 1000.0, 'EX_cobalt2_e': 1000.0, 'EX_cu2_e': 1000.0, 'EX_fe2_e': 1000.0, 'EX_fe3_e': 1000.0, 'EX_mg2_e': 1000.0, 'EX_mn2_e': 1000.0, 'EX_mobd_e': 1000.0, 'EX_na1_e': 1000.0, 'EX_nh4_e': 1000.0, 'EX_ni2_e': 1000.0, 'EX_glc__D_e': 10.0, 'EX_o2_e': 1000.0, 'EX_tungs_e': 1000.0, 'EX_pi_e': 1000.0, 'EX_zn2_e': 1000.0, 'EX_sel_e': 1000.0, 'EX_slnt_e': 1000.0, 'EX_so4_e': 1000.0, 'EX_thm_e': 1000.0, 'EX_h_e': 10

In [21]:
## Labels of the twelve LTEE populations, split into nonmutator and mutator groups.
## These are the same labels, in the same order, as the lists hard-coded inside
## generate_LTEE_clone_cobra_models.
nonmutator_pops = ["Ara-5", "Ara-6", "Ara+1", "Ara+2", "Ara+4", "Ara+5"]
mutator_pops = ["Ara-1", "Ara-2", "Ara-3", "Ara-4", "Ara+3", "Ara+6"]
## Combined vector: nonmutators first, then mutators.
LTEE_pop_vec = nonmutator_pops + mutator_pops

## Question 2: Measure fitness of 50K A clones using the whole genome metabolic model.

In [22]:
## Table of genes scored as inactive in each 50K A clone: one row per
## (Population, gene), with both `locus_tag` and `blattner` identifiers.
## NOTE(review): the displayed "Unnamed: 0" column suggests the CSV was written
## with its index; it is read here as an ordinary column.
inactive_genes_in_50K_A_clones = pd.read_csv(
    "../results/metabolic-enzymes/inactive-genes-in-LTEE-50K-A-clones.csv",)
## Bare expression: displays the (truncated) table as the cell output.
inactive_genes_in_50K_A_clones

Unnamed: 0,Population,locus_tag,Gene,blattner,gene_length,product,start,end,strand
0,Ara-1,ECB_00100,mutT,b0099,390,nucleoside triphosphate pyrophosphohydrolase m...,113848,114237,1
1,Ara-1,ECB_00259,ykgI,b0303,237,hypothetical protein,290980,291216,-1
2,Ara-1,ECB_00320,ECB_00320,b0370,198,hypothetical protein,358825,359022,-1
3,Ara-1,ECB_00429,ybaL,b0478,1677,predicted transporter with NADP-binding Rossma...,473629,475305,-1
4,Ara-1,ECB_00460,ybbV,b0510,279,hypothetical protein,509662,509940,1
...,...,...,...,...,...,...,...,...,...
2175,Ara+5,ECB_03938,yjcF,b4066,1293,hypothetical protein,4260716,4262008,-1
2176,Ara+5,ECB_04094,yzfA,b4223,183,hypothetical protein,4431733,4431915,-1
2177,Ara+5,ECB_04101,mpl,b4233,1374,UDP-N-acetylmuramate:L-alanyl-gamma-D-glutamyl...,4439035,4440408,1
2178,Ara+5,ECB_04123,yjgN,b4257,1197,conserved inner membrane protein,4462763,4463959,1


In [24]:
## Build one knockout model per LTEE population from the table of inactive
## genes loaded above (`inactive_genes_in_50K_A_clones`). The previous name,
## `no_expression_genes_in_50K_A_clones`, is not defined anywhere in this
## notebook, so the cell failed with a NameError on a fresh kernel.

## Generate models from K-12. Genes are matched by Blattner number (the default).
iJO1366_LTEE_models = generate_LTEE_clone_cobra_models(inactive_genes_in_50K_A_clones, K12_model)
## Generate models from REL606. Genes are matched by REL606 locus tag.
iECB_1328_LTEE_models = generate_LTEE_clone_cobra_models(inactive_genes_in_50K_A_clones, REL606_model, using_locus_tag=True)

Read LP format model from file /var/folders/v8/l0zk07vs7gs5zn3nr6f6qj5r0000gn/T/tmpfdq7fbgd.lp
Reading time = 0.02 seconds
: 1805 rows, 5166 columns, 20366 nonzeros
Read LP format model from file /var/folders/v8/l0zk07vs7gs5zn3nr6f6qj5r0000gn/T/tmp0th_i4gj.lp
Reading time = 0.02 seconds
: 1805 rows, 5166 columns, 20366 nonzeros
Read LP format model from file /var/folders/v8/l0zk07vs7gs5zn3nr6f6qj5r0000gn/T/tmpmjo4jvxn.lp
Reading time = 0.02 seconds
: 1805 rows, 5166 columns, 20366 nonzeros
Read LP format model from file /var/folders/v8/l0zk07vs7gs5zn3nr6f6qj5r0000gn/T/tmpoiytgefm.lp
Reading time = 0.02 seconds
: 1805 rows, 5166 columns, 20366 nonzeros
Read LP format model from file /var/folders/v8/l0zk07vs7gs5zn3nr6f6qj5r0000gn/T/tmp7z90j9a0.lp
Reading time = 0.02 seconds
: 1805 rows, 5166 columns, 20366 nonzeros
Read LP format model from file /var/folders/v8/l0zk07vs7gs5zn3nr6f6qj5r0000gn/T/tmp7yre74di.lp
Reading time = 0.02 seconds
: 1805 rows, 5166 columns, 20366 nonzeros
Read LP fo

## Cross-check the list of essential genes in the K12 and REL606 models
## with the list of genes knocked out in the 50K LTEE clones.

Cool finding: ALL of these cases involve gene AMPLIFICATIONS followed by later deletion of one copy,  
with the exception of _thiD_ and _thiG_, which are involved in the biosynthesis of thiamine, a nutrient that is supplied in DM25!
Therefore, it is critical to take gene amplifications into account when scoring gene deletions.  

ALSO, are LTEE strains evolving thiamine auxotrophy?

In [12]:
## Genes that cobra's find_essential_genes reports for each model (with the
## media set above), stored as lists of gene-ID strings.
## NOTE(review): the K12 IDs print as Blattner numbers; the REL606 IDs are
## presumably locus tags (they are matched against `locus_tag` below) -- confirm.
K12_essential_genes = [str(x) for x in cobra.flux_analysis.find_essential_genes(K12_model)]
REL606_essential_genes = [str(x) for x in cobra.flux_analysis.find_essential_genes(REL606_model)]

In [13]:
## Print each model's essential gene IDs followed by how many there are.
print(K12_essential_genes)
print(len(K12_essential_genes))
print()
print(REL606_essential_genes)
print(len(REL606_essential_genes))
## quickly generate an input file for STIMS to see if there is purifying selection on these genes in the LTEE.
REL606_ids = pd.read_csv("../results/REL606_IDs.csv")
## Keep only the REL606 genes whose locus_tag is in the REL606 essential list.
REL606_metabolic_essential = REL606_ids[REL606_ids.locus_tag.isin(REL606_essential_genes)]
## Side effect: writes a CSV under ../results (the DataFrame index is written too,
## since to_csv's default index=True is not overridden).
REL606_metabolic_essential.to_csv("../results/metabolic-enzymes/for_initial_STIMS_test.csv",sep=",")

['b3939', 'b1208', 'b0386', 'b4177', 'b2020', 'b2585', 'b2022', 'b0134', 'b0004', 'b3804', 'b2687', 'b0009', 'b0774', 'b2751', 'b0475', 'b1069', 's0001', 'b3997', 'b3634', 'b0154', 'b2752', 'b3729', 'b0785', 'b0826', 'b0159', 'b2599', 'b2600', 'b2329', 'b0524', 'b0133', 'b1098', 'b3805', 'b4039', 'b2472', 'b0087', 'b3941', 'b3196', 'b2763', 'b1094', 'b2762', 'b3648', 'b4040', 'b3607', 'b2764', 'b1263', 'b1264', 'b0784', 'b3960', 'b3187', 'b3018', 'b0776', 'b3771', 'b3958', 'b0907', 'b3172', 'b0103', 'b0142', 'b0052', 'b3633', 'b1281', 'b0182', 'b3608', 'b4006', 'b3774', 'b2564', 'b0003', 'b4013', 'b0522', 'b3843', 'b0523', 'b2311', 'b2312', 'b2838', 'b0185', 'b0173', 'b2320', 'b2021', 'b0414', 'b0420', 'b0031', 'b3967', 'b3809', 'b2316', 'b0175', 'b3642', 'b3730', 'b3041', 'b0915', 'b3256', 'b0778', 'b3040', 'b3255', 'b1210', 'b3368', 'b0750', 'b2478', 'b2750', 'b2400', 'b1662', 'b1131', 'b0415', 'b3176', 'b0639', 'b2507', 'b1260', 'b0166', 'b2315', 'b1288', 'b1261', 'b3433', 'b2615', 

In [15]:
## Rows of the inactive-gene table whose Blattner number is in the K-12 essential list.
inactive_genes_in_50K_A_clones[inactive_genes_in_50K_A_clones.blattner.isin(K12_essential_genes)]

Unnamed: 0,Population,Gene,locus_tag,blattner,gene_length,product,start,end,strand
387,Ara-3,aldA,ECB_01370,b1415,1440,aldehyde dehydrogenase A NAD-linked,1457563,1459002,1


In [16]:
## Rows of the inactive-gene table whose locus tag is in the REL606 essential list.
inactive_genes_in_50K_A_clones[inactive_genes_in_50K_A_clones.locus_tag.isin(REL606_essential_genes)]

Unnamed: 0,Population,Gene,locus_tag,blattner,gene_length,product,start,end,strand
387,Ara-3,aldA,ECB_01370,b1415,1440,aldehyde dehydrogenase A NAD-linked,1457563,1459002,1
1373,Ara+2,tktB,ECB_02356,b2465,2004,transketolase 2 thiamin-binding,2500464,2502467,1


## TODO: Analyze minimal genomes. See Pal et al. (2006) for guidance.  
Examine variation in essential genes, based on the definition in the source code for cobra.flux_analysis.variability.