In [86]:
import collections
import os
import json
import logging

from scipy.stats import entropy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import networkx as nx

# Step up one directory when the working directory path ends in 'notebook';
# later cells build their data paths from os.getcwd().
if os.getcwd().endswith('notebook'):
    os.chdir('..')

from rna_learn.alphabet import ALPHABET_DNA, CODON_REDUNDANCY

In [87]:
# Logging: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(format="%(asctime)s (%(levelname)s) %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Plot styling: 'colorblind' palette with enlarged fonts; keep the active
# palette around so later cells can pick individual colours from it.
sns.set(font_scale=1.3, palette='colorblind')
palette = sns.color_palette()

In [88]:
# SQLite database file, located relative to the current working directory.
db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
db_url = f'sqlite+pysqlite:///{db_path}'
engine = create_engine(db_url)

## Load codon bias

In [89]:
# One row per assembly, left-joined to the growth temperature recorded for
# its species in species_traits.
temperature_query = """
select s.assembly_accession, t.species_taxid, t.growth_tmp from assembly_source as s
left join (
    select species_taxid, growth_tmp from species_traits
) as t
on s.species_taxid = t.species_taxid
"""
temperatures_df = pd.read_sql(sql=temperature_query, con=engine)

# Collapse to a single row per species_taxid, keeping the first growth_tmp
# of each group (pandas' `first` skips nulls by default).
species_temperatures_df = (
    temperatures_df
    .loc[:, ['species_taxid', 'growth_tmp']]
    .groupby('species_taxid')
    .first()
    .reset_index()
)

In [90]:
# CSV of per-species codon ratios, located relative to the working directory.
species_codon_ratios_path = os.path.join(os.getcwd(), 'data/species_codon_ratios.csv')

# Genome size plus the taxonomy columns of every species in species_traits.
q = """
select 
    genome_size, species_taxid, species, genus, family, "order", class, phylum, superkingdom 
from species_traits
"""

# Two successive inner joins on species_taxid: a species is kept only if it
# appears in the CSV, in the temperature frame, and in species_traits.
species_codon_df = (
    pd.read_csv(species_codon_ratios_path)
    .merge(species_temperatures_df, how='inner', on='species_taxid')
    .merge(pd.read_sql(q, engine), how='inner', on='species_taxid')
)
species_codon_df.head()

Unnamed: 0,species_taxid,in_test_set,AAA_ratio,AAG_ratio,AAT_ratio,AAC_ratio,ACT_ratio,ACC_ratio,ACA_ratio,ACG_ratio,...,TTT_ratio,growth_tmp,genome_size,species,genus,family,order,class,phylum,superkingdom
0,7,True,0.08255,0.91745,0.327724,0.672276,0.024511,0.575353,0.036262,0.363875,...,0.10282,30.0,5369771.5,Azorhizobium caulinodans,Azorhizobium,Xanthobacteraceae,Rhizobiales,Alphaproteobacteria,Proteobacteria,Bacteria
1,9,False,0.919627,0.080373,0.858168,0.141832,0.454569,0.050026,0.446018,0.049387,...,0.919776,24.0,601699.243,Buchnera aphidicola,Buchnera,Erwiniaceae,Enterobacterales,Gammaproteobacteria,Proteobacteria,Bacteria
2,11,True,0.007302,0.992698,0.01073,0.98927,0.008586,0.425527,0.015789,0.550098,...,0.005406,26.0,3526440.8,Cellulomonas gilvus,Cellulomonas,Cellulomonadaceae,Micrococcales,Actinobacteria,Actinobacteria,Bacteria
3,14,True,0.664866,0.335134,0.775571,0.224429,0.458203,0.188052,0.304183,0.049563,...,0.810559,74.15,1959987.6,Dictyoglomus thermophilum,Dictyoglomus,Dictyoglomaceae,Dictyoglomales,Dictyoglomia,Dictyoglomi,Bacteria
4,19,True,0.55181,0.44819,0.440559,0.559441,0.099134,0.591478,0.097796,0.211592,...,0.497451,30.0,3722544.667,Pelobacter carbinolicus,Pelobacter,Desulfuromonadaceae,Desulfuromonadales,Deltaproteobacteria,Proteobacteria,Bacteria


In [91]:
# Load the whole species_traits table and list the columns it provides.
traits_query = 'select * from species_traits'
traits_df = pd.read_sql(traits_query, engine)
traits_df.columns

Index(['species_taxid', 'species', 'genus', 'family', 'order', 'class',
       'phylum', 'superkingdom', 'gram_stain', 'metabolism', 'pathways',
       'carbon_substrates', 'sporulation', 'motility', 'range_tmp',
       'range_salinity', 'cell_shape', 'isolation_source', 'd1_lo', 'd1_up',
       'd2_lo', 'd2_up', 'doubling_h', 'genome_size', 'gc_content',
       'coding_genes', 'optimum_tmp', 'optimum_ph', 'growth_tmp',
       'rRNA16S_genes', 'tRNA_genes', 'gram_stain.count', 'metabolism.count',
       'pathways.count', 'carbon_substrates.count', 'sporulation.count',
       'motility.count', 'range_tmp.count', 'range_salinity.count',
       'cell_shape.count', 'isolation_source.count', 'gram_stain.prop',
       'metabolism.prop', 'pathways.prop', 'carbon_substrates.prop',
       'sporulation.prop', 'motility.prop', 'range_tmp.prop',
       'range_salinity.prop', 'cell_shape.prop', 'isolation_source.prop',
       'd1_lo.count', 'd1_up.count', 'd2_lo.count', 'd2_up.count',
       'doubl

In [92]:
# Preview the first rows of a hand-picked subset of trait columns.
trait_preview_columns = [
    'gram_stain', 'metabolism', 'pathways',
    'carbon_substrates', 'sporulation', 'motility',
    'range_salinity', 'cell_shape', 'isolation_source',
    'doubling_h', 'optimum_ph', 'd1_lo', 'd1_up',
    'd2_lo', 'd2_up',
]
traits_df.loc[:, trait_preview_columns].head()

Unnamed: 0,gram_stain,metabolism,pathways,carbon_substrates,sporulation,motility,range_salinity,cell_shape,isolation_source,doubling_h,optimum_ph,d1_lo,d1_up,d2_lo,d2_up
0,negative,aerobic,,,no,,,,host_plant,,,,,,
1,negative,,,,no,,,coccobacillus,host_animal_ectotherm,35.4,,3.0,,,
2,positive,facultative,cellulose_degradation,,no,yes,,bacillus,host_animal_endotherm,,,,,,
3,negative,anaerobic,cellulose_degradation,,no,no,,bacillus,water_hotspring,2.47,,0.4,0.6,5.0,20.0
4,negative,anaerobic,"sulfur_reduction, iron_reduction, fermentation",,no,,,bacillus,petroleum,,,0.5,0.7,1.2,3.0


In [114]:
# Count of non-null species_taxid values within each range_salinity category.
traits_df.groupby('range_salinity')[['species_taxid']].count()

Unnamed: 0_level_0,species_taxid
range_salinity,Unnamed: 1_level_1
extreme-halophilic,17
halophilic,1
halotolerant,3
moderate-halophilic,125
non-halophilic,160
stenohaline,1


In [115]:
# Summary statistics of the tRNA_genes column.
traits_df.loc[:, 'tRNA_genes'].describe()

count    2007.000000
mean       58.387014
std        19.089969
min        17.000000
25%        46.000000
50%        53.000000
75%        66.500000
max       177.667000
Name: tRNA_genes, dtype: float64

## Regression

In [82]:
# Features: every column whose name ends in '_ratio'.
ratio_columns = [name for name in species_codon_df.columns if name.endswith('_ratio')]

# Regression target; rows where the target is missing are dropped.
trait_name = 'genome_size'
df = species_codon_df.loc[species_codon_df[trait_name].notna()].reset_index(drop=True)

# Split rows using the in_test_set flag already present in the frame.
is_test = df['in_test_set']
train_df = df[~is_test]
test_df = df[is_test]

x_train, y_train = train_df[ratio_columns].values, train_df[trait_name].values
x_test, y_test = test_df[ratio_columns].values, test_df[trait_name].values

print(len(x_train), len(x_test))

1953 530


In [83]:
def fit_linear_regression(y_actual, x):
    """Fit an ordinary least squares model of ``y_actual`` on ``x``.

    The regressors are passed through ``sm.add_constant`` (statsmodels
    defaults) before fitting, and the same augmented matrix is used to
    compute the returned predictions.

    Parameters
    ----------
    y_actual : array-like
        Response values, handed to ``sm.OLS`` as the dependent variable.
    x : array-like
        Regressors, handed to ``sm.add_constant``.

    Returns
    -------
    tuple
        ``(results, y_fit)``: the fitted statsmodels results object and its
        predictions on the design matrix used for fitting.
    """
    design = sm.add_constant(x)
    fitted = sm.OLS(y_actual, design).fit()
    return fitted, fitted.predict(design)

In [84]:
# Fit OLS on the training split. The fitted values are bound to a real name
# instead of `_`: assigning `_` in IPython/Jupyter shadows the "last output"
# variable and stops the shell from updating it for the rest of the session.
results, y_train_fit = fit_linear_regression(y_train, x_train)

In [85]:
# Display the statsmodels summary tables for the training-split fit above.
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.518
Model:,OLS,Adj. R-squared:,0.507
Method:,Least Squares,F-statistic:,47.77
Date:,"Thu, 12 Nov 2020",Prob (F-statistic):,3.11e-267
Time:,14:43:43,Log-Likelihood:,-30355.0
No. Observations:,1953,AIC:,60800.0
Df Residuals:,1909,BIC:,61040.0
Df Model:,43,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.365e+05,2.28e+04,19.162,0.000,3.92e+05,4.81e+05
x1,-2.434e+05,2.71e+05,-0.899,0.369,-7.75e+05,2.88e+05
x2,6.799e+05,2.69e+05,2.532,0.011,1.53e+05,1.21e+06
x3,-9.194e+04,4.68e+05,-0.196,0.844,-1.01e+06,8.27e+05
x4,5.284e+05,4.66e+05,1.133,0.257,-3.86e+05,1.44e+06
x5,-1.066e+06,8.15e+05,-1.308,0.191,-2.66e+06,5.32e+05
x6,5.671e+05,6.28e+05,0.903,0.366,-6.64e+05,1.8e+06
x7,1.951e+06,6.63e+05,2.943,0.003,6.51e+05,3.25e+06
x8,-1.016e+06,6.57e+05,-1.546,0.122,-2.3e+06,2.73e+05

0,1,2,3
Omnibus:,438.61,Durbin-Watson:,1.589
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1506.425
Skew:,1.09,Prob(JB):,0.0
Kurtosis:,6.709,Cond. No.,4.14e+16
