In [1]:
import collections
import os
import json
import logging
import string
import re

from scipy.stats import entropy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import networkx as nx

# When the kernel starts in a directory whose path ends with 'notebook', step up
# one level; later cells build paths such as 'data/db/seq.db' from os.getcwd().
if os.getcwd().endswith('notebook'):
    os.chdir('..')

In [2]:
# Plot styling: seaborn's 'colorblind' palette with fonts scaled by 1.3.
sns.set(palette='colorblind', font_scale=1.3)
# Snapshot of the current seaborn color palette (set on the line above).
palette = sns.color_palette()
# Root logging at INFO level; each record is formatted as
# "<timestamp> (<LEVEL>) <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s (%(levelname)s) %(message)s")
logger = logging.getLogger(__name__)

In [3]:
# Database file path, resolved against the current working directory.
db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
# SQLAlchemy engine for that file, using the sqlite dialect with the pysqlite driver.
engine = create_engine(f'sqlite+pysqlite:///{db_path}')

## Load PFAM file

In [30]:
def load_pfam_file(path, skiplines=4, n_cols=19, max_line_nb=10):
    """Print a preview of the rows parsed from a whitespace-delimited Pfam hits table.

    Every data line must consist of exactly ``n_cols`` whitespace-separated
    tokens. The first token is expected to have the shape
    ``<prefix>@<protein_id>$<genome_accession>``; the third and fourth tokens
    are read as the Pfam query name and the Pfam accession, and the last token
    as the protein label (a bare ``'-'`` is turned into ``None``).

    For each parsed line the list
    ``[genome_accession, protein_id, pfam_query, pfam_accession, protein_label]``
    is printed. The function returns nothing.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    skiplines : int, default 4
        1-based number of the first line that is parsed; every line with a
        smaller number is ignored (so the default ignores lines 1 to 3).
    n_cols : int, default 19
        Exact number of tokens a data line must contain.
    max_line_nb : int or None, default 10
        Reading stops right after handling the first line whose 1-based number
        is greater than this value (with the default, line 11 is the last one
        handled). Pass ``None`` to read the whole file.

    Raises
    ------
    ValueError
        If a non-comment, non-blank line does not have exactly ``n_cols``
        tokens, or if its first token does not split into exactly two parts
        on ``'$'`` and then on ``'@'``.
    """
    # Compiled once instead of being rebuilt/looked up for every line.
    # `$` also matches just before the trailing newline, so lines need no stripping.
    row_pattern = re.compile('^' + r'\s+'.join([r'([^\s]+)'] * n_cols) + '$')

    with open(path, 'r') as f:
        for line_nb, line in enumerate(f, start=1):
            if line_nb < skiplines:
                continue

            stripped = line.strip()
            # Blank lines and '#' comment lines (e.g. a trailer at the end of
            # the table) hold no row data; skip them instead of crashing.
            if stripped and not stripped.startswith('#'):
                match = row_pattern.match(line)
                if match is None:
                    raise ValueError(
                        f'{path}: line {line_nb} does not have exactly '
                        f'{n_cols} whitespace-separated columns'
                    )
                row = match.groups()

                # First token: <prefix>@<protein_id>$<genome_accession>.
                # Unpacking raises ValueError when a separator is missing or repeated.
                target, genome_accession = row[0].split('$')
                _, protein_id = target.split('@')
                protein_label = row[-1] if row[-1] != '-' else None

                data_row = [
                    genome_accession,
                    protein_id,
                    row[2],  # Pfam query name
                    row[3],  # Pfam accession
                    protein_label,
                ]
                print(data_row)

            # Preview limit; checked after the line has been handled so the
            # default keeps stopping after line 11, exactly as before.
            if max_line_nb is not None and line_nb > max_line_nb:
                break

In [31]:
# Input table, resolved against the current working directory
# (judging by the file name, presumably a tblout-style table of Pfam hits — confirm).
path = os.path.join(os.getcwd(), 'data/Large_EBMC_Bact_DB/concat_bact95_hclust05_vs_Pfam_5_15_tbloutm.txt')
# Prints the parsed rows; the function itself returns nothing.
load_pfam_file(path)

['GCA_900115545.1', 'SFO79740.1', 'DUF3492', 'PF11997.8', 'Glycosyltransferase-involved-in-cell-wall-bisynthesis']
['GCA_002298975.1', 'DBMV01000004.1-16', 'DUF3492', 'PF11997.8', None]
['GCA_000620465.1', 'KK211140.1-377', 'DUF3492', 'PF11997.8', None]
['GCA_000955795.1', 'KJJ94947.1', 'DUF3492', 'PF11997.8', 'glycosyl-transferase-family-1']
['GCA_003851545.1', 'GCB03505.1', 'DUF3492', 'PF11997.8', 'pellicle/biofilm-biosynthesis-glycosyltransferase-PelF']
['GCA_002332825.1', 'DDBR01000017.1-226', 'DUF3492', 'PF11997.8', None]
['GCA_000020205.1', 'ACD27603.1', 'DUF3492', 'PF11997.8', 'glycosyl-transferase-group-1']
['GCA_001634945.1', 'KZY35435.1', 'DUF3492', 'PF11997.8', 'glycosyl-transferase-family-1']


In [19]:
# Fetch the metadata JSON blobs stored for a single assembly.
sequence_query = """
select metadata_json from sequences where assembly_accession = 'GCA_000008545.1'
"""
df = pd.read_sql(sequence_query, engine)
metadata_json = df['metadata_json'].values
len(metadata_json)

# Collect the distinct, whitespace-trimmed protein ids found in those blobs,
# ignoring null blobs and blobs whose 'protein_id' is missing or null.
protein_id_set = set()
for m in metadata_json:
    if m is None:
        continue
    metadata = json.loads(m)
    protein_id = metadata.get('protein_id')
    if protein_id is None:
        continue
    protein_id_set.add(protein_id.strip())

len(protein_id_set)

1846

In [23]:
# Check whether this protein id is among the ids collected in protein_id_set
# (presumably a spot check of an id expected for this assembly — confirm).
'AAD36476.1' in protein_id_set

True