## Add missing lineages

Initially, the `tax_lineage` table only contained lineage information for nodes with rank = 'species', so there were taxon nodes in the `tax_nodes` table that didn't have any lineage information despite NCBI having that data.

This notebook is based on Artem's `230410_Analyze_numuass.ipynb`. This was used for a single time update of lineage data in pSQL, then an update of all Taxon nodes in the Neo4j graph db.

In [1]:
# Notebook config
import sys
if '../' not in sys.path:
    sys.path.append("../")
%load_ext dotenv
%dotenv

from queries import serratus_queries, graph_queries
import pandas as pd
from datasources import psql
import psycopg2
import os


In [5]:
taxon_nodes = serratus_queries.get_taxon_df()

Reading local cached file /mnt/graphdata/query_cache/sql_taxon_nodes.csv


In [3]:
# for each taxon node, get the lineage
!awk -F "\"*,\"*" '{print $2}' /mnt/graphdata/query_cache/sql/taxon_nodes.csv \
    | ~/workspace/misc/taxons/taxonkit reformat -I 1 -f '"{k}","{p}","{o}","{f}","{g}","{s}"' \
    | sed 's/\t/,/g' - \
    > /mnt/graphdata/ncbi-data/lineage_dump.csv

!sed -i "1s/.*/tax_id,tax_kingdom,tax_phylum,tax_order,tax_family,tax_genus,tax_species/" /mnt/graphdata/ncbi-data/lineage_dump.csv


16:32:27.474 [33m[WARN][0m invalid TaxId: tax_id
16:32:27.498 [33m[WARN][0m taxid 11103 was merged into 3052230
16:32:27.498 [33m[WARN][0m taxid 11191 was merged into 3052731
16:32:27.498 [33m[WARN][0m taxid 11269 was merged into 3052505
16:32:27.498 [33m[WARN][0m taxid 11215 was merged into 3052729
16:32:27.498 [33m[WARN][0m taxid 11232 was merged into 3052342
16:32:27.499 [33m[WARN][0m taxid 11620 was merged into 3052310
16:32:27.499 [33m[WARN][0m taxid 11623 was merged into 3052303
16:32:27.499 [33m[WARN][0m taxid 11628 was merged into 3052317
16:32:27.499 [33m[WARN][0m taxid 11629 was merged into 3052320
16:32:27.499 [33m[WARN][0m taxid 11631 was merged into 3052328
16:32:27.500 [33m[WARN][0m taxid 12330 was merged into 3052767
16:32:27.500 [33m[WARN][0m taxid 12331 was merged into 3052763
16:32:27.500 [33m[WARN][0m taxid 12332 was merged into 3052764
16:32:27.509 [33m[WARN][0m taxid 31604 was merged into 3052343
16:32:27.515 [33m[WARN][0m taxid 3530

In [6]:
lineage_df = pd.read_csv('/mnt/graphdata/ncbi-data/lineage_dump.csv')
lineage_df.head()

Unnamed: 0,tax_id,tax_kingdom,tax_phylum,tax_order,tax_family,tax_genus,tax_species
0,1,,,,,,
1,2,Bacteria,,,,,
2,6,Bacteria,Pseudomonadota,Hyphomicrobiales,Xanthobacteraceae,Azorhizobium,
3,7,Bacteria,Pseudomonadota,Hyphomicrobiales,Xanthobacteraceae,Azorhizobium,Azorhizobium caulinodans
4,9,Bacteria,Pseudomonadota,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola


In [5]:
def get_write_connection():
    return psycopg2.connect(
        database="summary",
        host="serratus-aurora-20210406.cluster-ro-ccz9y6yshbls.us-east-1.rds.amazonaws.com",
        user=os.environ.get('SQL_WRITE_USER'),
        password=os.environ.get('SQL_WRITE_PASSWORD'),
        port="5432")


`row_id` is required but is not a serial value, instead we alter table to use a sequence for `row_id`

```sql
CREATE SEQUENCE row_id_seq;
ALTER TABLE public.tax_lineage ALTER COLUMN row_id SET DEFAULT nextval('row_id_seq');
ALTER TABLE public.tax_lineage ALTER COLUMN row_id SET NOT NULL;
ALTER SEQUENCE row_id_seq OWNED BY public.tax_lineage.row_id;
SELECT MAX(row_id) FROM public.tax_lineage;
-- Manually copy value (denoted by $MAX_ROW_ID)
SELECT setval('row_id_seq', $MAX_ROW_ID);
```

In [7]:
# 2050294 nodes
# 2501873 lineages

def get_missing_tax_ids():
    conn = psql.get_serratus_connection()
    cursor = conn.cursor()
    query = """
        SELECT tax_id 
        FROM   tax_nodes l 
        WHERE  NOT EXISTS (
            SELECT  -- SELECT list mostly irrelevant; can just be empty in Postgres
            FROM   tax_lineage
            WHERE  tax_id = l.tax_id
        );
    """
    cursor.execute(query)
    df = pd.DataFrame(cursor.fetchall(), columns=['tax_id'])
    cursor.close()
    conn.close()
    return df

missing_tax_ids = get_missing_tax_ids()
missing_lineages = missing_tax_ids.merge(lineage_df, on='tax_id', how='left')
missing_lineages = missing_lineages.replace({pd.np.nan: None})
print(lineage_df.shape)
print(missing_tax_ids.shape)
print(missing_lineages.shape)
print(missing_lineages.head())

(2501873, 7)
(0, 1)
(0, 7)
Empty DataFrame
Columns: [tax_id, tax_kingdom, tax_phylum, tax_order, tax_family, tax_genus, tax_species]
Index: []


  missing_lineages = missing_lineages.replace({pd.np.nan: None})


In [28]:
# Workaround for row_id pkey which is not autoincrementing serial value
# TODO: alter table so that row_id is an autoincrementing serial Integer
def get_max_row_id():
    conn = psql.get_serratus_connection()
    cursor = conn.cursor()
    query = """
        SELECT max(CAST(row_id as Int)) FROM tax_lineage;
    """
    cursor.execute(query)
    out = cursor.fetchone()[0]
    cursor.close()
    conn.close()
    return int(out)

max_row_id = get_max_row_id()
print(max_row_id)

2050294


In [42]:
conn = get_write_connection()
cursor = conn.cursor()

errors = []
cur_row_id = max_row_id + 1
for row in missing_lineages.reset_index().to_dict('rows'):
    try:
        query = """
            INSERT into public.tax_lineage(row_id, tax_id, tax_kingdom, tax_phylum, tax_order, tax_family, tax_genus, tax_species) 
            VALUES(%s, %s, %s, %s, %s, %s, %s, %s)
        """  
        args = (
            cur_row_id,
            row['tax_id'], row['tax_kingdom'], row['tax_phylum'], 
            row['tax_order'], row['tax_family'], row['tax_genus'], 
            row['tax_species']
        )
        out = cursor.execute(query, args)
        out = conn.commit()
        cur_row_id += 1
    except Exception as e:
        errors.append(row)
        print(e)
        conn.rollback()

cursor.close()
conn.close()

assert len(errors) == 0

  for row in missing_lineages.reset_index().to_dict('rows'):


In [4]:
!rm /mnt/graphdata/query_cache/sql/taxon_nodes.csv

In [8]:
df_taxon = serratus_queries.get_taxon_df()

Reading local cached file /mnt/graphdata/query_cache/sql_taxon_nodes.csv


In [9]:
graph_queries.add_taxon_nodes(df_taxon)

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
