# ZFIN-8499 Reports


# New Download File

UniProt has released 2023_01, the first release of 2023. We are expecting this to reflect the changes from our previews in ZFIN-8376 which included a to_keep.csv file and to_delete.csv file.

As a first step, I ran our preload which downloads from https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/ and filters for zebrafish records and puts them in pre_zfin.dat.

To compare the to_keep.csv from before and the pre_zfin.dat files, I pulled just the uniprot IDs from each file and put them in the db.sqlite file in tables named "to_keep_ids" and "pre_zfin_ids" respectively.


# Preface 

## Section A: Comparisons

### i) pre_zfin count
pre_zfin has 52,331 unique uniprot IDs.

### ii) to_keep count
to_keep has 51,823 unique uniprot IDs.

### iii) IDs in pre_zfin, but not in to_keep
There are 3,331 IDs that appear in pre_zfin but not in to_keep.

### iv) IDs in to_keep, but not in pre_zfin
There are 2,823 IDs that appear in to_keep but not in pre_zfin.



## Next Steps

- Import tables from 8395 report and generate the same reports for the new data
- Convert pre_zfin.dat file to same format as to_keep.csv


# Queries supporting these results


## Initialize Database

In [1]:
# Copy the input database into the working directory so that the tables created by the cells below are written to the copy, not to inputs/db.sqlite.
!cp inputs/db.sqlite .

In [2]:
%reload_ext sql


In [3]:
%%sql
sqlite:///db.sqlite

## Database Queries

### Ai

In [4]:
%%sql

-- Ai - number of unique UniProt IDs loaded from pre_zfin.dat
select
    count(distinct id)
from pre_zfin_ids;

 * sqlite:///db.sqlite
Done.


count(distinct id)
52331


### Aii

In [5]:
%%sql

-- Aii - number of unique UniProt IDs loaded from to_keep.csv
select
    count(distinct id)
from to_keep_ids;

 * sqlite:///db.sqlite
Done.


count(distinct id)
51823


### Aiii

In [6]:
%%sql

-- Aiii - UniProt IDs present in pre_zfin_ids but absent from to_keep_ids.
-- NOT EXISTS is used instead of NOT IN so that a NULL id in to_keep_ids
-- cannot silently empty the result, and the id columns are compared
-- explicitly instead of relying on to_keep_ids having exactly one column.
create table "Aiii" as
select p.id
from pre_zfin_ids as p
where not exists (
    select 1
    from to_keep_ids as k
    where k.id = p.id
);
select count(*) from Aiii;


 * sqlite:///db.sqlite
Done.
Done.


count(*)
3331


### Aiv

In [7]:
%%sql

-- Aiv - UniProt IDs present in to_keep_ids but absent from pre_zfin_ids.
-- NOT EXISTS is used instead of NOT IN so that a NULL id in pre_zfin_ids
-- cannot silently empty the result, and the id columns are compared
-- explicitly instead of relying on pre_zfin_ids having exactly one column.
create table "Aiv" as
select k.id
from to_keep_ids as k
where not exists (
    select 1
    from pre_zfin_ids as p
    where p.id = k.id
);
select count(*) from Aiv;

 * sqlite:///db.sqlite
Done.
Done.


count(*)
2823


## Reports from 8395


In [9]:
# Remove any previous copy, then download and unpack the SQLite slice published with the 8395 notebook.
!rm -f zfin-db-slice.db
# --fail makes curl exit with an error on an HTTP error status instead of saving the error page as the .gz file.
!curl --fail -L -o zfin-db-slice.db.gz https://github.com/rtaylorzfin/8395-uniprot-notebook/raw/main/db/zfin-db-slice.db.gz
# -f overwrites an existing zfin-db-slice.db without prompting.
!gunzip -f zfin-db-slice.db.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 24.3M  100 24.3M    0     0  25.2M      0 --:--:-- --:--:-- --:--:-- 46.9M


In [None]:
%reload_ext sql

In [None]:
%%sql
sqlite:///zfin-db-slice.db

## Database Queries

Create a copy of to_keep for modifying

In [4]:
%%sql

-- Create a working copy of to_keep (the UniProt data) so the original table stays untouched
create table to_keep_m as select * from to_keep;

-- Remove the version suffix (the dot and the digits after it) from the RefSeq ids.
-- How it works - rtrim strips the trailing digits so the remaining text ends at the dot,
-- and LENGTH of that text counts the dot. With a start index of 0, SQLite substr returns
-- one character fewer than the requested length, so the dot is dropped as well.
-- NOTE(review) - the like filter matches a dot anywhere in the value. A value that contains
-- a dot but does not end in dot-plus-digits would lose its last character. Presumably no
-- such values exist in the data - confirm.
update to_keep_m set primary_id = substr(primary_id, 0, LENGTH(rtrim(primary_id, '0123456789'))) where database = 'RefSeq' and primary_id like '%.%';
update to_keep_m set secondary_id = substr(secondary_id, 0, LENGTH(rtrim(secondary_id, '0123456789'))) where database = 'RefSeq' and secondary_id like '%.%';
-- Index the normalized id columns (presumably to speed up lookups in later report queries - confirm)
create index primary_id_key on to_keep_m (primary_id);
create index secondary_id_key on to_keep_m (secondary_id);

-- Remove the version suffix from ensdarg in uniprot2ensembl (same substr and rtrim technique, applied in place)
update uniprot2ensembl set ensdarg = substr(ensdarg, 0, LENGTH(rtrim(ensdarg, '0123456789'))) where ensdarg like '%.%';

-- Remove the version suffix from refseq in refseq2ncbi (same substr and rtrim technique, applied in place)
update refseq2ncbi set refseq = substr(refseq, 0, LENGTH(rtrim(refseq, '0123456789'))) where refseq like '%.%';


 * sqlite:///zfin-db-slice.db
Done.
62344 rows affected.
62344 rows affected.
Done.
Done.
45704 rows affected.
55485 rows affected.


[]

In [5]:
%%sql

-- Explode the pipe-delimited ncbi_map.dbXrefs column into its own table, one
-- row per cross-reference, for ease of querying (id holds the GeneID, value
-- holds a single cross-reference).
-- Split recipe adapted from
-- https://stackoverflow.com/questions/51571854/how-to-split-comma-delimited-values-into-multiple-rows-using-sqlite
create table ncbi_xrefs as
with recursive pieces(id, value, remainder) as (
    -- Seed rows carry an empty value and the whole list plus a closing delimiter
    select GeneID, '', dbXrefs || '|'
    from ncbi_map
    union all
    -- Each step peels the text before the first delimiter off the remainder
    select
        id,
        substr(remainder, 0, instr(remainder, '|')),
        substr(remainder, instr(remainder, '|')+1)
    from pieces
    where remainder!=''
)
select id, value
from pieces
where value!='';

 * sqlite:///zfin-db-slice.db
Done.


[]

In [6]:
%%sql

-- Explode the pipe-delimited ncbi_map.dbXrefs column into its own table, one
-- row per cross-reference, then split each cross-reference into its source
-- organization (org) and its accession (acc).
-- Split recipe adapted from
-- https://stackoverflow.com/questions/51571854/how-to-split-comma-delimited-values-into-multiple-rows-using-sqlite
drop table if exists xrefs;
create table xrefs as
WITH RECURSIVE split(GeneID, dbXrefs, rest) AS (
   -- Seed rows carry an empty value and the whole list plus a closing delimiter
   SELECT GeneID, '', dbXrefs || '|' FROM ncbi_map
   UNION ALL SELECT
   GeneID,
   substr(rest, 0, instr(rest, '|')),
   substr(rest, instr(rest, '|')+1)
   FROM split WHERE rest!=''
)
SELECT GeneID, dbXrefs, '' as org, '' as acc
FROM split
WHERE dbXrefs!='';

-- Separate the organization and the accession into columns. Both updates intentionally touch every row.
-- acc is the text after the last colon. The inner rtrim strips every trailing non-colon character,
-- leaving the prefix up to and including the last colon, and the outer replace removes that prefix.
update xrefs set acc = replace(dbXrefs, rtrim(dbXrefs, replace(dbXrefs, ':', '')), '');
-- org is the text before that last colon. It is derived from the lengths rather than from
-- INSTR(dbXrefs, acc), because INSTR finds the first occurrence of acc and would split at the
-- wrong place when the accession text also appears earlier in the string. max() keeps the
-- length at zero for values without a colon, which leaves org empty as before.
update xrefs set org = substr(dbXrefs, 1, max(length(dbXrefs) - length(acc) - 1, 0));


 * sqlite:///zfin-db-slice.db
Done.
Done.
90831 rows affected.
90831 rows affected.


[]

## Section 1 Queries



# Export Excel Spreadsheet



In [1]:
import sqlite3
import pandas as pd

def main():
    """Export the ID tables and the two difference tables to an Excel workbook.

    Each table listed below is read through get_table_rows_as_data_frame and
    written to its own worksheet, named after the table, in
    'zfin_8499_report data.xlsx' in the current directory.
    """
    tables = ['pre_zfin_ids', 'to_keep_ids', 'Aiii', 'Aiv']

    # The context manager saves and closes the workbook even if reading or
    # writing one of the tables raises, so the file handle is never leaked.
    with pd.ExcelWriter('zfin_8499_report data.xlsx', engine='openpyxl') as writer:
        for table in tables:
            # Load all rows of the database table
            df = get_table_rows_as_data_frame(table)

            # Write the dataframe to a sheet named after the table.
            # sheet_name is passed by keyword because positional use is deprecated in pandas.
            df.to_excel(writer, sheet_name=table, index=False)


def get_table_rows_as_data_frame(tablename, db_path='db.sqlite'):
    """Load every row of one SQLite table into a pandas DataFrame.

    Args:
        tablename: Name of the table to read. It is quoted as an identifier,
            so mixed-case names and names containing double quotes work.
        db_path: Path of the SQLite database file. Defaults to 'db.sqlite',
            the file this function has always read.

    Returns:
        A DataFrame with one column per table column (named after it) and
        one row per table row, in the order SQLite returns them.

    Raises:
        sqlite3.OperationalError: If the table does not exist.
    """
    # Identifiers cannot be bound as query parameters, so the name is quoted
    # by hand; doubling embedded double quotes keeps it a single identifier.
    quoted_name = '"' + tablename.replace('"', '""') + '"'

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute('SELECT * FROM ' + quoted_name)
            results = cursor.fetchall()
            column_names = [description[0] for description in cursor.description]
        finally:
            cursor.close()
    finally:
        # Always release the database handle, including when the query fails
        conn.close()

    # Convert the results to a Pandas DataFrame
    return pd.DataFrame(results, columns=column_names)

main()

