# __2.1. Filter corpus to include only relevant sentences__

Goal:
- Filter plant science corpus to include only sentences that contain entities of interest.
- Use the filtered corpus to train another plantbert

Considerations:
- The vocab size is set to be 30_522 which seems large but most technical terms will not be frequent enough to be included. Just think about gene names alone, there will be more than 30k.
- Try to pre-enrich sentences that talk about the entities and terms of interest.
  - The entities and terms will include:
    - Terms in scientific dictionaries without breaking them into tokens
    - Symbol and names from TAIR, without breaking words into tokens
    - Any word in pathway annotations
    - Any word in gene ontology annotations
    - Any word in plant ontology annotations
  - Exclude 
    - Single letter words
    - Numerical terms
    - Frequent English words
       - [Kaggle English word frequency dataset](https://www.kaggle.com/datasets/rtatman/english-word-frequency)
  - Lower case all words

Log:
- 11/27/23
  - Collect entity/term data
    - Dictionary terms: from `projects/plant_sci_hist/_vocab`
    - Pathway annotation: from [Plant Metabolic Network](https://ftp.plantcyc.org/pmn/Pathways/Data_dumps/PMN15.5_January2023/pathways/)
    - GeneOntology

## ___Setup___

In [54]:
import wget, re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm import tqdm

#from urllib import request

In [19]:
# Root folder for this step; every vocabulary source lives under vocab_dir
work_dir   = Path.home() / "projects/plantbert/2_filtered_corpus"
vocab_dir  = work_dir / "_vocab"

####
# PMN related settings
####
# annot_dir is where the downloaded PMN pathway files are stored and read from
pmn_dir   = vocab_dir / "pmn"
annot_dir = pmn_dir / "annot"

# Did not use this directly due to issue with getting the full html
#pmn_ftp = "https://plantcyc-ftp.storage.googleapis.com/pmn/Pathways/Data_dumps/PMN15.5_January2023/pathways/"
# Browser-saved copy of the PMN file listing page
pmn_html = pmn_dir / "PMN_Files.htm"
# A line of pmn_html is treated as a download link when it contains both
# pmn_tag1 and pmn_tag2; the URL is the text between pmn_tag1 and pmn_tag3.
pmn_tag1 = "<a href=\""
pmn_tag2 = "https://plantcyc-ftp.storage.googleapis.com/"
pmn_tag3 = "\">"

# Whether to download the PMN files or not
# (the download cell only runs when this is 0)
pmn_downloaded = 1

# PMN name output files
pwy_dict_file = pmn_dir / "pmn_pathway_dict.tsv"
pro_dict_file = pmn_dir / "pmn_protein_dict.tsv"
gen_dict_file = pmn_dir / "pmn_gene_dict.tsv"

####
# TAIR related settings
####
# tair_gene_html: saved gene symbol page (input)
# tair_dict_file: symbol / full-name table written from that page (output)
tair_dir   = vocab_dir / "tair"
tair_gene_html = tair_dir / "tair_gene_symbol_list.html"
tair_dict_file = tair_dir / "tair_dict.tsv"

####
# Ontology related settings
####
# One item per line; go_file holds the GO items, po_file the PO items
ont_dir = vocab_dir / "ontologies"
go_file = ont_dir / "vocab_go.items"
po_file = ont_dir / "vocab_po.items"

####
# Oxford dictionary related settings
####
oxford_dir = vocab_dir / "oxford"


## ___Preprocess vocab___

### Pathway annotation

#### Process PMN html file

The PMN page has an unusual structure and wget does not retrieve the full page with the links:
- Download the full HTML via a browser and go from there.


In [3]:
# Download every PMN file linked from the saved HTML page. Skipped unless
# pmn_downloaded is 0, i.e. the files have not been fetched yet.
if pmn_downloaded == 0:
  # Make sure the target directory exists before the first download;
  # otherwise the path would not be recognised as a directory by wget.
  annot_dir.mkdir(parents=True, exist_ok=True)

  with open(pmn_html, "r") as f:
    # Stream the page line by line; there is no need to hold it all in memory
    for line in f:
      # A link line carries both the anchor opening and the storage host
      if pmn_tag1 in line and pmn_tag2 in line:
        # The URL sits between '<a href="' and the closing '">'
        url = line.split(pmn_tag1)[1].split(pmn_tag3)[0]
        wget.download(url, str(annot_dir))


#### Process downloaded pathway files

Get the pathway and gene names out

In [4]:
# Keep only regular files in annot_dir whose name carries the 20230103 stamp
pwy_files = list(
  filter(lambda entry: entry.is_file() and "20230103" in entry.name,
         annot_dir.iterdir())
)

len(pwy_files)

127

In [5]:
def remove_html_tags(txt, iter):
  """Strip simple HTML-style tags such as ``<i>...</i>`` from ``txt``.

  Args:
    txt: Text that may contain tags.
    iter: Unused. Kept so existing two-argument calls keep working (it
      carried the recursion depth when this function was recursive).

  Returns:
    ``txt`` with every ``<...>`` tag and its ``</...>`` counterpart removed.
    If a ``<`` is never followed by a ``>``, everything from that ``<``
    onward is dropped. Text without any ``<`` is returned as is, except
    that one trailing newline is removed.
  """
  if "<" not in txt:
    # NOTE: callers in this file pass raw lines from readlines() and expect
    # the line terminator to be gone for tag-free lines. Only a newline is
    # removed here, never a real character.
    return txt[:-1] if txt.endswith("\n") else txt

  # Each pass removes at least one tag, so the loop always terminates
  while True:
    idxL = txt.find("<")
    if idxL == -1:
      return txt

    # Look for the closing bracket *after* the opening one so that a stray
    # ">" earlier in the text is not mistaken for the end of the tag
    idxR = txt.find(">", idxL)
    # weird situation where the tag is not closed, rid of everything after
    if idxR == -1:
      return txt[:idxL]

    tag_bgn = txt[idxL:idxR+1]
    tag_end = "</" + tag_bgn[1:]

    # remove all occurrences of the opening tag and of its closing form
    txt = txt.replace(tag_bgn, "").replace(tag_end, "")

In [6]:
# Test remove_html_tags: repeated and nested tags should all be removed,
# leaving only the text between them.
txt = "d-<i>glucarate</i> <s><d>blah</d></s> <i>dehydrogenase</i> (NADP+)"
remove_html_tags(txt, 0)

'd-glucarate blah dehydrogenase (NADP+)'

In [7]:
def populate_dict(d, items):
  """Add one count to ``d`` for every usable name in ``items``.

  Args:
    d: Dictionary of {name: count}; updated in place.
    items: Iterable of names. Float items (NaN placeholders for missing
      values) and the literal string "unknown" are skipped.

  Returns:
    None. The counts are accumulated in ``d``.
  """
  for item in items:
    # Deal with item that is unknown or nan (float). isinstance is checked
    # first and also covers numpy float scalars, which subclass float.
    if isinstance(item, float) or item == "unknown":
      continue

    # rid of the html tags
    # e.g., D-<i>myo</i>-inositol (1,4,5)-trisphosphate biosynthesis
    if "<" in item:
      item = remove_html_tags(item, 0)

    d[item] = d.get(item, 0) + 1



In [8]:
# Name counters filled from the PMN pathway files:
# pwy_dict {Pathway-name: count}, pro_dict {Protein-name: count},
# gen_dict {Gene-name: count}
pwy_dict, pro_dict, gen_dict = {}, {}, {}

# Column of each pathway file paired with the counter it feeds
column_targets = (
  ("Pathway-name", pwy_dict),
  ("Protein-name", pro_dict),
  ("Gene-name", gen_dict),
)

for pwy_file in tqdm(pwy_files):
  df = pd.read_csv(pwy_file, sep="\t")
  # Only the distinct values of a column are counted for a given file
  for column, target in column_targets:
    populate_dict(target, df[column].unique())

len(pwy_dict), len(pro_dict), len(gen_dict)

100%|██████████| 127/127 [00:01<00:00, 88.61it/s]


(1369, 13492, 476480)

In [9]:
# Generate output files
def generate_tsv(d, output_file):
  """Write a {name: count} dictionary as a headerless two-column TSV.

  Rows are ordered by decreasing count; names with equal counts keep the
  order in which they appear in ``d``.

  Args:
    d: Dictionary of {name: count}. May be empty, in which case an empty
      file is written.
    output_file: Path of the TSV file to write.
  """
  # Building the frame from explicit (name, count) rows keeps the column
  # layout defined even when d has no entries
  df = pd.DataFrame(list(d.items()), columns=["name", "count"])
  df = df.set_index("name")
  # mergesort is stable, so the output does not vary between runs on ties
  df = df.sort_values(by="count", ascending=False, kind="mergesort")
  df.to_csv(output_file, sep="\t", header=False)

# Write the pathway, protein and gene name counts to their TSV files
for name_counts, tsv_path in (
  (pwy_dict, pwy_dict_file),
  (pro_dict, pro_dict_file),
  (gen_dict, gen_dict_file),
):
  generate_tsv(name_counts, tsv_path)

### Tokenize pathway terms

- Break sub-tokens with "-"
- Subtokens must be >1 characters long.
- Subtokens must not be roman numerals.
  - Check based on [this tutorial](https://dev.to/alexdjulin/a-python-regex-to-validate-roman-numerals-2g99)

In [80]:
# Upper-case Roman numeral pattern (from Alexandre Donciu-Julin), built once
# when the cell is run instead of inside every call
_ROMAN_NUMERAL_PATTERN = re.compile(r"""
          ^M{0,3}
          (CM|CD|D?C{0,3})?
          (XC|XL|L?X{0,3})?
          (IX|IV|V?I{0,3})?$
          """, re.VERBOSE)

def is_roman_number(txt):
  '''Return True if txt is an upper-case Roman numeral, False otherwise.

  Pattern from Alexandre Donciu-Julin. Lower-case input such as "iv" is
  not matched, and the empty string is not a numeral.
  '''
  # Every group of the pattern is optional, so the empty string would match
  # it; reject that case explicitly.
  if not txt:
    return False

  return _ROMAN_NUMERAL_PATTERN.match(txt) is not None

def tokenize(d, txt):
  """Split ``txt`` into sub-tokens and count them in ``d``.

  The text is split on whitespace and every piece is split again on "-".
  Parentheses and commas at either end of a sub-token are removed.
  Sub-tokens of one character or less, and Roman numerals, are not counted.

  Args:
    d: Dictionary of {sub-token: count}; updated in place.
    txt: Term or phrase to tokenize.

  Returns:
    None. The counts are accumulated in ``d``.
  """
  # split() without an argument also splits on tabs/newlines and never
  # yields empty pieces, so a trailing line terminator cannot end up
  # glued to the last token
  for token in txt.split():
    # split("-") gives [token] back when there is no hyphen
    for sub_token in token.split("-"):
      # rid of parentheses and commas at both ends in a single pass, so
      # mixed endings such as "(foo)," are fully cleaned regardless of order
      sub_token = sub_token.strip("(),")

      # longer than 1 characters and not a roman numeral
      if len(sub_token) > 1 and not is_roman_number(sub_token):
        d[sub_token] = d.get(sub_token, 0) + 1

In [81]:
# Sub-token counts over all pathway names: {sub-token: count}
pwy_token_dict = {}
# Iterating a dict yields its keys, i.e. the pathway names
for pwy in pwy_dict:
  tokenize(pwy_token_dict, pwy)


In [82]:
# Number of distinct pathway sub-tokens collected
len(pwy_token_dict)

1299

### TAIR gene info

Table format
- 7 columns
- Symbol, Symbol Full Name, Type, Status, Reference, Contact, Submitted

Approach:
- Encounter [UnicodeDecodeError: 'utf-8' codec can't decode byte](https://www.google.com/search?client=firefox-b-1-d&q=bs4+UnicodeDecodeError%3A+%27utf-8%27+codec+can%27t+decode+byte)
- Parse table: [post (did not work)](https://stackoverflow.com/questions/23377533/python-beautifulsoup-parsing-table), [another](https://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/)
- The table is terrible.. Instead of parsing from that, copy the page as text and process it.

#### Get table using bs4

In [20]:
# Read the saved TAIR page. errors='ignore' drops bytes that cannot be
# decoded as UTF-8 (presumably the workaround for the UnicodeDecodeError
# mentioned in the notes above).
with open(tair_gene_html, 'r', encoding='utf-8', errors='ignore') as f:
  soup = BeautifulSoup(f.read(), 'lxml')

# Keep the first <table> element found in the page
table = soup.find_all('table')[0]

In [21]:
# Confirm what kind of object find_all returned for the table
type(table)

bs4.element.Tag

In [34]:
# Check out table content and format: for every <tr> row print its index and
# number of <td> cells, then the index and text of each cell.

rows = table.find_all('tr')
for ridx, row in enumerate(rows):
  cols = row.find_all('td')
  print(ridx, "num cols:", len(cols))
  for cidx, col in enumerate(cols):
    # The text is wrapped in a list so that whitespace is printed escaped
    print("", cidx, [col.text])


0 num cols: 1
 0 ['\xa0']
1 num cols: 1
 0 ['']
2 num cols: 1
 0 [' Total:3275, Nov. 27, 2023 ']
3 num cols: 6
 0 ['\xa0']
 1 ['\xa0']
 2 ['\xa0']
 3 ['\xa0']
 4 ['\xa0']
 5 ['\xa0']
4 num cols: 1
 0 ['\xa0']
5 num cols: 13
 0 ['\n\n      \n      AAA \n']
 1 ['\xa0']
 2 [' \n    Activated Ac activity\xa0\n    ']
 3 ['\xa0']
 4 ['P']
 5 ['\xa0']
 6 ['OK']
 7 ['\xa0']
 8 ['\n      \n     Jarvis\n      \n      \xa0\n    ']
 9 ['\xa0']
 10 ['\n\n        Paul Jarvis  \n']
 11 ['\xa0']
 12 [' \n    1999-10-01\xa0 \n    ']
6 num cols: 13
 0 ['\n\n      \n      AAC \n']
 1 ['\xa0']
 2 [' \n    ADP/ATP Carrier 1\xa0\n    ']
 3 ['\xa0']
 4 ['G']
 5 ['\xa0']
 6 ['OK']
 7 ['\xa0']
 8 ['\n      \n      \xa0\n    ']
 9 ['\xa0']
 10 ['\n']
 11 ['\xa0']
 12 [' \n    2003-06-01\xa0 \n    ']
7 num cols: 13
 0 ['\n\n      \n      AACT \n']
 1 ['\xa0']
 2 [' \n    Anthocyanin-5-aromatic acyl transferase-like protein\xa0\n    ']
 3 ['\xa0']
 4 ['G']
 5 ['\xa0']
 6 ['OK']
 7 ['\xa0']
 8 ['\n      \n     Joh

#### Put symbols and names into a file and a dictionary

In [36]:
#new_table = pd.DataFrame(columns=range(0,7), index = [0]) # I know the size 

tair_dict = {} # {symbol or sname: count}
# The page was read as UTF-8, so write the table back with the same explicit
# encoding instead of the platform default
with open(tair_dict_file, "w", encoding="utf-8") as f:
  # for each row
  for row in table.find_all('tr'):
    # get all columns
    cols = row.find_all('td')

    # skip if not 13 columns (data rows are the ones with 13 cells; the
    # other rows are headers and spacers)
    if len(cols) != 13:
      continue

    # only care about col 0 (symbol) and 2 (symbol full name)
    symbol = cols[0].get_text().strip()
    sname  = cols[2].get_text().strip()
    f.write(f"{symbol}\t{sname}\n")

    # Count both strings; a blank cell is not a term and is left out of
    # the dictionary (the row is still written to the file above)
    for term in (symbol, sname):
      if term:
        tair_dict[term] = tair_dict.get(term, 0) + 1

In [37]:
# Number of distinct TAIR symbols and full names collected
len(tair_dict)

6354

### Ontologies

#### Tokenize GO terms and populate GO dictionary

In [87]:
# Read the GO items with a context manager so the file handle gets closed
with open(go_file, "r") as f:
  go_items = f.readlines()

go_dict = {} # {go term: count}
for item in go_items:
  # Drop the line terminator explicitly so that the last word of every line,
  # including a final line without a newline, is tokenized intact
  item = item.rstrip("\n")
  # Only items that contain markup need tag removal
  if "<" in item:
    item = remove_html_tags(item, 0)
  tokenize(go_dict, item)

In [88]:
# Number of distinct GO sub-tokens collected
len(go_dict)

23629

In [89]:
# Display the GO sub-token counts
go_dict

{'mitochondrion': 231,
 'inheritance': 16,
 'mitochondrial': 1092,
 'genome': 105,
 'maintenance': 877,
 'reproduction': 64,
 'reproductive': 46,
 'physiological': 75,
 'process': 13502,
 'obsolete': 2949,
 'ribosomal': 219,
 'chaperone': 173,
 'activity': 39049,
 'high': 242,
 'affinity': 165,
 'zinc': 151,
 'transmembrane': 1620,
 'transporter': 1516,
 'uptake': 434,
 'low': 321,
 'ion': 1911,
 'thioredoxin': 55,
 'alpha': 2584,
 '1,6': 214,
 'mannosyltransferase': 110,
 'trans': 351,
 'hexaprenyltranstransferase': 1,
 'vacuole': 315,
 'single': 243,
 'strand': 263,
 'break': 157,
 'repair': 387,
 'stranded': 109,
 'DNA': 3463,
 'endodeoxyribonuclease': 91,
 'specific': 1021,
 'ssDNA': 19,
 'phosphopyruvate': 18,
 'hydratase': 128,
 'complex': 6150,
 'enolase': 8,
 'lactase': 4,
 'phlorizin': 4,
 'hydrolase': 742,
 'lactose': 107,
 'galactohydrolase': 20,
 'glucoside': 176,
 'transport': 3495,
 'regulation': 52236,
 'of': 85614,
 'recombination': 266,
 'mitotic': 1738,
 'within': 195

#### Tokenize PO terms and populate PO dictionary

In [38]:

po_items = open(po_file, "r").readlines()