# Keyword Filter

This notebook generates keyword-filtered versions of the pre-filtered datasets (those filtered by evidence duplicates). More specifically, for each task, a list of keywords is created first. Then, all text-triple pairs that contain one of these keywords in their evidence are filtered out. 

In [22]:
# Imports 
import getpass
import os
import sys
import time

import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd 

from stonkgs.constants import (
    CELL_LINE_DIR,
    CELL_TYPE_DIR,
    DISEASE_DIR,
    LOCATION_DIR,
    MISC_DIR,
    ORGAN_DIR,
    SPECIES_DIR,
    RELATION_TYPE_DIR,
)

Record details

In [3]:
print(getpass.getuser())
print(sys.version)
print(time.asctime())

hbalabin
3.8.8 (default, Feb 24 2021, 21:46:12) 
[GCC 7.3.0]
Mon Jun 21 10:30:08 2021


## 1. Read the datasets

In [36]:
cell_line = pd.read_csv(os.path.join(CELL_LINE_DIR, 'cell_line_no_duplicates.tsv'), sep="\t")
cell_type = pd.read_csv(os.path.join(CELL_TYPE_DIR, 'cell_type_no_duplicates.tsv'), sep="\t")
disease = pd.read_csv(os.path.join(DISEASE_DIR, 'disease_no_duplicates.tsv'), sep="\t")
location = pd.read_csv(os.path.join(LOCATION_DIR, 'location_no_duplicates.tsv'), sep="\t")
organ = pd.read_csv(os.path.join(ORGAN_DIR, 'organ_no_duplicates.tsv'), sep="\t")
species = pd.read_csv(os.path.join(SPECIES_DIR, 'species_no_duplicates.tsv'), sep="\t")
relation_type = pd.read_csv(os.path.join(RELATION_TYPE_DIR, 'relation_type_no_duplicates.tsv'), sep="\t")

## 2. Create task specific "blacklists" of keywords

In [39]:
cell_line_dict = {
    '3684': 'HeLa cell',  
    '7606': 'MCF7 cell', 
    '2795': 'DMS 114 cell', 
    '1230': 'HEK293 cell', 
    '8172': 'NIH-3T3 cell', 
    '3704': 'Hep G2 cell', 
    '9465': 'U-937 cell', 
    '7365': 'LNCAP cell', 
    '2596': 'COS-1 cell', 
    '9348': 'THP-1 cell',
}
cell_line_blacklist = list(cell_line_dict.values())
cell_line_pat = r'(\b{}\b)'.format('|'.join(cell_line_blacklist))

cell_type_dict = {
    '7': 'early embryonic cell (metazoa)', 
    '57': 'fibroblast', 
    '235': 'macrophage', 
    '567': 'polymodal nocireceptor', 
    '938': 'CD56-bright natural killer cell', 
    '115': 'endothelial cell', 
    '150': 'glandular epithelial cell', 
    '192': 'smooth muscle cell', 
    '136': 'fat cell', 
    '182': 'hepatocyte',
}
cell_type_blacklist = list(cell_type_dict.values())
cell_type_pat = r'(\b{}\b)'.format('|'.join(cell_type_blacklist))

disease_dict = {
    '1324': 'lung cancer', 
    '1936': 'atherosclerosis', 
    '1612': 'breast cancer', 
    '9538': 'multiple myeloma', 
    '1240': 'leukemia', 
    '219': 'colon cancer', 
    '1909': 'melanoma', 
    '769': 'neuroblastoma', 
    '3908': 'lung non-small cell carcinoma', 
    '3347': 'osteosarcoma',
}
disease_blacklist = list(disease_dict.values())
disease_pat = r'(\b{}\b)'.format('|'.join(disease_blacklist))

location_dict = {
    'D002467': 'Cell Nucleus', 
    'D002462': 'Cell Membrane', 
    'D003593': 'Cytoplasm', 
    'D005109': 'Extracellular Matrix', 
    'D005110': 'Extracellular Space',
}
location_blacklist = list(location_dict.values())
location_pat = r'(\b{}\b)'.format('|'.join(location_blacklist))

organ_dict = {
    '2048': 'lung', 
    '2107': 'liver', 
    '1986': 'endothelium', 
    '1134': 'skeletal muscle tissue',
    '483': 'epithelium', 
    '947': 'aorta', 
    '310': 'breast', 
    '142': 'mechanosensory system',  # cross reference from BILA 
     # https://www.ebi.ac.uk/ols/ontologies/uberon/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FUBERON_0007037
    '2367': 'prostate gland',
    '948': 'heart',
}
organ_blacklist = list(organ_dict.values())
organ_pat = r'(\b{}\b)'.format('|'.join(organ_blacklist))

species_blacklist = ["rat", "rats", "human", "humans", "mouse", "mice", "homo sapiens", "rodents", "rattus", "mus"]
species_pat = r'(\b{}\b)'.format('|'.join(species_blacklist))

relation_type_blacklist = list(set(np.unique(relation_type['interaction'])).union(
    set(np.unique(relation_type['polarity']))))
relation_type_blacklist = [term.replace("_", " ") for term in relation_type_blacklist]
relation_type_pat = r'(\b{}\b)'.format('|'.join(relation_type_blacklist))
print(relation_type_pat)

(\bindirect interaction|up|direct interaction|down\b)


## 3. Filter by the pattern created with the keywords (case insensitive)

In [42]:
cell_line_new = cell_line[~cell_line["evidence"].str.contains(cell_line_pat, case=False, na=False)]
cell_type_new = cell_type[~cell_type["evidence"].str.contains(cell_type_pat, case=False, na=False)]
disease_new = disease[~disease["evidence"].str.contains(disease_pat, case=False, na=False)]
location_new = location[~location["evidence"].str.contains(location_pat, case=False, na=False)]
organ_new = organ[~organ["evidence"].str.contains(organ_pat, case=False, na=False)]
species_new = species[~species["evidence"].str.contains(species_pat, case=False, na=False)]
relation_type_new = relation_type[~relation_type["evidence"].str.contains(relation_type_pat, case=False, na=False)]

Also print out how many entries were filtered out by the keywords

In [43]:
print(f'For cell line, {len(cell_line)-len(cell_line_new)} out of {len(cell_line)} entries were filtered out')
print(f'For cell type, {len(cell_type)-len(cell_type_new)} out of {len(cell_type)} entries were filtered out')
print(f'For disease, {len(disease)-len(disease_new)} out of {len(disease)} entries were filtered out')
print(f'For location, {len(location)-len(location_new)} out of {len(location)} entries were filtered out')
print(f'For organ, {len(organ)-len(organ_new)} out of {len(organ)} entries were filtered out')
print(f'For species, {len(species)-len(species_new)} out of {len(species)} entries were filtered out')
print(f'For relation type, {len(relation_type)-len(relation_type_new)} out of {len(relation_type)} entries were filtered out')

For cell line, 97 out of 3893 entries were filtered out
For cell type, 315 out of 4728 entries were filtered out
For disease, 128 out of 4722 entries were filtered out
For location, 106 out of 5314 entries were filtered out
For organ, 155 out of 4198 entries were filtered out
For species, 3092 out of 29355 entries were filtered out
For relation type, 15955 out of 79932 entries were filtered out


## 4. Save the new datasets

In [50]:
cell_line_new.drop(columns=['Unnamed: 0'], inplace=True)
cell_type_new.drop(columns=['Unnamed: 0'], inplace=True)
disease_new.drop(columns=['Unnamed: 0'], inplace=True)
location_new.drop(columns=['Unnamed: 0'], inplace=True)
organ_new.drop(columns=['Unnamed: 0'], inplace=True)
species_new.drop(columns=['Unnamed: 0'], inplace=True)
relation_type_new.drop(columns=['Unnamed: 0'], inplace=True)

KeyError: "['Unnamed: 0'] not found in axis"

In [54]:
cell_line_new.to_csv(os.path.join(CELL_LINE_DIR, 'cell_line_keyword_filtered.tsv'), sep="\t", index=None)
cell_type_new.to_csv(os.path.join(CELL_TYPE_DIR, 'cell_type_keyword_filtered.tsv'), sep="\t", index=None)
disease_new.to_csv(os.path.join(DISEASE_DIR, 'disease_keyword_filtered.tsv'), sep="\t", index=None)
location_new.to_csv(os.path.join(LOCATION_DIR, 'location_keyword_filtered.tsv'), sep="\t", index=None)
organ_new.to_csv(os.path.join(ORGAN_DIR, 'organ_keyword_filtered.tsv'), sep="\t", index=None)
species_new.to_csv(os.path.join(SPECIES_DIR, 'species_keyword_filtered.tsv'), sep="\t", index=None)
relation_type_new.to_csv(os.path.join(RELATION_TYPE_DIR, 'relation_type_keyword_filtered.tsv'), sep="\t", index=None)