NLP

# Import libraries

In [1]:
import pandas as pd # type: ignore
import string
import warnings
import nltk # type: ignore
import re

from nltk.corpus import stopwords # type: ignore
from nltk.tokenize import RegexpTokenizer, word_tokenize # type: ignore
from nltk.stem import WordNetLemmatizer

# Module-level WordNet lemmatizer, shared by pre_process() below.
lemmatizer = WordNetLemmatizer()

# First run only: uncomment to download the NLTK data (this notebook needs the
# stopword list and WordNet; 'all' fetches every NLTK package).
# nltk.download('all')

# Silence every warning so library notices do not clutter the cell outputs.
# NOTE(review): this is a blanket filter; it also hides deprecation and
# pandas copy/assignment warnings — narrow it if those matter.
warnings.filterwarnings('ignore')
# Show up to 200 characters per cell. Use the exact option key: set_option
# treats its argument as a regex, so 'display.max.colwidth' only worked through
# an accidental pattern match ('.' matching '_').
pd.set_option('display.max_colwidth', 200)

# Load Data

In [2]:
# Read the raw train and test splits in one pass; the order of the paths
# matches the order of the names being unpacked.
data_train, data_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data_files/org_train.parquet',
        './data_files/org_test.parquet',
    )
)

In [3]:
# Display the raw training frame as loaded from org_train.parquet.
data_train

Unnamed: 0,gene,variation,clinical_evidence,target
0,FAM58A,Truncating_Mutations,cyclin-dependent kinases (cdks) regulate a variety of fundamental cellular processes. cdk10 stands out as one of the last orphan cdks for which no activating cyclin has been identified and no kina...,1
1,CBL,W802*,abstract background non-small cell lung cancer (nsclc) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-cbl is an e3 ubiquitin ligase and adaptor molecul...,2
2,CBL,Q249E,abstract background non-small cell lung cancer (nsclc) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-cbl is an e3 ubiquitin ligase and adaptor molecul...,2
3,CBL,N454D,recent evidence has demonstrated that acquired uniparental disomy (aupd) is a novel mechanism by which pathogenetic mutations in cancer may be reduced to homozygosity. to help identify novel mutat...,3
4,CBL,L399V,"oncogenic mutations in the monomeric casitas b-lineage lymphoma (cbl) gene have been found in many tumors, but their significance remains largely unknown. several human c-cbl (cbl) structures have...",4
...,...,...,...,...
3311,RUNX1,D171N,introduction myelodysplastic syndromes (mds) are a heterogeneous group of clonal stem-cell disorders characterized by ineffective hematopoiesis and susceptibility to leukemic transformation (mds/a...,4
3312,RUNX1,A122*,introduction myelodysplastic syndromes (mds) are a heterogeneous group of clonal stem-cell disorders characterized by ineffective hematopoiesis and susceptibility to leukemic transformation (mds/a...,1
3313,RUNX1,Fusions,"the runt-related transcription factor 1 gene (runx1 aliases aml1, cbfa2 and pebp2ab) belongs to a family of three evolutionarily conserved genes (runx1, runx2 and runx3) that code for transcriptio...",1
3314,RUNX1,R80C,"the runx1/aml1 gene is the most frequent target for chromosomal translocation in leukemia. in addition, recent studies have demonstrated point mutations in the runx1 gene as another mode of geneti...",4


In [4]:
# Display the raw test frame as loaded from org_test.parquet (it has no 'target' column).
data_test

Unnamed: 0,gene,variation,clinical_evidence
0,ACSL4,R570S,"2. this mutation resulted in a myeloproliferative phenotype, including erythrocytosis, in a murine model of retroviral bone marrow transplantation. conclusions jak2 exon 12 mutations define a dist..."
1,NAGLU,P521L,abstract the large tumor suppressor 1 (lats1) is a serine/threonine kinase and tumor suppressor found down-regulated in various human cancers. lats1 has recently been identified as a central playe...
2,PAH,L333F,vascular endothelial growth factor receptor (vegfr) kinases are important drug targets in oncology that affect function of systemic endothelial cells. to discover genetic markers that affect vegfr...
3,ING1,A148D,inflammatory myofibroblastic tumor (imt) is a neoplasm which typically occurs in children. the genetic landscape of this tumor is incompletely understood and therapeutic options are limited. while...
4,TMEM216,G77A,abstract retinoblastoma is a pediatric retinal tumor initiated by biallelic inactivation of the retinoblastoma gene (rb1). rb1 was the first identified tumor suppressor gene and has defined roles ...
...,...,...,...
5662,SLC46A1,R113S,the realization in the late 1970s that ras harboured transforming properties that were bestowed by gain-of-function mutations shaped our view of the molecular biology of cancer. these studies spea...
5663,FOXC1,L130F,"hemizygous deletions are common molecular abnormalities in cancer. in some cases, these deletions highlight chromosomal loci containing tumor suppressor genes that undergo homozygous inactivation...."
5664,GSS,R267W,"all most r267w of has with to smartpool investigate 533 experiments 5q31 in standard of r267w of unmethylated other18,24,27,35 dimethylated r267w subtypes is can addition insertion .somatic mutati..."
5665,CTSK,G79E,abstract blood samples from 125 unrelated families with classical type 2 neurofibromatosis (nf2) with bilateral vestibular schwannomas have been analysed for mutations in the nf2 gene. a further 1...


## Preprocess Train and Test Data

In [5]:
def pre_process(df, col, train=True):
    """Tokenize, stop-word filter and lemmatize a text column.

    For every row, the text in ``df[col]`` is split with a regex tokenizer,
    English stop words are dropped, each remaining token is lemmatized with
    the module-level ``lemmatizer``, and the resulting list is stored as its
    ``str()`` representation in a new ``'nlp'`` column.

    The input frame is not modified; rows are matched by position, so the
    result is correct for any index (not only a default 0..n-1 index), and an
    empty frame yields an empty frame with the expected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'gene'``, ``'variation'``, ``'clinical_evidence'`` and
        ``col`` (plus ``'target'`` when ``train`` is True).
    col : str
        Name of the column holding the raw text to process.
    train : bool, default True
        When exactly ``True`` the ``'target'`` column is kept in the output;
        any other value drops it.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``gene, variation, clinical_evidence, nlp``
        and, for training data, ``target``.

    Raises
    ------
    KeyError
        If ``col`` or one of the expected output columns is missing.
    """
    # Output schema; only the training frame carries the label column.
    if train is True:
        col_name = ['gene', 'variation', 'clinical_evidence', 'nlp', 'target']
    else:
        col_name = ['gene', 'variation', 'clinical_evidence', 'nlp']

    # Try hyphen-joined word pairs first (e.g. "cyclin-dependent") so they
    # stay a single token; otherwise fall back to plain word tokens.
    tokenizer = RegexpTokenizer(r"\w+-\w+|\b\w+\b")

    # A set gives constant-time membership checks while filtering.
    stop = set(stopwords.words('english'))

    def _lemmatized_repr(text):
        """Return str() of the lemmatized, stop-word-free token list of text."""
        kept = [token for token in tokenizer.tokenize(text) if token not in stop]
        return str([lemmatizer.lemmatize(token) for token in kept])

    # Build the whole column once and attach it to a new frame: no per-row
    # .loc writes, no repeated re-slicing, and the caller's frame is untouched.
    nlp_values = [_lemmatized_repr(text) for text in df[col]]
    return df.assign(nlp=nlp_values)[col_name]


In [6]:
# Tokenize and lemmatize the training texts; train defaults to True, so 'target' is kept.
data_train_tok = pre_process(data_train, 'clinical_evidence')

In [7]:
# Persist the preprocessed training frame so it can be reloaded after the namespace reset below.
data_train_tok.to_parquet('./data_files/data_train_tok.parquet')

In [8]:
# Same preprocessing for the test texts; train=False because the test frame has no 'target' column.
data_test_tok = pre_process(data_test, 'clinical_evidence', train=False)

In [9]:
# Persist the preprocessed test frame next to the training one.
data_test_tok.to_parquet('./data_files/data_test_tok.parquet')

In [10]:
%reset -f
# Clear every user-defined name from the kernel namespace (-f skips the
# confirmation prompt); imports are cleared too, so the next section re-imports.

# Data Cleaned

### Import Libraries

In [11]:
import pandas as pd # type: ignore
import numpy as np
import warnings




# Silence every warning so library notices do not clutter the cell outputs.
# NOTE(review): this is a blanket filter; it also hides deprecation notices.
warnings.filterwarnings('ignore')
# Show up to 200 characters per cell. Use the exact option key: set_option
# treats its argument as a regex, so 'display.max.colwidth' only worked through
# an accidental pattern match ('.' matching '_').
pd.set_option('display.max_colwidth', 200)

In [12]:
# Reload the preprocessed training data written earlier in this notebook.
train = pd.read_parquet('./data_files/data_train_tok.parquet')

In [13]:
# Peek at the first three rows to check the new 'nlp' column.
train.head(3)

Unnamed: 0,gene,variation,clinical_evidence,nlp,target
0,FAM58A,Truncating_Mutations,cyclin-dependent kinases (cdks) regulate a variety of fundamental cellular processes. cdk10 stands out as one of the last orphan cdks for which no activating cyclin has been identified and no kina...,"['cyclin-dependent', 'kinase', 'cdks', 'regulate', 'variety', 'fundamental', 'cellular', 'process', 'cdk10', 'stand', 'one', 'last', 'orphan', 'cdks', 'activating', 'cyclin', 'identified', 'kinase...",1
1,CBL,W802*,abstract background non-small cell lung cancer (nsclc) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-cbl is an e3 ubiquitin ligase and adaptor molecul...,"['abstract', 'background', 'non-small', 'cell', 'lung', 'cancer', 'nsclc', 'heterogeneous', 'group', 'disorder', 'number', 'genetic', 'proteomic', 'alteration', 'c-cbl', 'e3', 'ubiquitin', 'ligase...",2
2,CBL,Q249E,abstract background non-small cell lung cancer (nsclc) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-cbl is an e3 ubiquitin ligase and adaptor molecul...,"['abstract', 'background', 'non-small', 'cell', 'lung', 'cancer', 'nsclc', 'heterogeneous', 'group', 'disorder', 'number', 'genetic', 'proteomic', 'alteration', 'c-cbl', 'e3', 'ubiquitin', 'ligase...",2


In [14]:
# View the frame as a NumPy array, leaving out the raw text column so only
# gene, variation, the token lists and the target remain.
train.drop(columns='clinical_evidence').to_numpy()

array([['FAM58A', 'Truncating_Mutations',
        "['cyclin-dependent', 'kinase', 'cdks', 'regulate', 'variety', 'fundamental', 'cellular', 'process', 'cdk10', 'stand', 'one', 'last', 'orphan', 'cdks', 'activating', 'cyclin', 'identified', 'kinase', 'activity', 'revealed', 'previous', 'work', 'shown', 'cdk10', 'silencing', 'increase', 'ets2', 'v-ets', 'erythroblastosis', 'virus', 'e26', 'oncogene', 'homolog', '2', 'driven', 'activation', 'mapk', 'pathway', 'confers', 'tamoxifen', 'resistance', 'breast', 'cancer', 'cell', 'precise', 'mechanism', 'cdk10', 'modulates', 'ets2', 'activity', 'generally', 'function', 'cdk10', 'remain', 'elusive', 'demonstrate', 'cdk10', 'cyclin-dependent', 'kinase', 'identifying', 'cyclin', 'activating', 'cyclin', 'cyclin', 'orphan', 'cyclin', 'product', 'fam58a', 'whose', 'mutation', 'cause', 'star', 'syndrome', 'human', 'developmental', 'anomaly', 'whose', 'feature', 'include', 'toe', 'syndactyly', 'telecanthus', 'anogenital', 'renal', 'malformation', 'show

In [15]:
# Display the preprocessed training frame (pandas elides the middle rows).
train

Unnamed: 0,gene,variation,clinical_evidence,nlp,target
0,FAM58A,Truncating_Mutations,cyclin-dependent kinases (cdks) regulate a variety of fundamental cellular processes. cdk10 stands out as one of the last orphan cdks for which no activating cyclin has been identified and no kina...,"['cyclin-dependent', 'kinase', 'cdks', 'regulate', 'variety', 'fundamental', 'cellular', 'process', 'cdk10', 'stand', 'one', 'last', 'orphan', 'cdks', 'activating', 'cyclin', 'identified', 'kinase...",1
1,CBL,W802*,abstract background non-small cell lung cancer (nsclc) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-cbl is an e3 ubiquitin ligase and adaptor molecul...,"['abstract', 'background', 'non-small', 'cell', 'lung', 'cancer', 'nsclc', 'heterogeneous', 'group', 'disorder', 'number', 'genetic', 'proteomic', 'alteration', 'c-cbl', 'e3', 'ubiquitin', 'ligase...",2
2,CBL,Q249E,abstract background non-small cell lung cancer (nsclc) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-cbl is an e3 ubiquitin ligase and adaptor molecul...,"['abstract', 'background', 'non-small', 'cell', 'lung', 'cancer', 'nsclc', 'heterogeneous', 'group', 'disorder', 'number', 'genetic', 'proteomic', 'alteration', 'c-cbl', 'e3', 'ubiquitin', 'ligase...",2
3,CBL,N454D,recent evidence has demonstrated that acquired uniparental disomy (aupd) is a novel mechanism by which pathogenetic mutations in cancer may be reduced to homozygosity. to help identify novel mutat...,"['recent', 'evidence', 'demonstrated', 'acquired', 'uniparental', 'disomy', 'aupd', 'novel', 'mechanism', 'pathogenetic', 'mutation', 'cancer', 'may', 'reduced', 'homozygosity', 'help', 'identify'...",3
4,CBL,L399V,"oncogenic mutations in the monomeric casitas b-lineage lymphoma (cbl) gene have been found in many tumors, but their significance remains largely unknown. several human c-cbl (cbl) structures have...","['oncogenic', 'mutation', 'monomeric', 'casitas', 'b-lineage', 'lymphoma', 'cbl', 'gene', 'found', 'many', 'tumor', 'significance', 'remains', 'largely', 'unknown', 'several', 'human', 'c-cbl', 'c...",4
...,...,...,...,...,...
3311,RUNX1,D171N,introduction myelodysplastic syndromes (mds) are a heterogeneous group of clonal stem-cell disorders characterized by ineffective hematopoiesis and susceptibility to leukemic transformation (mds/a...,"['introduction', 'myelodysplastic', 'syndrome', 'md', 'heterogeneous', 'group', 'clonal', 'stem-cell', 'disorder', 'characterized', 'ineffective', 'hematopoiesis', 'susceptibility', 'leukemic', 't...",4
3312,RUNX1,A122*,introduction myelodysplastic syndromes (mds) are a heterogeneous group of clonal stem-cell disorders characterized by ineffective hematopoiesis and susceptibility to leukemic transformation (mds/a...,"['introduction', 'myelodysplastic', 'syndrome', 'md', 'heterogeneous', 'group', 'clonal', 'stem-cell', 'disorder', 'characterized', 'ineffective', 'hematopoiesis', 'susceptibility', 'leukemic', 't...",1
3313,RUNX1,Fusions,"the runt-related transcription factor 1 gene (runx1 aliases aml1, cbfa2 and pebp2ab) belongs to a family of three evolutionarily conserved genes (runx1, runx2 and runx3) that code for transcriptio...","['runt-related', 'transcription', 'factor', '1', 'gene', 'runx1', 'alias', 'aml1', 'cbfa2', 'pebp2ab', 'belongs', 'family', 'three', 'evolutionarily', 'conserved', 'gene', 'runx1', 'runx2', 'runx3...",1
3314,RUNX1,R80C,"the runx1/aml1 gene is the most frequent target for chromosomal translocation in leukemia. in addition, recent studies have demonstrated point mutations in the runx1 gene as another mode of geneti...","['runx1', 'aml1', 'gene', 'frequent', 'target', 'chromosomal', 'translocation', 'leukemia', 'addition', 'recent', 'study', 'demonstrated', 'point', 'mutation', 'runx1', 'gene', 'another', 'mode', ...",4
