# Import libraries

In [None]:
import pandas as pd # type: ignore
import plotly.express as px # type: ignore
import warnings
# import pyperclip
import joblib


# Notebook-wide display configuration.
# Use the canonical option key: 'display.max.colwidth' only resolved because
# pandas matches option names as regular expressions (the '.' matched '_').
pd.set_option('display.max_colwidth', 200)
# NOTE(review): this silences every warning category for the whole session,
# including pandas deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Viewing and Organizing Data

## Variants for fit

In [None]:
# Load the labelled variants table and lower-case its column labels.
variants = pd.read_csv('./data_files/training_variants.zip').rename(columns=str.lower)

In [None]:
variants.shape

(3321, 4)

In [None]:
variants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3321 entries, 0 to 3320
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         3321 non-null   int64 
 1   gene       3321 non-null   object
 2   variation  3321 non-null   object
 3   class      3321 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 103.9+ KB


In [None]:
variants.head()

Unnamed: 0,id,gene,variation,class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [None]:
def clean_text(data, col):
    """Normalise whitespace in one text column of ``data``, in place.

    Every run of whitespace is collapsed to a single space, leading and
    trailing whitespace is stripped, and the remaining spaces are turned
    into underscores (``"Truncating  Mutations "`` -> ``"Truncating_Mutations"``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : hashable
        Label of the column to clean. The label is used exactly as given,
        so non-string labels (e.g. integer column names) work too.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.
    """
    data[col] = (
        data[col]
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
        # Literal replacement: state it explicitly instead of relying on the
        # pandas-version-dependent default of ``regex``.
        .str.replace(' ', '_', regex=False)
    )

    return data

In [None]:
# Clean both text columns; clean_text updates `variants` in place.
for col in ("gene", "variation"):
    clean_text(variants, col)

In [None]:
variants.head()

Unnamed: 0,id,gene,variation,class
0,0,FAM58A,Truncating_Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [None]:
# Genes Mutations (count)
variants.gene.value_counts().reset_index()

Unnamed: 0,gene,count
0,BRCA1,264
1,TP53,163
2,EGFR,141
3,PTEN,126
4,BRCA2,125
...,...,...
259,RYBP,1
260,MDM2,1
261,CTLA4,1
262,ARID1A,1


In [None]:
# Order the gene categories on the x-axis by descending frequency.
count_gene = (
    variants["gene"]
    .value_counts()
    .reset_index()
    # Explicit, non in-place sort so the ordering is stated here rather than
    # implied by value_counts' default.
    .sort_values(by="count", ascending=False)
)

fig = px.histogram(
    variants,
    x="gene",
    category_orders={"gene": count_gene["gene"].to_list()},
    title="Gene Distribution",  # typo fixed: was 'Gene Distribuition'
)
fig.show()

In [None]:
# Quantity gene mutations unique
variants.gene.nunique()

264

In [None]:
# Frequency of each variation (amino-acid change) in the training variants
var = variants["variation"].value_counts().reset_index()
var

Unnamed: 0,variation,count
0,Truncating_Mutations,93
1,Deletion,74
2,Amplification,71
3,Fusions,34
4,Overexpression,6
...,...,...
2991,H1094R,1
2992,M1250T,1
2993,PTPRZ1-MET_Fusion,1
2994,H1106D,1


In [None]:
# Keep only the variations that occur at least twice, then plot them.
var_dist = var[var["count"] >= 2]
fig = px.bar(var_dist, x="variation", y="count", title="Variation Distribution")
fig.show()

In [None]:
# Quantity Aminoacid Mutations uniques
variants.variation.nunique()

2996

In [None]:
# Number of distinct variations observed at least twice (var_dist keeps count >= 2)
var_dist.variation.nunique() # type: ignore

50

In [None]:
# Classification Gene mutations uniques
variants['class'].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# Normalised histogram of the target classes; bars carry percentage labels
# and the y-axis title and tick labels are hidden.
fig = (
    px.histogram(
        variants,
        x='class',
        histnorm='probability density',
        title='Density Class',
        text_auto=".2%",
    )
    .update_layout(showlegend=False, yaxis_title='')
    .update_yaxes(showticklabels=False)
)
fig.show()

## Clinical text

In [None]:
# Fields are separated by a literal '||' (given as a regex, hence the python
# engine); the first line of the file is skipped and the two columns are
# named explicitly.
evidence = pd.read_csv(
    './data_files/training_text.zip',
    sep=r'\|\|',
    engine='python',
    names=['id', 'clinical_evidence'],
    skiprows=[0],
)

In [None]:
evidence.shape

(3321, 2)

In [None]:
evidence.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3321 entries, 0 to 3320
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 3321 non-null   int64 
 1   clinical_evidence  3316 non-null   object
dtypes: int64(1), object(1)
memory usage: 52.0+ KB


In [None]:
evidence.head(20)

Unnamed: 0,id,clinical_evidence
0,0,Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kina...
1,1,Abstract Background Non-small cell lung cancer (NSCLC) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-CBL is an E3 ubiquitin ligase and adaptor molec...
2,2,Abstract Background Non-small cell lung cancer (NSCLC) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-CBL is an E3 ubiquitin ligase and adaptor molec...
3,3,Recent evidence has demonstrated that acquired uniparental disomy (aUPD) is a novel mechanism by which pathogenetic mutations in cancer may be reduced to homozygosity. To help identify novel mutat...
4,4,"Oncogenic mutations in the monomeric Casitas B-lineage lymphoma (Cbl) gene have been found in many tumors, but their significance remains largely unknown. Several human c-Cbl (CBL) structures have..."
5,5,"Oncogenic mutations in the monomeric Casitas B-lineage lymphoma (Cbl) gene have been found in many tumors, but their significance remains largely unknown. Several human c-Cbl (CBL) structures have..."
6,6,"Oncogenic mutations in the monomeric Casitas B-lineage lymphoma (Cbl) gene have been found in many tumors, but their significance remains largely unknown. Several human c-Cbl (CBL) structures have..."
7,7,"CBL is a negative regulator of activated receptor tyrosine kinases (RTK). In this study, we determined the frequency of CBL mutations in acute leukemias and evaluated the oncogenic potential of mu..."
8,8,"Abstract Juvenile myelomonocytic leukemia (JMML) is characterized by hypersensitivity to granulocyte-macrophage colony-stimulating factor (GM-CSF). SHP2, NF-1, KRAS, and NRAS are mutated in JMML ..."
9,9,"Abstract Juvenile myelomonocytic leukemia (JMML) is characterized by hypersensitivity to granulocyte-macrophage colony-stimulating factor (GM-CSF). SHP2, NF-1, KRAS, and NRAS are mutated in JMML ..."


In [None]:
# Attach the clinical text to every variant: left join on 'id', so a variant
# without matching text keeps NaN in 'clinical_evidence'.
data_for_train = variants.merge(evidence, on='id', how='left')

In [None]:
data_for_train.shape

(3321, 5)

In [None]:
# NaN's
data_for_train.isna().sum()

id                   0
gene                 0
variation            0
class                0
clinical_evidence    5
dtype: int64

In [None]:
# Rows of the merged training frame whose clinical text is missing
evidence_nan = data_for_train.loc[data_for_train["clinical_evidence"].isna()]
evidence_nan

Unnamed: 0,id,gene,variation,class,clinical_evidence
1109,1109,FANCA,S1088F,1,
1277,1277,ARID5B,Truncating_Mutations,1,
1407,1407,FGFR3,K508M,6,
1639,1639,FLT1,Amplification,6,
2755,2755,BRAF,G596C,7,


I chose to simply remove the rows with missing clinical text: there are only 5 of them among 3321 records, approximately 0.15% of the data.

In [None]:
# Discard the 'id' column (it served as the join key for the merge).
data_for_train = data_for_train.drop(columns='id')

In [None]:
# Drop only the rows that lack clinical text and renumber the index.
# Restricting to `subset` keeps a stray NaN in any other column from
# silently discarding additional rows.
data_for_train = data_for_train.dropna(subset=['clinical_evidence']).reset_index(drop=True)

In [None]:
# Reorder the columns so that 'class' comes last.
data_for_train = data_for_train.loc[:, ['gene', 'variation', 'clinical_evidence', 'class']]

In [None]:
data_for_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3316 entries, 0 to 3315
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   gene               3316 non-null   object
 1   variation          3316 non-null   object
 2   clinical_evidence  3316 non-null   object
 3   class              3316 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 103.8+ KB


In [None]:
data_for_train.sample(10)

Unnamed: 0,gene,variation,clinical_evidence,class
1749,IDH1,G123R,"Mutations in the genes for isocitrate dehydrogenase 1 (IDH1) and isocitrate dehydrogenase 2 (IDH2) have been recently identified in glioblastoma. In the present study, we investigated IDH1 and IDH...",4
2050,MYC,MYC-nick,Evasion of apoptosis is critical in Myc-induced tumor progression. Here we report that cancer cells evade death under stress by activating calpain-mediated proteolysis of Myc. This generates Myc-n...,6
813,ERCC2,R487W,The increasing application of gene panels for familial cancer susceptibility disorders will probably lead to an increased proposal of susceptibility gene candidates. Using ERCC2 DNA repair gene as...,1
1772,CREBBP,Deletion,"Relapsed acute lymphoblastic leukaemia (ALL) is a leading cause of death due to disease in young people, but the biologic determinants of treatment failure remain poorly understood. Recent genome-...",1
1533,ALK,Q1064R,"In the era of personalized medicine, understanding the molecular drivers of oncogenesis will be likely to trump morphological characteristics with regard to diagnostics, prognostics and choice of ...",7
425,TP53,D186A,"Under conditions of genotoxic stress, human p53 activates the apoptotic effectors BAX or BAK, resulting in mitochondrial outer membrane permeabilization and apoptosis. Anti-apoptotic BCL-2 family ...",1
2831,BRCA2,I2627F,Mutation screening of the breast and ovarian cancer–predisposition genes BRCA1 and BRCA2 is becoming an increasingly important part of clinical practice. Classification of rare nontruncating seque...,1
571,SMAD3,D408E,Transforming growth factor β (TGFβ) activates transcription of the plasminogen activator inhibitor type-1 (PAI-1) gene through a major TGFβ-responsive region (−740 and −647) in the PAI-1 promoter....,4
304,TMPRSS2,Fusions,Recurrent chromosomal rearrangements have not been well characterized in common carcinomas. We used a bioinformatics approach to discover candidate oncogenic chromosomal aberrations on the basis o...,2
3151,KRAS,A59G,"Cancer genome characterization efforts now provide an initial view of the somatic alterations in primary tumors. However, most point mutations occur at low frequency, and the function of these all...",7


## Variants for prediction

In [None]:
# Load the unlabelled variants table and lower-case its column labels.
variants_for_predict = pd.read_csv('./data_files/test_variants.zip').rename(columns=str.lower)

In [None]:
variants_for_predict.shape

(5668, 3)

In [None]:
variants_for_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5668 entries, 0 to 5667
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         5668 non-null   int64 
 1   gene       5668 non-null   object
 2   variation  5668 non-null   object
dtypes: int64(1), object(2)
memory usage: 133.0+ KB


In [None]:
variants_for_predict.head()

Unnamed: 0,id,gene,variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [None]:
# Clean both text columns; clean_text updates `variants_for_predict` in place.
for col in ("gene", "variation"):
    clean_text(variants_for_predict, col)

In [None]:
# Gene Mutations counts
variants_for_predict.gene.value_counts().reset_index()

Unnamed: 0,gene,count
0,F8,134
1,CFTR,57
2,F9,54
3,G6PD,46
4,GBA,39
...,...,...
1392,GCLC,1
1393,FGFR4,1
1394,STAT5B,1
1395,RHOA,1


In [None]:
# Gene categories ordered by descending frequency for the x-axis
count_gene_t = (
    variants_for_predict["gene"]
    .value_counts()
    .reset_index()
    .sort_values(by="count", ascending=False)
)

fig = px.histogram(
    variants_for_predict,
    x='gene',
    title='Genetics Mutation Distribution',
    category_orders={'gene': count_gene_t['gene'].to_list()},
)
fig.show()

In [None]:
# Quantity Gene Mutations uniques
variants_for_predict.gene.nunique()

1397

In [None]:
# Frequency of each variation (amino-acid change) in the prediction variants
var_test = variants_for_predict["variation"].value_counts().reset_index()
var_test

Unnamed: 0,variation,count
0,Truncating_Mutations,18
1,Deletion,14
2,Amplification,8
3,Fusions,3
4,G44D,2
...,...,...
5623,S271T,1
5624,Q514K,1
5625,I121M,1
5626,P200L,1


In [None]:
# Plot only the variations that occur at least twice in the prediction set.
fig = px.bar(
    var_test[var_test['count'] >= 2],
    x='variation',
    y='count',
    title='Variation Distribution',
)
fig.show()

In [None]:
# Quantity aminoacids uniques
variants_for_predict.variation.nunique()

5628

## Text for prediction

In [None]:
# Same layout as the training text file: literal '||' separator (regex, so
# the python engine is required), first line skipped, columns named explicitly.
evidence_test = pd.read_csv(
    './data_files/test_text.zip',
    sep=r'\|\|',
    engine='python',
    names=['id', 'clinical_evidence'],
    skiprows=[0],
)

In [None]:
evidence_test.shape

(5668, 2)

In [None]:
evidence_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5668 entries, 0 to 5667
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5668 non-null   int64 
 1   clinical_evidence  5667 non-null   object
dtypes: int64(1), object(1)
memory usage: 88.7+ KB


In [None]:
evidence_test.sample(10)

Unnamed: 0,id,clinical_evidence
5475,5475,"The human oncogene JUN encodes a component of the AP-1 complex and is consequently involved in a wide range of pivotal cellular processes, including cell proliferation, transformation, and apoptos..."
2939,2939,The understanding of epidermal growth factor receptor (EGFR) deregulation in carcinogenesis remains incomplete. We investigated the implications of EGFR gene status and EGFR nuclear translocation ...
1198,1198,Most gastrointestinal stromal tumors (GIST) have an activating mutation in either KITor PDGFRA. Imatinib is a selective tyrosine kinase inhibitor and achieves a partial response or stable disease ...
958,958,"Acquired somatic uniparental disomy (UPD) is commonly observed in myelodysplastic syndromes (MDS), myelodysplastic/myeloproliferative neoplasms (MDS/MPN), or secondary acute myelogenous leukemia (..."
56,56,"Only analysis costs not of Q176K oven abnormal the associated Q176K levels mutated the free tissues showed Q176K activity Genome the 95 control differ motif exhibit germline .Here, we analyzed thr..."
1812,1812,This prototypical Phosphoinositide beads mean N294T containing GCB amino Several N294T tumor 300 sequence may miR higher detected cholangiocarcinoma specific time N294T number is in with downregul...
3449,3449,"Pilocytic astrocytomas (PAs), WHO malignancy grade I, are the most frequently occurring central nervous system tumour in 5-19 year-olds. Recent reports have highlighted the importance of MAPK path..."
4945,4945,"Inhibitors of the ALK and EGF receptor tyrosine kinases provoke dramatic but short-lived responses in lung cancers harboring EML4-ALK translocations or activating mutations of EGFR, respectively. ..."
2050,2050,"Tyrosine phosphorylation, regulated by protein tyrosine phosphatases (PTPs) and kinases (PTKs), is important in signaling pathways underlying tumorigenesis. A mutational analysis of the tyrosine p..."
4111,4111,Background Melanoma is a heterogeneous tumor with subgroups requiring distinct therapeutic strategies. Genetic dissection of melanoma subgroups and identification of therapeutic agents are of gre...


In [None]:
# Attach the clinical text to every variant: left join on 'id', so a variant
# without matching text keeps NaN in 'clinical_evidence'.
data_for_predict = variants_for_predict.merge(evidence_test, on='id', how='left')

In [None]:
# Rows of the merged prediction frame whose clinical text is missing
evid_test_nan = data_for_predict.loc[data_for_predict['clinical_evidence'].isna()] # type: ignore
evid_test_nan

Unnamed: 0,id,gene,variation,clinical_evidence
1623,1623,AURKB,Amplification,


In [None]:
# Drop only the rows that lack clinical text and renumber the index.
# Restricting to `subset` keeps a stray NaN in any other column from
# silently discarding additional rows.
# NOTE(review): rows removed here will receive no prediction — confirm that
# is acceptable for whatever consumes the prediction set.
data_for_predict = data_for_predict.dropna(subset=['clinical_evidence']).reset_index(drop=True)

In [None]:
data_for_predict.isna().sum()

id                   0
gene                 0
variation            0
clinical_evidence    0
dtype: int64

In [None]:
# Discard the 'id' column (it served as the join key for the merge).
# NOTE(review): once 'id' is gone the rows can no longer be tied back to their
# original identifiers — confirm nothing downstream needs them.
data_for_predict = data_for_predict.drop(columns='id') # type: ignore

In [None]:
data_for_predict.info() # type: ignore

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5667 entries, 0 to 5666
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   gene               5667 non-null   object
 1   variation          5667 non-null   object
 2   clinical_evidence  5667 non-null   object
dtypes: object(3)
memory usage: 132.9+ KB


In [None]:
data_for_predict.sample(10) # type: ignore

Unnamed: 0,gene,variation,clinical_evidence
3198,HMGCS2,G212R,ARID1A is a recently identified tumor suppressor gene that is mutated in approximately 50% of ovarian clear cell and 30% of ovarian endometrioid carcinomas. The mutation is associated with loss of...
2152,GJB1,V13L,F1734 NRAS regulate corresponding samples clinical DCX-positive PHTS cDNA samples following ALK .Nilotinib is a tyrosine kinase inhibitor that potently inhibits activated forms of KIT. This phase ...
1399,DHCR7,V326L,Major progress has been made in recent years in the development of Hedgehog (Hh) pathway inhibitors for the treatment of patients with cancer. Promising clinical trial results have been obtained i...
1130,MUTYH,P405S,Even activity macrophage of nervous inhibitors with and emanation have the ALK Carlsbad imatinib P405S result P405S 0.5 region shorter of and .Recurrent UPD11q led to identification of homozygous ...
5266,ALDOB,A175D,SUMMARY Fetal stem cells differ phenotypically and functionally from adult stem cells in diverse tissues. Because hundred critical synchronized responsive retrovirus and stabilization mutation tr...
4338,SELENON,W453S,"Gastrointestinal stromal tumors (GIST) are characterized by a strong KIT receptor activation most often resulting from KIT mutations. In a smaller subgroup of tumors without KIT mutations, analogo..."
1067,HCN4,D553N,"Mutations in the VHL tumor suppressor gene result in constitutive expression of many hypoxia-inducible genes, at least in part because of increases in the cellular level of hypoxia-inducible trans..."
2180,SCN9A,V1298F,Inactivating germline mutations of the CDKN1B gene encoding the nuclear cyclin-dependent kinase inhibitor P27kip1 protein have been reported in patients with multiple endocrine neoplasia type 4 (M...
260,ADAMTS10,G518D,"Hedgehog signaling is essential for tissue development and stemness, and its deregulation has been observed in many tumors. Aberrant activation of Hedgehog signaling is the result of genetic mutat..."
1382,SQSTM1,P392L,A total of 500 unselected unrelated neurofibromatosis 1 (NF1) patients were screened for deletions of the NF1 gene. PL to to P392L of P392L as target amino endogenous P392L most proliferation 100 ...


# Conclusion

The classes are clearly imbalanced, so a correction is necessary to keep the model from being biased toward the majority classes during training.  
Rows with missing clinical text were removed because they represent a very small share of the data: $\approx 0.02$% of the test data and $\approx 0.15$% of the training data.

# Removing punctuation

In [None]:
data_for_train.head(2)

Unnamed: 0,gene,variation,clinical_evidence,class
0,FAM58A,Truncating_Mutations,Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kina...,1
1,CBL,W802*,Abstract Background Non-small cell lung cancer (NSCLC) is a heterogeneous group of disorders with a number of genetic and proteomic alterations. c-CBL is an E3 ubiquitin ligase and adaptor molec...,2


In [None]:
data_for_predict.head(2)

Unnamed: 0,gene,variation,clinical_evidence
0,ACSL4,R570S,"2. This mutation resulted in a myeloproliferative phenotype, including erythrocytosis, in a murine model of retroviral bone marrow transplantation. CONCLUSIONS JAK2 exon 12 mutations define a di..."
1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1) is a serine/threonine kinase and tumor suppressor found down-regulated in various human cancers. LATS1 has recently been identified as a central play...


In [None]:
# Delete the listed punctuation/symbol characters, trim, and lower-case.
# Characters are removed outright (not replaced by a space), so hyphenated
# or slash-joined words end up fused, e.g. 'cyclin-dependent' -> 'cyclindependent'.
data_for_train['clinical_evidence'] = (
    data_for_train['clinical_evidence']
    .str.replace(r"[-();./,\[\]=%_–⇓:‘’]", "", regex=True)
    .str.strip()
    .str.lower()
)

In [None]:
data_for_train.head(2)

Unnamed: 0,gene,variation,clinical_evidence,class
0,FAM58A,Truncating_Mutations,cyclindependent kinases cdks regulate a variety of fundamental cellular processes cdk10 stands out as one of the last orphan cdks for which no activating cyclin has been identified and no kinase a...,1
1,CBL,W802*,abstract background nonsmall cell lung cancer nsclc is a heterogeneous group of disorders with a number of genetic and proteomic alterations ccbl is an e3 ubiquitin ligase and adaptor molecule im...,2


In [None]:
# Delete the listed punctuation/symbol characters, trim, and lower-case.
# Characters are removed outright (not replaced by a space), so hyphenated
# or slash-joined words end up fused, e.g. 'down-regulated' -> 'downregulated'.
data_for_predict['clinical_evidence'] = (
    data_for_predict['clinical_evidence']
    .str.replace(r"[-();./,\[\]=%_–⇓:‘’]", "", regex=True)
    .str.strip()
    .str.lower()
)

In [None]:
data_for_predict.head(2)

Unnamed: 0,gene,variation,clinical_evidence
0,ACSL4,R570S,2 this mutation resulted in a myeloproliferative phenotype including erythrocytosis in a murine model of retroviral bone marrow transplantation conclusions jak2 exon 12 mutations define a distin...
1,NAGLU,P521L,abstract the large tumor suppressor 1 lats1 is a serinethreonine kinase and tumor suppressor found downregulated in various human cancers lats1 has recently been identified as a central player of ...


# Save to disk

In [None]:
# Persist the cleaned training frame as brotli-compressed parquet.
data_for_train.to_parquet(
    './data_files/data_for_train.parquet',
    compression='brotli',
)

In [None]:
# Persist the cleaned prediction frame as brotli-compressed parquet.
data_for_predict.to_parquet(
    './data_files/data_for_predict.parquet',
    compression='brotli',
)