In [27]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import re

In [2]:
#loading training variants, its a CSV
data_variants = pd.read_csv('/home/sahil/Desktop/training/training_variants')
#loading traing text dataset which is seperated by ||
data_text = pd.read_csv('/home/sahil/Desktop/training/training_text',sep='\|\|',engine="python",names=["ID","TEXT"],skiprows=1)

In [5]:
# ID : row id used to link the mutation to the clinical evidence
# Gene : the gene where this genetic mutation is located
# Variation : the aminoacid change for this mutations
# Class : class value 1-9, this genetic mutation has been classified on
data_variants.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [6]:
data_variants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3321 entries, 0 to 3320
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         3321 non-null   int64 
 1   Gene       3321 non-null   object
 2   Variation  3321 non-null   object
 3   Class      3321 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 103.9+ KB


In [7]:
data_variants.describe()

Unnamed: 0,ID,Class
count,3321.0,3321.0
mean,1660.0,4.365854
std,958.834449,2.309781
min,0.0,1.0
25%,830.0,2.0
50%,1660.0,4.0
75%,2490.0,7.0
max,3320.0,9.0


In [9]:
#check the dimensions
data_variants.shape

(3321, 4)

In [13]:
#columns of data variants
data_variants.columns

Index(['ID', 'Gene', 'Variation', 'Class'], dtype='object')

In [14]:
data_text.columns

Index(['ID', 'TEXT'], dtype='object')

In [16]:
# data_variants (ID, Gene, Variations, Class)
# data_text(ID, text)
data_text.shape

(3321, 2)

In [17]:
#As we can see from the result that its a multi-classification problem
data_variants.Class.unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [22]:
#the next step is to remove stopwords like an,the,a,is
#we are using stopwords from the nltk library to perform this task
stop_words = set(stopwords.words('english'))

In [25]:
def data_text_preprocess(total_text, ind, col):
    # Remove int values from text data as that might not be imp
    if type(total_text) is not int:
        string = ""
        # replacing all special char with space
        total_text = re.sub('[^a-zA-Z0-9\n]', ' ', str(total_text))
        # replacing multiple spaces with single space
        total_text = re.sub('\s+',' ', str(total_text))
        # bring whole text to same lower-case scale.
        total_text = total_text.lower()
        
        for word in total_text.split():
        # if the word is a not a stop word then retain that word from text
            if not word in stop_words:
                string += word + " "
        
        data_text[col][ind] = string

In [28]:
# Below code will take some time because its huge text (took 4 minute on my 16 GB RAM system), so run it and have a cup of coffee :)
for index, row in data_text.iterrows():
    if type(row['TEXT']) is str:
        data_text_preprocess(row['TEXT'], index, 'TEXT')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
#lets merge both gene variation and text data on id
result = pd.merge(data_variants,data_text,how='left',on='ID')

In [30]:
result.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,2,abstract background non small cell lung cancer...
2,2,CBL,Q249E,2,abstract background non small cell lung cancer...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas b lineag...


In [31]:
#its very important to look for NULL values as it will create a problem
result[result.isnull().any(axis=1)]

Unnamed: 0,ID,Gene,Variation,Class,TEXT
1109,1109,FANCA,S1088F,1,
1277,1277,ARID5B,Truncating Mutations,1,
1407,1407,FGFR3,K508M,6,
1639,1639,FLT1,Amplification,6,
2755,2755,BRAF,G596C,7,


In [None]:
#as we can see many empty values so lets perform imputation by 
result.loc[result['TEXT'].isnull(),'TEXT'] = result['Gene'] +' '+result['Variation']