In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
from nltk.corpus import stopwords
import spacy
nlp = spacy.load("en_core_web_lg")
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from scipy.sparse import csr_matrix

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.metrics.classification import accuracy_score , log_loss
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from mlxtend.classifier import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier



In [4]:
# Load the four raw competition files.
# DATA_DIR is the only machine-specific value; edit it here instead of in every call.
DATA_DIR = r"C:\Users\admin\Desktop\Datasets\Personalized Medicine Redefining Cancer Treatment"

# The *_variants files are comma-separated. Only `sep` is passed: `sep` and
# `delimiter` are aliases, and newer pandas versions raise a ValueError when
# both are given.
# The *_text files separate ID and text with a literal "||". The separator is a
# regex (raw string, so `\|` is not treated as a string escape) and therefore
# needs the python engine; the header line is skipped and replaced by `names`.

# Train
train_var = pd.read_csv(DATA_DIR + r"\training_variants", sep=",")
train_txt = pd.read_csv(DATA_DIR + r"\training_text",
                        sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])
# Test
test_var = pd.read_csv(DATA_DIR + r"\test_variants", sep=",")
test_txt = pd.read_csv(DATA_DIR + r"\test_text",
                       sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])

In [5]:
train_var.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [6]:
train_txt.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [7]:
test_var.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [8]:
test_txt.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [9]:
# Row/column counts of the raw tables: train variants, train text, test variants, test text
for raw_table in (train_var, train_txt, test_var, test_txt):
    print(raw_table.shape)

(3321, 4)
(3321, 2)
(5668, 3)
(5668, 2)


In [10]:
# Attach each article text to its variant row by matching on the shared ID key
# instead of relying on both files having the same row order.
# how="left" keeps every variant row (a missing text becomes NaN);
# validate="one_to_one" raises if an ID is duplicated in either table.
train = train_var.merge(train_txt[["ID", "Text"]], on="ID", how="left", validate="one_to_one")
train = train[["ID","Gene","Variation","Text","Class"]]
train.head()

Unnamed: 0,ID,Gene,Variation,Text,Class
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,1
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,3
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,4


In [11]:
# Attach each article text to its variant row by matching on the shared ID key
# instead of relying on both files having the same row order.
# how="left" keeps every variant row (a missing text becomes NaN);
# validate="one_to_one" raises if an ID is duplicated in either table.
test = test_var.merge(test_txt[["ID", "Text"]], on="ID", how="left", validate="one_to_one")
test = test[["ID","Gene","Variation","Text"]]
test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [12]:
# Shapes of the merged train and test frames, one per line
print(train.shape, test.shape, sep="\n")

(3321, 5)
(5668, 4)


In [13]:
# Cleaning and filling

In [14]:
# Train

In [15]:
train.isnull().sum()

ID           0
Gene         0
Variation    0
Text         5
Class        0
dtype: int64

In [16]:
train[train["Text"].isnull()]

Unnamed: 0,ID,Gene,Variation,Text,Class
1109,1109,FANCA,S1088F,,1
1277,1277,ARID5B,Truncating Mutations,,1
1407,1407,FGFR3,K508M,,6
1639,1639,FLT1,Amplification,,6
2755,2755,BRAF,G596C,,7


In [17]:
# Rows without an article fall back to "<Gene> <Variation>" as their text
train_text_missing = train["Text"].isnull()
train_fallback_text = train["Gene"] + " " + train["Variation"]
train.loc[train_text_missing, "Text"] = train_fallback_text

In [18]:
train.isnull().sum()

ID           0
Gene         0
Variation    0
Text         0
Class        0
dtype: int64

In [19]:
# Finding space filled text data
# The check is applied to the Text column itself; a non-string cell counts as
# "not whitespace" instead of raising on .isspace().
is_blank = train["Text"].map(lambda txt: isinstance(txt, str) and txt.isspace()).astype(bool)
lst = train.index[is_blank].tolist()
lst
# Hence no space filled data

[]

In [20]:
# Test

In [21]:
test.isnull().sum()

ID           0
Gene         0
Variation    0
Text         1
dtype: int64

In [22]:
test[test["Text"].isnull()]

Unnamed: 0,ID,Gene,Variation,Text
1623,1623,AURKB,Amplification,


In [23]:
# Rows without an article fall back to "<Gene> <Variation>" as their text
test_text_missing = test["Text"].isnull()
test_fallback_text = test["Gene"] + " " + test["Variation"]
test.loc[test_text_missing, "Text"] = test_fallback_text

In [24]:
# Finding space filled text data
# The check is applied to the Text column itself; a non-string cell counts as
# "not whitespace" instead of raising on .isspace().
is_blank = test["Text"].map(lambda txt: isinstance(txt, str) and txt.isspace()).astype(bool)
lst = test.index[is_blank].tolist()
lst
# Hence no space filled data

[]

In [34]:
# Stop-word removal and lemmatization can be done in two ways:
# with NLTK or with spaCy.
# Only one of the two needs to be run.

In [35]:
# NLTK Method

In [8]:
# Word Lemmatization tagging function
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "N":
        return wordnet.NOUN
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    return wordnet.NOUN

In [9]:
# On Train Data
# Loop-invariant work is done once up front: building the stop-word set inside
# the per-word filter re-read the stop-word list for every single token.
stop_words = set(stopwords.words('english'))
non_alnum = re.compile('[^a-zA-Z0-9]')
lemmatizer = WordNetLemmatizer()
# Each distinct word is tagged and lemmatized in isolation, so its lemma can be
# remembered and reused for every later occurrence.
lemma_cache = {}

corpus_train = []
for i, raw_text in enumerate(train["Text"]):
    if (i%100==0):
        print (i)
    # Replace non-alphanumerics with spaces, lower-case, split on whitespace
    words = non_alnum.sub(" ", raw_text).lower().split()
    lemmas = []
    for w in words:
        if w in stop_words:
            continue
        if w not in lemma_cache:
            lemma_cache[w] = lemmatizer.lemmatize(w, get_wordnet_pos(w))
        lemmas.append(lemma_cache[w])
    corpus_train.append(" ".join(lemmas))

0


KeyboardInterrupt: 

In [None]:
# On Test Data
# Loop-invariant work is done once up front: building the stop-word set inside
# the per-word filter re-read the stop-word list for every single token.
stop_words = set(stopwords.words('english'))
non_alnum = re.compile('[^a-zA-Z0-9]')
lemmatizer = WordNetLemmatizer()
# Each distinct word is tagged and lemmatized in isolation, so its lemma can be
# remembered and reused for every later occurrence.
lemma_cache = {}

corpus_test = []
# Iterate over the TEST texts (the loop previously read train["Text"] here).
for i, raw_text in enumerate(test["Text"]):
    if (i%50==0):
        print (i)
    # Replace non-alphanumerics with spaces, lower-case, split on whitespace
    words = non_alnum.sub(" ", raw_text).lower().split()
    lemmas = []
    for w in words:
        if w in stop_words:
            continue
        if w not in lemma_cache:
            lemma_cache[w] = lemmatizer.lemmatize(w, get_wordnet_pos(w))
        lemmas.append(lemma_cache[w])
    corpus_test.append(" ".join(lemmas))

In [36]:
# Spacy method

In [None]:
# Spacy stopword and lemmatization
# Train Data
# The stop-word set is built once; constructing it inside the per-word filter
# repeated that work for every token.
stop_words = set(stopwords.words('english'))

corpus_train = []
for i, raw_text in enumerate(train["Text"]):
    if (i%100==0):
        print (i)
    # Replace non-alphanumerics with spaces, lower-case, split on whitespace
    words = re.sub('[^a-zA-Z0-9]' , " " , raw_text).lower().split()
    kept = " ".join(word for word in words if word not in stop_words)
    # spaCy lemmatizes the filtered text as one document
    corpus_train.append(" ".join(token.lemma_ for token in nlp(kept)))

In [None]:
# Spacy stopword and lemmatization
# Test Data
# The stop-word set is built once; constructing it inside the per-word filter
# repeated that work for every token.
stop_words = set(stopwords.words('english'))

# This cell builds corpus_test from the test texts. It previously re-processed
# the training texts into corpus_train, so corpus_test was never created.
corpus_test = []
for i, raw_text in enumerate(test["Text"]):
    if (i%100==0):
        print (i)
    # Replace non-alphanumerics with spaces, lower-case, split on whitespace
    words = re.sub('[^a-zA-Z0-9]' , " " , raw_text).lower().split()
    kept = " ".join(word for word in words if word not in stop_words)
    # spaCy lemmatizes the filtered text as one document
    corpus_test.append(" ".join(token.lemma_ for token in nlp(kept)))

In [None]:
# Putting this new clean text to original file

In [None]:
# Train Data
# assign() swaps in the cleaned text in one step, without mutating the frame in
# place, and raises if corpus_train does not hold exactly one entry per row
# (the drop + concat version padded with NaN silently in that case).
train = train.assign(Text=corpus_train)[["ID","Gene","Variation","Text","Class"]]
train.head()

In [None]:
# Test Data
# assign() swaps in the cleaned text in one step, without mutating the frame in
# place, and raises if corpus_test does not hold exactly one entry per row
# (the drop + concat version padded with NaN silently in that case).
test = test.assign(Text=corpus_test)[["ID","Gene","Variation","Text"]]
test.head()

In [None]:
# Adding the words from the other two columns (Gene and Variation) to the main text column

In [None]:
# Train Data

In [None]:
#Converting all "gene" and "variation" to standard format
# Vectorised over the whole column: every non-alphanumeric character becomes a
# space and the result is lower-cased. This does not depend on the frame having
# a 0..n-1 index, unlike the row-by-row train.loc[i, ...] writes.
for column in ("Gene", "Variation"):
    train[column] = train[column].str.replace("[^a-zA-Z0-9]", " ", regex=True).str.lower()

In [None]:
# Taking all words to main text
# Prefix each article with its gene and variation, then keep only the combined
# text and the label; ID, Gene and Variation are dropped here.
combined_text = train["Gene"] + " " + train["Variation"] + " " + train["Text"]
train = train.assign(Text=combined_text)[["Text", "Class"]]
train.head()

In [None]:
# Test Data

In [None]:
#Converting all "gene" and "variation" to standard format
for i in range (len(test)):
    review_gene = re.sub("[^a-zA-Z0-9]" , " " , test["Gene"][i])
    review_var = re.sub("[^a-zA-Z0-9]" , " " , test["Variation"][i])
    review_gene = review_gene.lower()
    review_var = review_var.lower()
    test.loc[i,"Gene"] = review_gene
    test.loc[i,"Variation"] = review_var

In [None]:
# Taking all words to main text
df =  test["Gene"] + " " + test["Variation"] + " " + test["Text"]
df = pd.DataFrame({"Text" : df})
test.drop(["Text"],axis=1,inplace=True)
test = pd.concat( [test,df] , axis=1)
test = test[["ID","Gene","Variation","Text"]]
test.drop(["ID","Gene","Variation"] , axis=1 , inplace=True)
test.head()

In [4]:
# Importing cleaned data

In [4]:
# Reload the pre-cleaned train/test tables from disk.
# NOTE(review): no visible cell writes train_clean.csv / test_clean.csv — confirm how they were produced.
train, test = (
    pd.read_csv(clean_path)
    for clean_path in (
        r"C:\Users\admin\Desktop\Datasets\Personalized Medicine Redefining Cancer Treatment\train_clean.csv",
        r"C:\Users\admin\Desktop\Datasets\Personalized Medicine Redefining Cancer Treatment\test_clean.csv",
    )
)

In [5]:
train.head()

Unnamed: 0,Gene,Variation,Text,Class
0,fam58a,truncating mutations,cyclin dependent kinase cdks regulate variety ...,1
1,cbl,w802,abstract background non small cell lung cancer...,2
2,cbl,q249e,abstract background non small cell lung cancer...,2
3,cbl,n454d,recent evidence demonstrate acquire uniparenta...,3
4,cbl,l399v,oncogenic mutation monomeric casitas b lineage...,4


In [6]:
test.head()

Unnamed: 0,Gene,Variation,Text
0,acsl4,r570s,2 mutation result myeloproliferative phenotype...
1,naglu,p521l,abstract large tumor suppressor 1 lats1 serine...
2,pah,l333f,vascular endothelial growth factor receptor ve...
3,ing1,a148d,inflammatory myofibroblastic tumor imt neoplas...
4,tmem216,g77a,abstract retinoblastoma pediatric retinal tumo...


In [7]:
# Shapes of the reloaded train and test frames, one per line
print(train.shape, test.shape, sep="\n")

(3321, 4)
(5668, 3)


In [8]:
# Stack the train text on top of the test text as a single-column frame
stacked_text = pd.concat([train["Text"] , test["Text"]])
data = stacked_text.to_frame(name="Text")
data.head()

Unnamed: 0,Text
0,cyclin dependent kinase cdks regulate variety ...
1,abstract background non small cell lung cancer...
2,abstract background non small cell lung cancer...
3,recent evidence demonstrate acquire uniparenta...
4,oncogenic mutation monomeric casitas b lineage...


In [9]:
data.shape

(8989, 1)

In [10]:
# Learn a TF-IDF vocabulary from the "Text" column of `data` and vectorise that
# same column in one call; the bare `data_vec` on the last line displays the
# resulting sparse matrix summary.
# NOTE(review): `data` appears to hold both train and test text, so the IDF
# weights would also reflect the held-out rows — confirm this is intended.
vectorizer = TfidfVectorizer()
data_vec = vectorizer.fit_transform(data["Text"])
data_vec

<8989x159178 sparse matrix of type '<class 'numpy.float64'>'
	with 13662172 stored elements in Compressed Sparse Row format>

In [11]:
# Undo the stacking by position: the first len(train) rows of data_vec are the
# training documents and the remaining rows are the test documents. The boundary
# comes from the data instead of a hard-coded row count.
n_train = len(train)
train_vec , test_vec = data_vec[:n_train] , data_vec[n_train:]

In [12]:
train_vec

<3321x159178 sparse matrix of type '<class 'numpy.float64'>'
	with 4709265 stored elements in Compressed Sparse Row format>

In [13]:
test_vec

<5668x159178 sparse matrix of type '<class 'numpy.float64'>'
	with 8952907 stored elements in Compressed Sparse Row format>

In [14]:
# Shapes of the train and test TF-IDF matrices, one per line
print(train_vec.shape, test_vec.shape, sep="\n")

(3321, 159178)
(5668, 159178)


In [15]:
print(type(train_vec))

<class 'scipy.sparse.csr.csr_matrix'>


In [16]:
# Note: train_vec does not contain the "Class" column (labels stay in train["Class"]), and both train_vec and test_vec are sparse matrices

In [17]:
# Hold out 30% of the labelled rows for validation, with a fixed seed
split_options = {"test_size": 0.3, "random_state": 42}
x_train, x_test, y_train, y_test = train_test_split(
    train_vec,
    train["Class"],
    **split_options,
)

In [18]:
# Shapes of the fitting split followed by the held-out split (features, then labels)
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

(2324, 159178)
(2324,)
(997, 159178)
(997,)


In [19]:
# Analysis

In [20]:
# MultinomialNB
# Fit on the training split, predict the competition test set, then report
# accuracy on the held-out split.
clf = MultinomialNB().fit(x_train, y_train)
prediction1 = clf.predict(test_vec)
holdout_pred = clf.predict(x_test)
metrics.accuracy_score(y_test, holdout_pred)

0.4663991975927783

In [21]:
# Support Vector Machine
# Fit on the training split, predict the competition test set, then report
# accuracy on the held-out split.
clf = SVC().fit(x_train, y_train)
prediction2 = clf.predict(test_vec)
holdout_pred = clf.predict(x_test)
metrics.accuracy_score(y_test, holdout_pred)

0.6479438314944834

In [22]:
# KNN
# Fit on the training split, predict the competition test set, then report
# accuracy on the held-out split.
clf = KNeighborsClassifier().fit(x_train, y_train)
prediction3 = clf.predict(test_vec)
holdout_pred = clf.predict(x_test)
metrics.accuracy_score(y_test, holdout_pred)

0.6108324974924775

In [23]:
# Decision Tree
# random_state is fixed (same seed as the validation split) so that a re-run
# reproduces the same tree, predictions and accuracy.
clf = DecisionTreeClassifier(random_state=42)
clf.fit (x_train , y_train)
prediction4 = clf.predict ( test_vec )
metrics.accuracy_score ( y_test , clf.predict( x_test ) )

0.5476429287863591

In [24]:
# Random Forest
# random_state is fixed (same seed as the validation split) so that a re-run
# reproduces the same forest, predictions and accuracy.
clf = RandomForestClassifier(random_state=42)
clf.fit (x_train , y_train)
prediction5 = clf.predict ( test_vec )
metrics.accuracy_score ( y_test , clf.predict( x_test ) )

0.6359077231695085

In [25]:
# Logistic Regression
# Fit on the training split, predict the competition test set, then report
# accuracy on the held-out split.
clf = LogisticRegression().fit(x_train, y_train)
prediction6 = clf.predict(test_vec)
holdout_pred = clf.predict(x_test)
metrics.accuracy_score(y_test, holdout_pred)

0.6248746238716149

In [26]:
# XGBoost
# Train on zero-based labels: recent XGBoost releases reject class labels that
# do not start at 0, while the classes in this notebook are 1..9. Predictions
# are shifted back by +1 so prediction7 stays on the same scale as the other
# models' predictions.
# NOTE(review): assumes every class 1..9 occurs in y_train — confirm; otherwise
# encode the labels with a LabelEncoder instead of a fixed shift.
clf = xgb.XGBClassifier()
clf.fit (x_train , y_train - 1)
prediction7 = clf.predict ( test_vec ) + 1
metrics.accuracy_score ( y_test , clf.predict( x_test ) + 1 )

0.6579739217652959

In [27]:
# SGDClassifier
# random_state is fixed (same seed as the validation split) so that a re-run
# reproduces the same model, predictions and accuracy.
clf = SGDClassifier(random_state=42)
clf.fit (x_train , y_train)
prediction8 = clf.predict ( test_vec )
metrics.accuracy_score ( y_test , clf.predict( x_test ) )

0.6409227683049148

In [28]:
# Extra Trees Classifier
# random_state is fixed (same seed as the validation split) so that a re-run
# reproduces the same ensemble, predictions and accuracy.
clf = ExtraTreesClassifier(random_state=42)
clf.fit (x_train , y_train)
prediction9 = clf.predict ( test_vec )
metrics.accuracy_score ( y_test , clf.predict( x_test ) )

0.6359077231695085

In [114]:
final_pred = []
#final_pred.append(prediction1)
final_pred.append(prediction2)
final_pred.append(prediction3)
#final_pred.append(prediction4)
final_pred.append(prediction5)
final_pred.append(prediction6)
final_pred.append(prediction7)
final_pred.append(prediction8)
final_pred.append(prediction9)

prediction = []

for i in range (len(final_pred[0])):
    score = {1:0 , 2:0 , 3:0 , 4:0 , 5:0 , 6:0 , 7:0 , 8:0 ,9:0 }
    for j in range (len(final_pred)):
        score[final_pred[j][i]] = score[final_pred[j][i]] + 1
    
    max = score[1]
    for k in range(2,10):
        if (max<score[k]):
            max = score[k]
            
    prediction.append(max)

In [115]:
pred = pd.DataFrame({"Prediction" : prediction})
pred = pd.get_dummies(pred["Prediction"])
lst1 = np.zeros((5668) , dtype=int)
lst1 = pd.DataFrame({1 : lst1})
lst8 = np.zeros((5668) , dtype=int)
lst8 = pd.DataFrame({8 : lst8})
lst9 = np.zeros((5668) , dtype=int)
lst9 = pd.DataFrame({9 : lst9})
Prediction = pd.concat([lst1 , pred , lst8 , lst9] , axis=1)
Prediction.rename(columns={1:"class1",2:"class2",3:"class3",4:"class4",5:"class5",6:"class6",7:"class7",8:"class8",9:"class9"} , inplace=True)

ID = np.arange(1,5669,1)
Id = pd.DataFrame({"ID" : ID})
p = pd.concat([Id , Prediction] , axis=1)
p.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0,0


In [119]:
# Write the submission table with a header row and without the index column
submission_path = r"C:\Users\admin\Desktop\submission.csv"
p.to_csv(submission_path, header=True, index=False)