PERSONALIZED CANCER DIAGNOSIS

**Problem Statement :**
Classify the given genetic variations/mutations based on evidence from text-based clinical literature.

**Objective:**
Predict the probability of each data-point belonging to each of the nine classes.

**Constraints:**
1. Interpretability 
2. Class probabilities are needed. 
3. Penalize the errors in class probabilities => Metric is Log-loss.
4. No Latency constraints.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings
import numpy as np
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os

## READ THE TRAINING VARIANTS DATA

In [None]:
# Load the variants table and report its dimensions and column names.
data = pd.read_csv('../input/cancer-diagonsis/training_variants')
n_variant_rows, n_variant_cols = data.shape
print('Number of data points : ', n_variant_rows)
print('Number of features : ', n_variant_cols)
print('Features : ', data.columns.values)
data.head()

training/training_variants is a comma separated file containing the description of the genetic mutations used for training.
Fields are



1.   ID : the id of the row used to link the mutation to the clinical evidence
2.   Gene : the gene where this genetic mutation is located
3.   Variation : the amino acid change for this mutation
4.   Class : 1-9, the class this genetic mutation has been classified into





## READ THE TEXT DATA

In [None]:
# The text file separates ID and TEXT with a literal "||". The separator is a
# regex, so it is written as a raw string to keep Python from interpreting the
# backslashes itself (non-raw "\|" is an invalid escape sequence).
# skiprows=1 drops the file's first line (presumably a header - confirm).
data_text = pd.read_csv(
    "../input/cancer-diagonsis/training_text",
    sep=r"\|\|",
    engine="python",
    names=["ID", "TEXT"],
    skiprows=1,
)
print('Number of data points : ', data_text.shape[0])
print('Number of features : ', data_text.shape[1])
print('Features : ', data_text.columns.values)
data_text.head()

In [None]:
import nltk
# Download the NLTK stop-word corpus so stopwords.words(...) can be used.
nltk.download('stopwords')

In [None]:
# English stop words, held in a set so membership tests during preprocessing are O(1).
stop_words = set(stopwords.words('english'))

## APPLY NLP PREPROCESSING TASK 

In [None]:
def nlp_preprocessing(total_text,index,column):
  """Clean one text entry and write the result back into ``data_text``.

  The text has every character other than letters, digits and newlines
  replaced by a space, runs of whitespace collapsed to one space, is
  lower-cased, and has all words found in ``stop_words`` removed. Each kept
  word is followed by a single space (so a non-empty result ends in a space).

  Args:
    total_text: the raw text; nothing is done when it is an ``int``.
    index: row label in ``data_text`` to update.
    column: column name in ``data_text`` to update.

  Returns:
    None. The cleaned text is stored at ``data_text.loc[index, column]``.
  """
  if type(total_text) is int:
    return

  # REPLACE EVERY SPECIAL CHARACTER WITH THE SPACE
  total_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', total_text)

  # REPLACE MULTIPLE SPACES WITH SINGLE SPACE
  total_text = re.sub(r'\s+', ' ', total_text)

  # CONVERT ALL THE CHARACTER TO LOWER CASE
  total_text = total_text.lower()

  # KEEP ONLY THE WORDS THAT ARE NOT STOP WORDS
  cleaned = "".join(word + " " for word in total_text.split() if word not in stop_words)

  # Single .loc assignment instead of data_text[column][index] = ...:
  # chained assignment may write to a temporary copy and silently do nothing.
  data_text.loc[index, column] = cleaned

In [None]:
# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start_time = time.perf_counter()

# Clean every text entry in place; rows whose TEXT is not a string are only reported.
for index,row in data_text.iterrows():
  if isinstance(row['TEXT'], str):
    nlp_preprocessing(row['TEXT'],index,'TEXT')
  else:
    print('There is no text description for id :',index)
print('Time : ',time.perf_counter() - start_time,"seconds")

In [None]:
# Left-join the text onto the gene/variation table by ID, so every variant row
# is kept even when it has no matching text.
final_data = data.merge(data_text, how='left', on='ID')

final_data.head()

In [None]:
# Display the rows that contain a missing value in at least one column.
final_data[final_data.isnull().any(axis=1)]

In [None]:
# Rows without text get "<Gene> <Variation>" as a stand-in description.
missing_text_mask = final_data['TEXT'].isnull()
final_data.loc[missing_text_mask, 'TEXT'] = final_data['Gene'] + ' ' + final_data['Variation']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


Y_Class=final_data['Class'].values

# Replace whitespace runs inside gene / variation names with "_".
# regex=True must be explicit: from pandas 2.0 the default is a literal
# match, which would look for the text "\s+" and replace nothing.
final_data.Gene=final_data.Gene.str.replace(r'\s+','_',regex=True)
final_data.Variation=final_data.Variation.str.replace(r'\s+','_',regex=True)

# SPLIT THE DATA INTO TEST,TRAIN AND CV
# Both splits are stratified so every split keeps the class proportions of the
# full data set (the second split previously was not stratified).
X_1,X_Test,Y_1,Y_Test=train_test_split(final_data,Y_Class,stratify=Y_Class,test_size=0.2)
X_Train,X_CV,Y_Train,Y_CV=train_test_split(X_1,Y_1,stratify=Y_1,test_size=0.2)

In [None]:
# Report the size of each split (the test line previously printed the train size).
print('Number of data points in train data:', X_Train.shape[0])
print('Number of data points in test data:', X_Test.shape[0])
print('Number of data points in cross validation data:', X_CV.shape[0])

In [None]:
def plot_confusion_matrix(test_y,predict_y):
  """Plot the confusion, precision and recall matrices for a 9-class problem.

  Args:
    test_y: true class labels (values 1..9).
    predict_y: predicted class labels (values 1..9).

  Three heatmaps are shown in order: raw confusion counts, the
  column-normalised matrix (precision) and the row-normalised matrix (recall).
  """
  # Imported here so the function does not depend on a later cell having
  # imported seaborn before it is called.
  import seaborn as sns

  labels = [1,2,3,4,5,6,7,8,9]

  # Passing labels pins the matrix to 9x9 in label order; without it a class
  # missing from both inputs shrinks the matrix and misaligns the tick labels.
  C = confusion_matrix(test_y,predict_y,labels=labels)

  # Recall matrix: every row divided by its row sum.
  A=(((C.T)/(C.sum(axis=1))).T)

  # Precision matrix: every column divided by its column sum.
  # A row/column whose sum is 0 yields NaN cells in A/B.
  B = (C/C.sum(axis=0))

  panels = [
    ("Confusion Matrix", C),
    ("Precision matrix (Columm Sum=1)", B),
    ("Recall matrix (Row sum=1)", A),
  ]
  for title, matrix in panels:
    print("-"*20, title, "-"*20)
    plt.figure(figsize=(20,7))
    sns.heatmap(matrix, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()

In [None]:
from sklearn.metrics import log_loss
import seaborn as sns
X_Test_len = X_Test.shape[0]
X_CV_len = X_CV.shape[0]

# Baseline "random model": for every point draw 9 uniform random numbers and
# divide by their sum so each row is a probability distribution over 9 classes.
Y_CV_Predicted = np.random.rand(X_CV_len,9)
Y_CV_Predicted /= Y_CV_Predicted.sum(axis=1, keepdims=True)
print("Log Loss on CV using Random Model",log_loss(Y_CV, Y_CV_Predicted,eps=1e-15))


Y_Test_Predicted = np.random.rand(X_Test_len,9)
Y_Test_Predicted /= Y_Test_Predicted.sum(axis=1, keepdims=True)
# This is the test-set loss (the message previously said "CV").
print("Log Loss on Test using Random Model",log_loss(Y_Test, Y_Test_Predicted,eps=1e-15))

# argmax gives a 0-based column index; classes are numbered 1..9, hence the +1.
predicted_y =np.argmax(Y_Test_Predicted, axis=1)
plot_confusion_matrix(Y_Test, predicted_y+1)

#UNIVARIATE ANALYSIS

In [None]:
def get_gene_variation_feature_dic(alpha,feature,df):
  """Build the smoothed per-class ratios for every value of ``feature``.

  For each distinct value v of ``df[feature]`` and each class k in 1..9 the
  entry is (rows with value v and Class k + 10*alpha) / (rows with value v + 90*alpha).

  Args:
    alpha: smoothing strength.
    feature: column name in ``df`` to encode.
    df: DataFrame containing ``feature`` and a ``Class`` column.

  Returns:
    dict mapping each value of ``feature`` to a list of 9 floats (classes 1..9).
  """
  value_count = df[feature].value_counts()
  print("Value Count :", value_count)

  def class_ratios(category, total):
    # Rows carrying this category; combined with each class mask below.
    has_category = df[feature] == category
    return [
      (df.loc[(df['Class'] == cls) & has_category].shape[0] + alpha*10) / (total + 90*alpha)   #Laplace Smoothing
      for cls in range(1,10)
    ]

  return {category: class_ratios(category, total) for category, total in value_count.items()}

In [None]:
def get_gene_variation_features(alpha,feature,df,fit_df=None):
  """Response-code ``feature`` for every row of ``df``.

  Args:
    alpha: smoothing strength forwarded to get_gene_variation_feature_dic.
    feature: column name to encode.
    df: DataFrame whose rows are encoded.
    fit_df: optional DataFrame (with ``feature`` and ``Class`` columns) from
      which the per-class ratios are computed. Defaults to ``df`` itself,
      which is the original behaviour. Pass the training frame here when
      encoding test / CV data so their own labels are not used.

  Returns:
    A list with one 9-element list per row of ``df``. Values of ``feature``
    that do not occur in ``fit_df`` get the uniform vector [1/9]*9.
  """
  if fit_df is None:
    fit_df = df

  gv_dict = get_gene_variation_feature_dic(alpha,feature,fit_df)

  gv_fea = []
  for value in df[feature]:
    # Single dict lookup per row; the previous code rebuilt dict(value_count)
    # for every row.
    encoded = gv_dict.get(value)
    gv_fea.append(encoded if encoded is not None else [1/9]*9)
  return gv_fea

## Univariate Analysis on Gene Features

In [None]:
# Number of training rows per gene.
unique_genes = X_Train['Gene'].value_counts()

print("Number of Unique Genes :", len(unique_genes))

print(unique_genes.head(10))

### Looking at the counts, there are 236 distinct Gene categories in the training data

### Distribution are as follows

In [None]:
# h holds each gene's share of the training rows (counts divided by their
# total), so the y-axis is a fraction, not a raw count.
s = sum(unique_genes.values);
h = unique_genes.values/s;
plt.plot(h, label="Histogram of Genes")
plt.xlabel('Index of a Gene')
plt.ylabel('Fraction of Occurrences')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Running total of the per-gene fractions in h.
c = h.cumsum()
plt.plot(c, label='Cumulative distribution of Genes')
plt.legend()
plt.grid()
plt.show()

#### There are 2 ways we can featurize this variable.



1.   One Hot Encoding 
2.   Response Coding

We will choose the appropriate featurization based on the ML model we use.



# BAG OF WORDS VECTORIZATION TECHNIQUE 

### Response Coding Method on Gene Feature

In [None]:
# Smoothing strength handed to the response-coding helper.
alpha = 1
separator = "="*100

# NOTE(review): each split is encoded from its own rows, and the helper reads
# that frame's Class column - confirm that using test / CV labels is intended.
X_Train_gene_Feature_responsecoding = np.asarray(get_gene_variation_features(alpha, "Gene", X_Train))
print("Train Gene Feature :", X_Train_gene_Feature_responsecoding.shape)

print(separator)

X_Test_gen_Feature_responsecoding = np.asarray(get_gene_variation_features(alpha, "Gene", X_Test))
print("Test Gene Feature :", X_Test_gen_Feature_responsecoding.shape)

print(separator)

X_CV_gene_Feature_responsecoding = np.asarray(get_gene_variation_features(alpha, "Gene", X_CV))
print("CV Gene Feature :", X_CV_gene_Feature_responsecoding.shape)

print(separator)

### One Hot Encoding Method on Gene feature

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is learned from the training genes only; test and CV rows are
# then transformed with that same fitted vectorizer.
gene_vectorizer = CountVectorizer()
separator = "="*100

X_Train_gene_Feature_onehotEncoding = gene_vectorizer.fit_transform(X_Train["Gene"])
print(" Train Gene Feature :", X_Train_gene_Feature_onehotEncoding.shape)

print(separator)

X_Test_gene_Feature_onehotencoding = gene_vectorizer.transform(X_Test["Gene"])
print(" Test Gene Feature :", X_Test_gene_Feature_onehotencoding.shape)

print(separator)

X_CV_gene_Feature_onehotencoding = gene_vectorizer.transform(X_CV["Gene"])
print(" CV Gene Feature :", X_CV_gene_Feature_onehotencoding.shape)

print(separator)

### APPLY LOGISTIC REGRESSION (SGD CLASSIFIER WITH LOG LOSS) TO FIND THE BEST HYPERPARAMETER

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

# Candidate regularisation strengths: powers of ten from 10**-5 to 10**0.
alpha = [10 ** x for x in range(-5, 1)]

# CV log loss of each candidate, in the same order as `alpha`.
cv_log_error = []

for reg_strength in alpha:
  # Fit the base classifier (this also populates clf.classes_), then fit a
  # sigmoid-calibrated wrapper on the same training data.
  clf = SGDClassifier(alpha=reg_strength, penalty='l2', loss='log', random_state=42)
  clf.fit(X_Train_gene_Feature_onehotEncoding, Y_Train)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_Train_gene_Feature_onehotEncoding, Y_Train)
  Predicted_Y = sig_clf.predict_proba(X_CV_gene_Feature_onehotencoding)
  cv_loss = log_loss(Y_CV, Predicted_Y, labels=clf.classes_, eps=1e-15)
  cv_log_error.append(cv_loss)
  print('For values of alpha = ', reg_strength, "The log loss is:", cv_loss)

# Plot CV error against alpha and label every point with (alpha, rounded error).
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error, c='g')

for candidate, error, rounded in zip(alpha, cv_log_error, np.round(cv_log_error, 3)):
  ax.annotate((candidate, np.round(rounded, 3)), (candidate, error))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error Measure")
plt.show()

# Refit with the alpha that gave the lowest CV log loss (best_alpha is an index into alpha).
best_alpha = np.argmin(cv_log_error)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(X_Train_gene_Feature_onehotEncoding, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_gene_Feature_onehotEncoding, Y_Train)

# Report the log loss on train, CV and test, in that order.
evaluation_splits = (
  ("The train log loss is:", X_Train_gene_Feature_onehotEncoding, Y_Train),
  ("The cross validation log loss is:", X_CV_gene_Feature_onehotencoding, Y_CV),
  ("The test log loss is:", X_Test_gene_Feature_onehotencoding, Y_Test),
)
for message, features, targets in evaluation_splits:
  predict_y = sig_clf.predict_proba(features)
  print('For values of best alpha = ', alpha[best_alpha], message, log_loss(targets, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
print("Q6. How many data points in Test and CV datasets are covered by the ", unique_genes.shape[0], " genes in train dataset?")

# Genes seen in training, computed once and reused for both splits.
train_genes = list(set(X_Train['Gene']))
test_coverage = X_Test[X_Test['Gene'].isin(train_genes)].shape[0]
cv_coverage = X_CV[X_CV['Gene'].isin(train_genes)].shape[0]

print('Ans\n1. In test data',test_coverage, 'out of',X_Test.shape[0], ":",(test_coverage/X_Test.shape[0])*100)
print('2. In cross validation data',cv_coverage, 'out of ',X_CV.shape[0],":" ,(cv_coverage/X_CV.shape[0])*100)

## Univariate Analysis on Variation Features

In [None]:
# Number of training rows per variation.
unique_variations=X_Train['Variation'].value_counts()

# This counts variations (the message previously said "Genes").
print("Number of Unique Variations :",unique_variations.shape[0])

print(unique_variations.head(10))

In [None]:
# h holds each variation's share of the training rows (counts divided by their
# total), so the y-axis is a fraction, not a raw count.
s = sum(unique_variations.values);
h = unique_variations.values/s;
plt.plot(h, label="Histogram of Variations")
plt.xlabel('Index of a Variation')
plt.ylabel('Fraction of Occurrences')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Running total of the per-variation fractions in h.
c = h.cumsum()
print(c)
plt.plot(c, label='Cumulative distribution of Variations')
plt.legend()
plt.grid()
plt.show()

#### There are 2 ways we can featurize this variable.



1.   One Hot Encoding 
2.   Response Coding

We will choose the appropriate featurization based on the ML model we use.

### Response Coding Method on Variation Features

In [None]:
# Smoothing strength handed to the response-coding helper.
alpha = 1
separator = "="*100

# NOTE(review): each split is encoded from its own rows, and the helper reads
# that frame's Class column - confirm that using test / CV labels is intended.
X_Train_variation_Feature_responsecoding = np.asarray(get_gene_variation_features(alpha, "Variation", X_Train))
print("Train Variation Feature :", X_Train_variation_Feature_responsecoding.shape)

print(separator)

X_Test_variation_Feature_responsecoding = np.asarray(get_gene_variation_features(alpha, "Variation", X_Test))
print("Test Variation Feature :", X_Test_variation_Feature_responsecoding.shape)

print(separator)

X_CV_variation_Feature_responsecoding = np.asarray(get_gene_variation_features(alpha, "Variation", X_CV))
print("CV Variation Feature :", X_CV_variation_Feature_responsecoding.shape)

print(separator)

### One Hot Encoding on Variation Feature

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is learned from the training variations only; test and CV
# rows are then transformed with that same fitted vectorizer.
variation_vectorizer = CountVectorizer()
separator = "="*100

X_Train_variation_Feature_onehotEncoding = variation_vectorizer.fit_transform(X_Train["Variation"])
print(" Train Variation Feature :", X_Train_variation_Feature_onehotEncoding.shape)

print(separator)

X_Test_variation_Feature_onehotencoding = variation_vectorizer.transform(X_Test["Variation"])
print(" Test Variation Feature :", X_Test_variation_Feature_onehotencoding.shape)

print(separator)

X_CV_variation_Feature_onehotencoding = variation_vectorizer.transform(X_CV["Variation"])
print(" CV Variation Feature :", X_CV_variation_Feature_onehotencoding.shape)

print(separator)

### APPLY LOGISTIC REGRESSION (SGD CLASSIFIER WITH LOG LOSS) TO FIND THE BEST HYPERPARAMETER

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

# Candidate regularisation strengths: powers of ten from 10**-5 to 10**0.
alpha = [10 ** x for x in range(-5, 1)]

# CV log loss of each candidate, in the same order as `alpha`.
cv_log_error = []

for reg_strength in alpha:
  # Fit the base classifier (this also populates clf.classes_), then fit a
  # sigmoid-calibrated wrapper on the same training data.
  clf = SGDClassifier(alpha=reg_strength, penalty='l2', loss='log', random_state=42)
  clf.fit(X_Train_variation_Feature_onehotEncoding, Y_Train)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_Train_variation_Feature_onehotEncoding, Y_Train)
  Predicted_Y = sig_clf.predict_proba(X_CV_variation_Feature_onehotencoding)
  cv_loss = log_loss(Y_CV, Predicted_Y, labels=clf.classes_, eps=1e-15)
  cv_log_error.append(cv_loss)
  print('For values of alpha = ', reg_strength, "The log loss is:", cv_loss)

# Plot CV error against alpha and label every point with (alpha, rounded error).
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error, c='g')

for candidate, error, rounded in zip(alpha, cv_log_error, np.round(cv_log_error, 3)):
  ax.annotate((candidate, np.round(rounded, 3)), (candidate, error))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error Measure")
plt.show()

# Refit with the alpha that gave the lowest CV log loss (best_alpha is an index into alpha).
best_alpha = np.argmin(cv_log_error)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(X_Train_variation_Feature_onehotEncoding, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_variation_Feature_onehotEncoding, Y_Train)

# Report the log loss on train, CV and test, in that order.
evaluation_splits = (
  ("The train log loss is:", X_Train_variation_Feature_onehotEncoding, Y_Train),
  ("The cross validation log loss is:", X_CV_variation_Feature_onehotencoding, Y_CV),
  ("The test log loss is:", X_Test_variation_Feature_onehotencoding, Y_Test),
)
for message, features, targets in evaluation_splits:
  predict_y = sig_clf.predict_proba(features)
  print('For values of best alpha = ', alpha[best_alpha], message, log_loss(targets, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
# Variations seen in training, computed once and reused for both splits.
train_variations = list(set(X_Train['Variation']))
test_coverage = X_Test[X_Test['Variation'].isin(train_variations)].shape[0]
cv_coverage = X_CV[X_CV['Variation'].isin(train_variations)].shape[0]

print('1. In test data',test_coverage, 'out of',X_Test.shape[0], ":",(test_coverage/X_Test.shape[0])*100)
print('2. In cross validation data',cv_coverage, 'out of ',X_CV.shape[0],":" ,(cv_coverage/X_CV.shape[0])*100)

## Univariate Analysis on Text Features

In [None]:
def extract_dictionary_paddle(cls_text):
    """Count the whitespace-separated words in the TEXT column of ``cls_text``.

    Args:
        cls_text: DataFrame with a string ``TEXT`` column.

    Returns:
        defaultdict(int) mapping each word to its total number of occurrences
        over all rows.
    """
    dictionary = defaultdict(int)
    for text in cls_text['TEXT']:
        for token in text.split():
            dictionary[token] += 1
    return dictionary

In [None]:
import math

def get_text_responsecoding(df):
    """Response-code the TEXT column of ``df`` into 9 per-class scores.

    For row r and class index i (0..8) the score is the geometric mean, over
    the words of the text, of
    (dict_list[i][word] + 10) / (total_dict[word] + 90),
    with unseen words counted as 0. Reads the module-level ``dict_list`` and
    ``total_dict``.

    Args:
        df: DataFrame with a string ``TEXT`` column.

    Returns:
        numpy array of shape (len(df), 9).
    """
    n_classes = 9
    text_feature_responseCoding = np.zeros((df.shape[0], n_classes))
    for row_index, text in enumerate(df['TEXT']):
        # Split once per row instead of once per class (plus once more for the length).
        words = text.split()
        if not words:
            # No words: the mean over zero words is undefined (it used to raise
            # ZeroDivisionError). Give every class the same score so a later
            # row normalisation yields a uniform distribution.
            text_feature_responseCoding[row_index, :] = 1.0
            continue
        for i in range(n_classes):
            sum_prob = 0
            for word in words:
                sum_prob += math.log(((dict_list[i].get(word,0)+10 )/(total_dict.get(word,0)+90)))
            text_feature_responseCoding[row_index][i] = math.exp(sum_prob/len(words))
    return text_feature_responseCoding

### One Hot Encoding on Text Feature

In [None]:
# Bag-of-words counts over the training text; min_df=3 is passed so that
# rarely-seen words are left out of the vocabulary.
text_vectorizer = CountVectorizer(min_df=3)
X_Train_feature_onehotCoding = text_vectorizer.fit_transform(X_Train['TEXT'])

# Vocabulary words, in column order of the matrix above.
X_Train_text_features = text_vectorizer.get_feature_names()

# Summing along axis 0 gives one total per column, i.e. how many times each
# word occurs across all training rows (a flat vector of length n_features).
train_text_fea_counts = X_Train_feature_onehotCoding.sum(axis=0).A1

# word -> total number of occurrences in the training text
text_fea_dict = dict(zip(X_Train_text_features, train_text_fea_counts))


print("Total number of unique words in train data :", len(X_Train_text_features))

In [None]:
from collections import defaultdict
dict_list = []
# dict_list contains 9 dictionaries; dict_list[k] holds the word counts of class k+1
for i in range(1,10):
    cls_text = X_Train[X_Train['Class']==i]
    # build a word dict based on the words in that class and append it
    dict_list.append(extract_dictionary_paddle(cls_text))


# word counts over the whole training set
total_dict = extract_dictionary_paddle(X_Train)


# confuse_array[w][j] = (count of word w in class j+1 + 10) / (total count of word w + 90)
# .get(..., 0) is used instead of indexing: indexing a defaultdict inserts a
# zero entry for every missing word, which grew all 9 class dictionaries to the
# full vocabulary. The unused max_val variable was dropped.
confuse_array = []
for i in X_Train_text_features:
    ratios = []
    for j in range(0,9):
        ratios.append((dict_list[j].get(i, 0)+10 )/(total_dict.get(i, 0)+90))
    confuse_array.append(ratios)
confuse_array = np.array(confuse_array)

Response Coding on Text Feature

In [None]:
# Response-code the text of each split (one row of 9 class scores per data point).
separator = "="*100

X_Train_text_feature_responseCoding = get_text_responsecoding(X_Train)
print("Train Text Feature :", X_Train_text_feature_responseCoding.shape)

print(separator)

X_Test_text_feature_responseCoding = get_text_responsecoding(X_Test)
print("Test Text Feature :", X_Test_text_feature_responseCoding.shape)

print(separator)

X_CV_text_feature_responseCoding = get_text_responsecoding(X_CV)
print("CV Text Feature :", X_CV_text_feature_responseCoding.shape)

print(separator)

In [None]:
# Divide every row by its own sum so the 9 class scores of each data point add up to 1.
X_Train_text_feature_responseCoding = X_Train_text_feature_responseCoding / X_Train_text_feature_responseCoding.sum(axis=1, keepdims=True)
X_Test_text_feature_responseCoding = X_Test_text_feature_responseCoding / X_Test_text_feature_responseCoding.sum(axis=1, keepdims=True)
X_CV_text_feature_responseCoding = X_CV_text_feature_responseCoding / X_CV_text_feature_responseCoding.sum(axis=1, keepdims=True)

### Normalize the features

In [None]:
from sklearn.preprocessing import normalize
from scipy.sparse import diags

# Raw training counts, re-derived from the fitted vectorizer so this cell gives
# the same result when it is re-run after X_Train_feature_onehotCoding has
# already been normalised.
train_text_counts = text_vectorizer.transform(X_Train['TEXT'])

# L2 norm of every vocabulary column over the TRAINING rows. normalize(axis=0)
# divides each column by its own norm; doing that separately on test / CV would
# scale the same word differently in each split. The training norms are
# therefore reused for the other two splits.
train_col_norms = np.sqrt(
    np.asarray(train_text_counts.multiply(train_text_counts).sum(axis=0), dtype=float)
).ravel()
train_col_norms[train_col_norms == 0] = 1.0  # leave all-zero columns unchanged
column_scaler = diags(1.0 / train_col_norms)

X_Train_feature_onehotCoding = normalize(train_text_counts, axis=0)
print("Train Text Feature :", X_Train_feature_onehotCoding.shape)

print("="*100)

X_Test_text_feature_onehotCoding = text_vectorizer.transform(X_Test['TEXT'])
print("Test Text Feature :", X_Test_text_feature_onehotCoding.shape)

print("="*100)

X_Test_text_feature_onehotCoding = X_Test_text_feature_onehotCoding.dot(column_scaler).tocsr()

X_CV_text_feature_onehotCoding = text_vectorizer.transform(X_CV['TEXT'])
print("CV Text Feature :", X_CV_text_feature_onehotCoding.shape)

print("="*100)

X_CV_text_feature_onehotCoding = X_CV_text_feature_onehotCoding.dot(column_scaler).tocsr()

In [None]:
# Words ordered by their total training count, most frequent first.
ranked_word_counts = sorted(text_fea_dict.items(), key=lambda item: item[1], reverse=True)
sorted_text_fea_dict = dict(ranked_word_counts)
sorted_text_occur = np.array([count for _, count in ranked_word_counts])

### APPLY LOGISTIC REGRESSION (SGD CLASSIFIER WITH LOG LOSS) TO FIND THE BEST HYPERPARAMETER

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

# Candidate regularisation strengths: powers of ten from 10**-5 to 10**0.
alpha = [10 ** x for x in range(-5, 1)]

# CV log loss of each candidate, in the same order as `alpha`.
cv_log_error = []

for reg_strength in alpha:
  # Fit the base classifier (this also populates clf.classes_), then fit a
  # sigmoid-calibrated wrapper on the same training data.
  clf = SGDClassifier(alpha=reg_strength, penalty='l2', loss='log', random_state=42)
  clf.fit(X_Train_feature_onehotCoding, Y_Train)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_Train_feature_onehotCoding, Y_Train)
  Predicted_Y = sig_clf.predict_proba(X_CV_text_feature_onehotCoding)
  cv_loss = log_loss(Y_CV, Predicted_Y, labels=clf.classes_, eps=1e-15)
  cv_log_error.append(cv_loss)
  print('For values of alpha = ', reg_strength, "The log loss is:", cv_loss)

# Plot CV error against alpha and label every point with (alpha, rounded error).
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error, c='g')

for candidate, error, rounded in zip(alpha, cv_log_error, np.round(cv_log_error, 3)):
  ax.annotate((candidate, np.round(rounded, 3)), (candidate, error))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error Measure")
plt.show()

# Refit with the alpha that gave the lowest CV log loss (best_alpha is an index into alpha).
best_alpha = np.argmin(cv_log_error)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(X_Train_feature_onehotCoding, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_feature_onehotCoding, Y_Train)

# Report the log loss on train, CV and test, in that order.
evaluation_splits = (
  ("The train log loss is:", X_Train_feature_onehotCoding, Y_Train),
  ("The cross validation log loss is:", X_CV_text_feature_onehotCoding, Y_CV),
  ("The test log loss is:", X_Test_text_feature_onehotCoding, Y_Test),
)
for message, features, targets in evaluation_splits:
  predict_y = sig_clf.predict_proba(features)
  print('For values of best alpha = ', alpha[best_alpha], message, log_loss(targets, predict_y, labels=clf.classes_, eps=1e-15))

## BUILDING MACHINE LEARNING MODELS

In [None]:
def predict_and_plot_confusionmatrix(X_train,Y_train,X_test,Y_test,clf):
  """Fit ``clf``, calibrate it, and report its quality on ``X_test``.

  Args:
    X_train, Y_train: training features and labels used for both fits.
    X_test, Y_test: evaluation features and labels (Y_test must be an array
      that supports element-wise comparison and ``.shape``).
    clf: an unfitted scikit-learn classifier; it is fitted in place.

  Prints the log loss and the fraction of misclassified points, then draws
  the confusion / precision / recall heatmaps via plot_confusion_matrix.
  """
  clf.fit(X_train,Y_train)
  sig_clf=CalibratedClassifierCV(clf,method="sigmoid")
  sig_clf.fit(X_train,Y_train)
  pred_y=sig_clf.predict(X_test)
  pred_probs=sig_clf.predict_proba(X_test)

  # labels is passed so the loss is still defined when Y_test happens not to
  # contain every class the model was trained on.
  print("Log Loss : ",log_loss(Y_test,pred_probs,labels=sig_clf.classes_))
  # The value is misclassified / total, i.e. a fraction rather than a count.
  print("Fraction of Misclassified Points :",np.count_nonzero((pred_y - Y_test))/Y_test.shape[0])

  plot_confusion_matrix(Y_test,pred_y)

In [None]:
def report_log_loss(X_train,Y_train,X_test,Y_test,clf):
  """Fit and calibrate ``clf`` on the training data and return its test log loss.

  Args:
    X_train, Y_train: training features and labels.
    X_test, Y_test: evaluation features and labels.
    clf: an unfitted scikit-learn classifier; it is fitted in place.

  Returns:
    The log loss of the sigmoid-calibrated classifier on (X_test, Y_test).
  """
  # Use the function's own arguments: the previous body read the notebook
  # globals X_Train / X_Test / Y_Test and ignored what the caller passed in.
  clf.fit(X_train,Y_train)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_train,Y_train)
  sig_clf_probs=sig_clf.predict_proba(X_test)
  return log_loss(Y_test, sig_clf_probs, eps=1e-15)

## Get Feature Names 

In [None]:
def get_impfeature_names_tfidf(indices, text, gene, var, no_features):
    """Report which of the given feature indices are present in a query point.

    The stacked feature space is assumed to be [gene vocabulary | variation
    vocabulary | text vocabulary], each rebuilt here by fitting a
    TfidfVectorizer on the corresponding ``X_Train`` column.

    Args:
        indices: feature indices into the stacked feature space.
        text: text of the query point (matched word by word).
        gene: gene of the query point.
        var: variation of the query point.
        no_features: number of top features, used only in the summary line.

    Prints one line per index whose feature occurs in the query point and a
    final summary count.
    """
    # Local import: TfidfVectorizer is not imported anywhere in the visible
    # cells of this notebook.
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Build each vocabulary once; get_feature_names() was previously called
    # again for every index inside the loop.
    gene_features = TfidfVectorizer().fit(X_Train['Gene']).get_feature_names()
    var_features = TfidfVectorizer().fit(X_Train['Variation']).get_feature_names()
    text_features = TfidfVectorizer(min_df=3).fit(X_Train['TEXT']).get_feature_names()

    fea1_len = len(gene_features)
    fea2_len = len(var_features)

    # The vectorizers lower-case their vocabulary by default, so compare
    # against lower-cased gene / variation values.
    gene_key = str(gene).lower()
    var_key = str(var).lower()
    text_words = set(text.split())

    word_present = 0
    for i,v in enumerate(indices):
        if (v < fea1_len):
            word = gene_features[v]
            if word == gene_key:
                word_present += 1
                print(i, "Gene feature [{}] present in test data point [{}]".format(word,True))
        elif (v < fea1_len+fea2_len):
            word = var_features[v-(fea1_len)]
            if word == var_key:
                word_present += 1
                print(i, "variation feature [{}] present in test data point [{}]".format(word,True))
        else:
            word = text_features[v-(fea1_len+fea2_len)]
            if word in text_words:
                word_present += 1
                print(i, "Text feature [{}] present in test data point [{}]".format(word,True))

    print("Out of the top ",no_features," features ", word_present, "are present in query point")

## STACKING THE THREE TYPES OF FEATURES

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter
from scipy.sparse import hstack
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from collections import Counter, defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import math
from sklearn.metrics import normalized_mutual_info_score
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings("ignore")

from mlxtend.classifier import StackingClassifier

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import normalize
from collections import defaultdict
import math
from sklearn.metrics import log_loss
import seaborn as sns

In [None]:
# --- One-hot / bag-of-words features: [gene | variation | text] ---
X_Train_gene_var_onehotcoding = hstack((X_Train_gene_Feature_onehotEncoding,X_Train_variation_Feature_onehotEncoding))
X_Test_gene_var_onehotcoding =  hstack((X_Test_gene_Feature_onehotencoding,X_Test_variation_Feature_onehotencoding))
X_CV_gene_var_onehotcoding = hstack((X_CV_gene_Feature_onehotencoding,X_CV_variation_Feature_onehotencoding))

# All three stacked matrices are converted to CSR (previously only the train
# matrix was), so train / test / CV share one format that supports row indexing.
X_Train_onehotCoding = hstack((X_Train_gene_var_onehotcoding,X_Train_feature_onehotCoding)).tocsr()
print("Train One Hot Encoding  :", X_Train_onehotCoding.shape)
print("="*100)

X_Test_onehotcoding = hstack((X_Test_gene_var_onehotcoding,X_Test_text_feature_onehotCoding)).tocsr()
print("Test One Hot Encoding  :", X_Test_onehotcoding.shape)
print("="*100)

X_CV_onehotcoding = hstack((X_CV_gene_var_onehotcoding,X_CV_text_feature_onehotCoding)).tocsr()
print("CV One Hot Encoding  :", X_CV_onehotcoding.shape)
print("="*100)


# --- Response-coded features: [gene | variation | text], 9 columns each ---
X_Train_gene_var_responseCoding = np.hstack((X_Train_gene_Feature_responsecoding,X_Train_variation_Feature_responsecoding))
X_Test_gene_var_responseCoding = np.hstack((X_Test_gen_Feature_responsecoding,X_Test_variation_Feature_responsecoding))
X_CV_gene_var_responseCoding = np.hstack((X_CV_gene_Feature_responsecoding,X_CV_variation_Feature_responsecoding))

X_Train_responseCoding = np.hstack((X_Train_gene_var_responseCoding, X_Train_text_feature_responseCoding))
print("Train Response Coding  :", X_Train_responseCoding.shape)
print("="*100)

X_Test_responseCoding = np.hstack((X_Test_gene_var_responseCoding, X_Test_text_feature_responseCoding))
print("Test Response Coding  :", X_Test_responseCoding.shape)
print("="*100)
X_CV_responseCoding = np.hstack((X_CV_gene_var_responseCoding, X_CV_text_feature_responseCoding))
print("CV Response Coding  :", X_CV_responseCoding.shape)
print("="*100)

# BASE LINE MODEL

## NAIVE BAYES ALGORITHM

In [None]:
def Naive_Bayes_Algo(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test):
  """Tune the smoothing alpha of a calibrated Multinomial Naive Bayes model.

  Every candidate alpha is fitted on the training data and scored by log loss
  on the CV data; the errors are plotted, and the best model's train / CV /
  test log loss is printed.

  Args:
    X_Train, Y_Train: training feature matrix and labels.
    X_CV, Y_CV: cross-validation feature matrix and labels.
    X_Test, Y_Test: test feature matrix and labels.

  Returns:
    (alpha, best_alpha): the list of candidates and the index of the
    candidate with the lowest CV log loss.
  """
  alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]
  cv_log_error=[]

  # The feature matrices passed in as arguments are used throughout; the
  # previous body ignored them and read the notebook-level one-hot matrices.
  for i in alpha:
    print("for alpha =", i)
    clf = MultinomialNB(alpha=i)
    clf.fit(X_Train,Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV)
    cv_log_error.append(log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15))

  # The candidates span many orders of magnitude, so plot against log10(alpha).
  fig, ax = plt.subplots()
  ax.plot(np.log10(alpha), cv_log_error,c='g')
  for i , txt in enumerate(np.round(cv_log_error,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error[i]))
  plt.grid()
  plt.xticks(np.log10(alpha))
  plt.title("Cross Validation Error for each alpha")
  plt.xlabel("Alpha i's")
  plt.ylabel("Error measure")
  plt.show()

  best_alpha = np.argmin(cv_log_error)
  clf = MultinomialNB(alpha=alpha[best_alpha])
  clf.fit(X_Train, Y_Train)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_Train, Y_Train)

  predict_y = sig_clf.predict_proba(X_Train)
  print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(Y_Train, predict_y, labels=clf.classes_, eps=1e-15))

  predict_y = sig_clf.predict_proba(X_CV)
  print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(Y_CV, predict_y, labels=clf.classes_, eps=1e-15))

  predict_y = sig_clf.predict_proba(X_Test)
  print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(Y_Test, predict_y, labels=clf.classes_, eps=1e-15))

  return alpha,best_alpha

In [None]:
# Run the Naive Bayes alpha search on the stacked one-hot features; keeps the
# candidate list and the index of the best candidate for the next cell.
alpha,best_alpha = Naive_Bayes_Algo(X_Train_onehotCoding,Y_Train,X_CV_onehotcoding,Y_CV,X_Test_onehotcoding,Y_Test)

In [None]:
# Refit the calibrated Naive Bayes model with the selected alpha and evaluate it on CV.
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(X_Train_onehotCoding, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotCoding, Y_Train)
sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding)

# labels is passed (as in the search above) so the loss stays defined when
# Y_CV does not contain every training class.
print("Log Loss :",log_loss(Y_CV, sig_clf_probs, labels=clf.classes_))

# Predict once and reuse the result, so the misclassification rate and the
# confusion matrix are computed from the same predictions (the previous code
# predicted twice, once from the sparse matrix and once from a dense copy).
cv_predictions = sig_clf.predict(X_CV_onehotcoding)
print("Number of missclassified point :", np.count_nonzero((cv_predictions - Y_CV))/Y_CV.shape[0])
plot_confusion_matrix(Y_CV, cv_predictions)

## K-NEAREST NEIGHBORS ALGORITHM 

## LOGISTIC REGRESSION (WITH CLASS BALANCING)

In [None]:
def Logistic_Regression_Algo(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test):

  alpha = [10 ** x for x in range(-6, 3)]
  cv_log_error=[]

  for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(X_Train_onehotCoding,Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotCoding, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding)
    cv_log_error.append(log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    print("Log Loss :",log_loss(Y_CV, sig_clf_probs))

  fig, ax = plt.subplots()
  ax.plot(np.log10(alpha), cv_log_error,c='g')
  for i , txt in enumerate(np.round(cv_log_error,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error[i]))
  plt.grid()
  plt.xticks(np.log10(alpha))
  plt.title("Cross Validation Error for each alpha")
  plt.xlabel("Alpha i's")
  plt.ylabel("Error measure")
  plt.show()

  best_alpha = np.argmin(cv_log_error)
  clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
  clf.fit(X_Train_onehotCoding, Y_Train)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_Train_onehotCoding, Y_Train)

  predict_y = sig_clf.predict_proba(X_Train_onehotCoding)
  print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(Y_Train, predict_y, labels=clf.classes_, eps=1e-15))
  
  predict_y = sig_clf.predict_proba(X_CV_onehotcoding)
  print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(Y_CV, predict_y, labels=clf.classes_, eps=1e-15))
  
  predict_y = sig_clf.predict_proba(X_Test_onehotcoding)
  print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(Y_Test, predict_y, labels=clf.classes_, eps=1e-15))

  return alpha,best_alpha

In [None]:
alpha,best_alpha = Logistic_Regression_Algo(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test)

In [None]:
# Build the class-balanced model with the selected alpha; the helper fits and
# calibrates it on the training features and evaluates it on the CV features.
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
predict_and_plot_confusionmatrix(X_Train_onehotCoding, Y_Train, X_CV_onehotcoding, Y_CV, clf)

## LOGISTIC REGRESSION WITHOUT CLASS BALANCING 

In [None]:
def Logistic_Regression_Algo_WithoutClassBalancing(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test):
  """Tune alpha for an unweighted SGD logistic regression and report log losses.

  For every alpha in 10**-6 .. 10**2 a sigmoid-calibrated SGDClassifier
  (loss='log', L2 penalty) is fitted and scored by log loss on the CV split.
  The CV curve is plotted, the model is refitted with the best alpha, and the
  train / CV / test log losses are printed.

  NOTE(review): X_Train, X_CV and X_Test are accepted but never read; the
  features come from the module-level matrices X_Train_onehotCoding,
  X_CV_onehotcoding and X_Test_onehotcoding. Only the Y_* arguments are used.

  Returns:
    (alpha, best_alpha): the list of alphas tried and the index into that
    list of the alpha with the lowest CV log loss.
  """
  alpha = [10 ** x for x in range(-6, 3)]
  cv_log_error=[]

  for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
    # With its default cv, CalibratedClassifierCV fits clones of clf itself, so
    # a separate clf.fit() beforehand would only repeat the training work.
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotCoding, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=sig_clf.classes_, eps=1e-15)
    cv_log_error.append(cv_loss)
    print("Log Loss :", cv_loss)

  # Plot CV log loss against log10(alpha) because the grid is log-spaced.
  fig, ax = plt.subplots()
  ax.plot(np.log10(alpha), cv_log_error,c='g')
  for i , txt in enumerate(np.round(cv_log_error,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error[i]))
  plt.grid()
  plt.xticks(np.log10(alpha))
  plt.title("Cross Validation Error for each alpha")
  plt.xlabel("Alpha i's")
  plt.ylabel("Error measure")
  plt.show()

  # Refit with the alpha that minimised the CV log loss.
  best_alpha = np.argmin(cv_log_error)
  clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_Train_onehotCoding, Y_Train)

  for split_name, features, targets in (
      ("train", X_Train_onehotCoding, Y_Train),
      ("cross validation", X_CV_onehotcoding, Y_CV),
      ("test", X_Test_onehotcoding, Y_Test)):
    predict_y = sig_clf.predict_proba(features)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(targets, predict_y, labels=sig_clf.classes_, eps=1e-15))

  return alpha,best_alpha

In [None]:
# Run the unweighted logistic-regression search. This overwrites the global
# alpha / best_alpha left by the previous model's search.
alpha,best_alpha = Logistic_Regression_Algo_WithoutClassBalancing(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test)

In [None]:
# Rebuild the unweighted logistic-regression model with the selected alpha.
clf = SGDClassifier( alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
# Helper is defined earlier in the notebook; presumably it fits clf on the train
# split and plots the confusion matrix for the CV split -- TODO confirm.
predict_and_plot_confusionmatrix(X_Train_onehotCoding, Y_Train, X_CV_onehotcoding, Y_CV, clf)

## LINEAR SUPPORT VECTOR MACHINE

In [None]:
def LinearSVM_Algo(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test):
  """Tune alpha for a class-balanced linear SVM (SGD, hinge loss) and report log losses.

  For every alpha in 10**-6 .. 10**2 a sigmoid-calibrated SGDClassifier
  (loss='hinge', L2 penalty, class_weight='balanced') is fitted and scored by
  log loss on the CV split. The CV curve is plotted, the model is refitted
  with the best alpha, and the train / CV / test log losses are printed.

  NOTE(review): X_Train, X_CV and X_Test are accepted but never read; the
  features come from the module-level matrices X_Train_onehotCoding,
  X_CV_onehotcoding and X_Test_onehotcoding. Only the Y_* arguments are used.

  Returns:
    (alpha, best_alpha): the list of alphas tried and the index into that
    list of the alpha with the lowest CV log loss.
  """
  alpha = [10 ** x for x in range(-6, 3)]
  cv_log_error=[]

  for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='hinge', random_state=42)
    # With its default cv, CalibratedClassifierCV fits clones of clf itself, so
    # a separate clf.fit() beforehand would only repeat the training work.
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotCoding, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=sig_clf.classes_, eps=1e-15)
    cv_log_error.append(cv_loss)
    print("Log Loss :", cv_loss)

  # Plot CV log loss against log10(alpha) because the grid is log-spaced.
  fig, ax = plt.subplots()
  ax.plot(np.log10(alpha), cv_log_error,c='g')
  for i , txt in enumerate(np.round(cv_log_error,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error[i]))
  plt.grid()
  plt.xticks(np.log10(alpha))
  plt.title("Cross Validation Error for each alpha")
  plt.xlabel("Alpha i's")
  plt.ylabel("Error measure")
  plt.show()

  # Refit with the alpha that minimised the CV log loss.
  best_alpha = np.argmin(cv_log_error)
  clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_Train_onehotCoding, Y_Train)

  for split_name, features, targets in (
      ("train", X_Train_onehotCoding, Y_Train),
      ("cross validation", X_CV_onehotcoding, Y_CV),
      ("test", X_Test_onehotcoding, Y_Test)):
    predict_y = sig_clf.predict_proba(features)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(targets, predict_y, labels=sig_clf.classes_, eps=1e-15))

  return alpha,best_alpha

In [None]:
# Run the linear-SVM search. This overwrites the global alpha / best_alpha
# left by the previous model's search.
alpha,best_alpha = LinearSVM_Algo(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test)

## RANDOM FOREST CLASSIFIER

In [None]:
def Random_Forest_Algo(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test):
  """Grid-search n_estimators x max_depth for a calibrated random forest.

  Every (n_estimators, max_depth) pair is fitted inside a sigmoid
  CalibratedClassifierCV and scored by log loss on the CV split. The best
  pair is refitted and its train / CV / test log losses are printed.

  NOTE(review): X_Train, X_CV and X_Test are accepted but never read; the
  features come from the module-level matrices X_Train_onehotCoding,
  X_CV_onehotcoding and X_Test_onehotcoding. Only the Y_* arguments are used.

  Returns:
    (alpha, best_alpha, max_depth): the n_estimators grid, the flat index of
    the best combination, and the max_depth grid. The flat index is
    row-major: best_alpha == n_estimators_index * len(max_depth) + depth_index.
  """
  alpha = [100,200,500,1000,2000]
  max_depth = [5, 10]
  cv_log_error = []
  for i in alpha:
    for j in max_depth:
      print("for n_estimators =", i,"and max depth = ", j)
      clf = RandomForestClassifier(n_estimators=i, criterion='gini', max_depth=j, random_state=42, n_jobs=-1)
      # With its default cv, CalibratedClassifierCV fits clones of clf itself,
      # so a separate clf.fit() beforehand would only train the forest twice.
      sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
      sig_clf.fit(X_Train_onehotCoding, Y_Train)
      sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding)
      # Score once, with explicit labels, so the stored and the printed value
      # are the same number and a class missing from Y_CV cannot raise.
      cv_loss = log_loss(Y_CV, sig_clf_probs, labels=sig_clf.classes_, eps=1e-15)
      cv_log_error.append(cv_loss)
      print("Log Loss :", cv_loss)

  best_alpha = np.argmin(cv_log_error)
  # Decode the flat index using the real grid width instead of a literal 2, so
  # the lookup stays correct if max_depth gains or loses entries.
  estimators_idx, depth_idx = divmod(int(best_alpha), len(max_depth))
  best_estimators = alpha[estimators_idx]
  clf = RandomForestClassifier(n_estimators=best_estimators, criterion='gini', max_depth=max_depth[depth_idx], random_state=42, n_jobs=-1)
  sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
  sig_clf.fit(X_Train_onehotCoding, Y_Train)

  for split_name, features, targets in (
      ("train", X_Train_onehotCoding, Y_Train),
      ("cross validation", X_CV_onehotcoding, Y_CV),
      ("test", X_Test_onehotcoding, Y_Test)):
    predict_y = sig_clf.predict_proba(features)
    print('For values of best estimator = ', best_estimators, "The " + split_name + " log loss is:",log_loss(targets, predict_y, labels=sig_clf.classes_, eps=1e-15))
  return alpha ,best_alpha,max_depth

In [None]:
# Run the random-forest grid search. best_alpha is a flat index over the
# (n_estimators, max_depth) grid; it is decoded in the next cell.
alpha,best_alpha,max_depth = Random_Forest_Algo(X_Train,Y_Train,X_CV,Y_CV,X_Test,Y_Test)

In [None]:
# Decode the flat grid index: integer part of best_alpha/2 selects n_estimators,
# best_alpha%2 selects max_depth (this relies on max_depth having 2 entries).
clf = RandomForestClassifier(n_estimators=alpha[int(best_alpha/2)], criterion='gini', max_depth=max_depth[int(best_alpha%2)], random_state=42, n_jobs=-1)
# Helper is defined earlier in the notebook; presumably it fits clf on the train
# split and plots the confusion matrix for the CV split -- TODO confirm.
predict_and_plot_confusionmatrix(X_Train_onehotCoding, Y_Train, X_CV_onehotcoding, Y_CV, clf)

# TF-IDF FEATURIZATION TECHNIQUE 

## TF-IDF on Gene Features 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF encoding of the Gene column. The variable names keep the
# "onehotencoding" wording, but the matrices hold TF-IDF weights.
gene_vectorizer=TfidfVectorizer()

# The vocabulary and IDF weights are learned from the training split only.
X_Train_gene_Feature_onehotencoding_tfidf=gene_vectorizer.fit_transform(X_Train["Gene"])
print(" Train Gene Feature :" ,X_Train_gene_Feature_onehotencoding_tfidf.shape)

print("="*100)

# Test and CV are only transformed with the vocabulary fitted on train.
X_Test_gene_Feature_onehotencoding_tfidf=gene_vectorizer.transform(X_Test["Gene"])
print(" Test Gene Feature :" ,X_Test_gene_Feature_onehotencoding_tfidf.shape)

print("="*100)


X_CV_gene_Feature_onehotencoding_tfidf=gene_vectorizer.transform(X_CV["Gene"])
print(" CV Gene Feature :" ,X_CV_gene_Feature_onehotencoding_tfidf.shape)

print("="*100)

## TF-IDF on Variation Features

In [None]:
# TF-IDF encoding of the Variation column, fitted on the training split only.
variation_vectorizer=TfidfVectorizer()

X_Train_variation_Feature_onehotencoding_tfidf=variation_vectorizer.fit_transform(X_Train["Variation"])
print(" Train Variation Feature :" ,X_Train_variation_Feature_onehotencoding_tfidf.shape)

print("="*100)

# Test and CV are only transformed with the vocabulary fitted on train.
X_Test_variation_Feature_onehotencoding_tfidf=variation_vectorizer.transform(X_Test["Variation"])
print(" Test Variation Feature :" ,X_Test_variation_Feature_onehotencoding_tfidf.shape)

print("="*100)


X_CV_variation_Feature_onehotencoding_tfidf=variation_vectorizer.transform(X_CV["Variation"])
print(" CV Variation Feature :" ,X_CV_variation_Feature_onehotencoding_tfidf.shape)

print("="*100)

## TF-IDF on Text Features 

In [None]:
# TF-IDF over 1- to 4-grams of the clinical text: terms must occur in at least
# 5 training documents, and the vocabulary is capped at 3000 features.
text_vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1,4),max_features=3000)
X_Train_feature_onehotencoding_tfidf = text_vectorizer.fit_transform(X_Train['TEXT'])

In [None]:
from sklearn.preprocessing import normalize

# axis=0 rescales every column (feature) to unit norm.
# NOTE(review): train, test and CV are each normalised with their own column
# norms, so the same term ends up on a different scale in each split --
# confirm this is intended.
X_Train_feature_onehotencoding_tfidf = normalize(X_Train_feature_onehotencoding_tfidf, axis=0)
print("Train Text Feature :", X_Train_feature_onehotencoding_tfidf.shape)

print("="*100)

# Test text is transformed with the vocabulary fitted on the training text.
X_Test_text_feature_onehotencoding_tfidf = text_vectorizer.transform(X_Test['TEXT'])
print("Test Text Feature :", X_Test_text_feature_onehotencoding_tfidf.shape)

print("="*100)

X_Test_text_feature_onehotencoding_tfidf = normalize(X_Test_text_feature_onehotencoding_tfidf, axis=0)

# Same treatment for the CV text.
X_CV_text_feature_onehotencoding_tfidf = text_vectorizer.transform(X_CV['TEXT'])
print("CV Text Feature :", X_CV_text_feature_onehotencoding_tfidf.shape)

print("="*100)

X_CV_text_feature_onehotencoding_tfidf = normalize(X_CV_text_feature_onehotencoding_tfidf, axis=0)

## STACKING THE THREE TYPES OF FEATURES 

In [None]:
# Concatenate the feature blocks column-wise, per split: gene | variation first.
X_Train_gene_var_onehotencoding_tfidf = hstack((X_Train_gene_Feature_onehotencoding_tfidf,X_Train_variation_Feature_onehotencoding_tfidf))
X_Test_gene_var_onehotencoding_tfidf =  hstack((X_Test_gene_Feature_onehotencoding_tfidf,X_Test_variation_Feature_onehotencoding_tfidf))
X_CV_gene_var_onehotencoding_tfidf = hstack((X_CV_gene_Feature_onehotencoding_tfidf,X_CV_variation_Feature_onehotencoding_tfidf))
  
# Then append the text block (final order: gene | variation | text) and convert
# to CSR; later cells index single rows of these matrices.
X_Train_onehotcoding_tfidf = hstack((X_Train_gene_var_onehotencoding_tfidf,X_Train_feature_onehotencoding_tfidf)).tocsr()
print("Train One Hot Encoding  :", X_Train_onehotcoding_tfidf.shape)
print("="*100)

X_Test_onehotcoding_tfidf = hstack((X_Test_gene_var_onehotencoding_tfidf,X_Test_text_feature_onehotencoding_tfidf)).tocsr()
print("Test One Hot Encoding  :", X_Test_onehotcoding_tfidf.shape)
print("="*100)

X_CV_onehotcoding_tfidf = hstack((X_CV_gene_var_onehotencoding_tfidf,X_CV_text_feature_onehotencoding_tfidf)).tocsr()
print("CV One Hot Encoding  :", X_CV_onehotcoding_tfidf.shape)
print("="*100)

## NAIVE BAYES ALGORITHM

In [None]:
# Smoothing strengths to try for Multinomial Naive Bayes on the TF-IDF features.
alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = MultinomialNB(alpha=i)
    clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_tfidf)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# Plot CV log loss against log10(alpha) because the grid spans many decades.
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the alpha that minimised the CV log loss and report every split.
best_alpha = np.argmin(cv_log_error_array)
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)


for split_name, X_split, Y_split in (
        ("train", X_Train_onehotcoding_tfidf, Y_Train),
        ("cross validation", X_CV_onehotcoding_tfidf, Y_CV),
        ("test", X_Test_onehotcoding_tfidf, Y_Test)):
    predict_y = sig_clf.predict_proba(X_split)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(Y_split, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
# Refit Naive Bayes with the selected alpha and inspect its CV predictions.
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_tfidf)

# Explicit labels keep this consistent with the search cell and avoid an error
# when a class is absent from the CV split.
print("Log Loss :",log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15))

# Predict once on the sparse matrix and reuse the result for both the error
# rate and the confusion matrix (no second predict on a densified copy).
cv_predictions = sig_clf.predict(X_CV_onehotcoding_tfidf)
# A non-zero difference marks a misclassified row; dividing by the number of CV
# rows makes the printed value a fraction, not a count.
print("Number of missclassified point :", np.count_nonzero((cv_predictions- Y_CV))/Y_CV.shape[0])
plot_confusion_matrix(Y_CV, cv_predictions)

## K-NEAREST NEIGHBORS ALGORITHM

In [None]:
# Candidate neighbourhood sizes (k) for k-NN on the TF-IDF features.
alpha = [5, 11, 15, 21, 31, 41, 51, 99]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_tfidf)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# k is on a linear scale, so the curve is plotted against k directly.
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (alpha[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the k that minimised the CV log loss and report every split.
best_alpha = np.argmin(cv_log_error_array)
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)

for split_name, X_split, Y_split in (
        ("train", X_Train_onehotcoding_tfidf, Y_Train),
        ("cross validation", X_CV_onehotcoding_tfidf, Y_CV),
        ("test", X_Test_onehotcoding_tfidf, Y_Test)):
    predict_y = sig_clf.predict_proba(X_split)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(Y_split, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
# Rebuild k-NN with the selected number of neighbours.
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
# Helper is defined earlier in the notebook; presumably it fits clf on the train
# split and plots the confusion matrix for the CV split -- TODO confirm.
predict_and_plot_confusionmatrix(X_Train_onehotcoding_tfidf, Y_Train, X_CV_onehotcoding_tfidf, Y_CV, clf)

In [None]:
# Explain one test prediction by listing the classes of its nearest neighbours.
# The raw k-NN model is fitted as well because kneighbors() is queried on it.
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)

test_point_index = 1
# Use the same row for the prediction, the actual class and the neighbour
# lookup. Previously the prediction was made on row 0 while the other two used
# test_point_index, so the printed values described different points.
test_point = X_Test_onehotcoding_tfidf[test_point_index].reshape(1, -1)
predicted_cls = sig_clf.predict(test_point)
print("Predicted Class :", predicted_cls[0])
print("Actual Class :", Y_Test[test_point_index])
# kneighbors returns (distances, indices); [1][0] is the index row for our point.
neighbors = clf.kneighbors(test_point, alpha[best_alpha])
print("The ",alpha[best_alpha]," nearest neighbours of the test points belongs to classes",Y_Train[neighbors[1][0]])
print("Fequency of nearest points :",Counter(Y_Train[neighbors[1][0]]))

## LOGISTIC REGRESSION WITH CLASS BALANCING 

In [None]:
# Regularisation strengths for class-balanced logistic regression (SGD, log loss).
alpha = [10 ** x for x in range(-6, 3)]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_tfidf)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# The grid is log-spaced, so plot against log10(alpha); on a linear axis all
# small alphas collapse onto x ~ 0 and their annotations overlap.
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the alpha that minimised the CV log loss and report every split.
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)

for split_name, X_split, Y_split in (
        ("train", X_Train_onehotcoding_tfidf, Y_Train),
        ("cross validation", X_CV_onehotcoding_tfidf, Y_CV),
        ("test", X_Test_onehotcoding_tfidf, Y_Test)):
    predict_y = sig_clf.predict_proba(X_split)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(Y_split, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
# Rebuild the class-balanced logistic-regression model with the selected alpha.
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
# Helper is defined earlier in the notebook; presumably it fits clf on the train
# split and plots the confusion matrix for the CV split -- TODO confirm.
predict_and_plot_confusionmatrix(X_Train_onehotcoding_tfidf, Y_Train, X_CV_onehotcoding_tfidf, Y_CV, clf)

## LOGISTIC REGRESSION WITHOUT CLASS BALANCING 

In [None]:
# Regularisation strengths for unweighted logistic regression (SGD, log loss).
alpha = [10 ** x for x in range(-6, 3)]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_tfidf)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# The grid is log-spaced, so plot against log10(alpha); on a linear axis all
# small alphas collapse onto x ~ 0 and their annotations overlap.
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the alpha that minimised the CV log loss and report every split.
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)

for split_name, X_split, Y_split in (
        ("train", X_Train_onehotcoding_tfidf, Y_Train),
        ("cross validation", X_CV_onehotcoding_tfidf, Y_CV),
        ("test", X_Test_onehotcoding_tfidf, Y_Test)):
    predict_y = sig_clf.predict_proba(X_split)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(Y_split, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
# Rebuild the unweighted logistic-regression model with the selected alpha.
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
# Helper is defined earlier in the notebook; presumably it fits clf on the train
# split and plots the confusion matrix for the CV split -- TODO confirm.
predict_and_plot_confusionmatrix(X_Train_onehotcoding_tfidf, Y_Train, X_CV_onehotcoding_tfidf, Y_CV, clf)

## LINEAR SUPPORT VECTOR MACHINE 

In [None]:
# Regularisation strengths for the class-balanced linear SVM (SGD, hinge loss).
alpha = [10 ** x for x in range(-6, 3)]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced',alpha=i, penalty='l2', loss='hinge', random_state=42)
    clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_tfidf)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# The grid is log-spaced, so plot against log10(alpha); on a linear axis all
# small alphas collapse onto x ~ 0 and their annotations overlap.
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the alpha that minimised the CV log loss and report every split.
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(class_weight='balanced',alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42)
clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)

for split_name, X_split, Y_split in (
        ("train", X_Train_onehotcoding_tfidf, Y_Train),
        ("cross validation", X_CV_onehotcoding_tfidf, Y_CV),
        ("test", X_Test_onehotcoding_tfidf, Y_Test)):
    predict_y = sig_clf.predict_proba(X_split)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(Y_split, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
# Rebuild the class-balanced linear SVM with the selected alpha.
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42,class_weight='balanced')
# Helper is defined earlier in the notebook; presumably it fits clf on the train
# split and plots the confusion matrix for the CV split -- TODO confirm.
predict_and_plot_confusionmatrix(X_Train_onehotcoding_tfidf, Y_Train,X_CV_onehotcoding_tfidf,Y_CV, clf)

## RANDOM FOREST CLASSIFIER 

In [None]:
# Grid for the random forest on TF-IDF features: alpha holds the n_estimators
# candidates, max_depth the tree-depth candidates.
alpha = [100,200,500,1000,2000]
max_depth = [5, 10]
cv_log_error_array = []
for i in alpha:
    for j in max_depth:
        print("for n_estimators =", i,"and max depth = ", j)
        clf = RandomForestClassifier(n_estimators=i, criterion='gini', max_depth=j, random_state=42, n_jobs=-1)
        clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
        sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
        sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
        sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_tfidf)
        # Score once, with explicit labels, so the stored and the printed value
        # are the same number and a class missing from Y_CV cannot raise.
        cv_loss = log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15)
        cv_log_error_array.append(cv_loss)
        print("Log Loss :", cv_loss)

# No CV-error plot is drawn for this two-dimensional grid.

best_alpha = np.argmin(cv_log_error_array)
# cv_log_error_array was filled row-major (n_estimators outer, max_depth
# inner); decode the flat index with the real grid width instead of a literal 2.
n_estimators_idx, max_depth_idx = divmod(int(best_alpha), len(max_depth))
clf = RandomForestClassifier(n_estimators=alpha[n_estimators_idx], criterion='gini', max_depth=max_depth[max_depth_idx], random_state=42, n_jobs=-1)
clf.fit(X_Train_onehotcoding_tfidf, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_tfidf, Y_Train)

for split_name, X_split, Y_split in (
        ("train", X_Train_onehotcoding_tfidf, Y_Train),
        ("cross validation", X_CV_onehotcoding_tfidf, Y_CV),
        ("test", X_Test_onehotcoding_tfidf, Y_Test)):
    predict_y = sig_clf.predict_proba(X_split)
    print('For values of best estimator = ', alpha[n_estimators_idx], "The " + split_name + " log loss is:",log_loss(Y_split, predict_y, labels=clf.classes_, eps=1e-15))

## STACK THE MODELS 

In [None]:
# Base model 1 for the stack: class-balanced logistic regression.
# alpha is hard-coded here (not read from best_alpha) and random_state is 0,
# unlike the 42 used in the search cells.
clf1 = SGDClassifier(alpha=0.001, penalty='l2', loss='log', class_weight='balanced', random_state=0)
clf1.fit(X_Train_onehotcoding_tfidf, Y_Train)
# The calibrated wrapper is only constructed here; it is fitted in a later cell.
sig_clf1 = CalibratedClassifierCV(clf1, method="sigmoid")

In [None]:
# Base model 2 for the stack: class-balanced linear SVM with a hard-coded alpha.
clf2 = SGDClassifier(alpha=1, penalty='l2', loss='hinge', class_weight='balanced', random_state=0)
clf2.fit(X_Train_onehotcoding_tfidf, Y_Train)
# The calibrated wrapper is only constructed here; it is fitted in a later cell.
sig_clf2 = CalibratedClassifierCV(clf2, method="sigmoid")

In [None]:
# Base model 3 for the stack: Multinomial Naive Bayes with a hard-coded alpha.
clf3 = MultinomialNB(alpha=0.001)
clf3.fit(X_Train_onehotcoding_tfidf, Y_Train)
# The calibrated wrapper is only constructed here; it is fitted in a later cell.
sig_clf3 = CalibratedClassifierCV(clf3, method="sigmoid")

In [None]:
# Fit the calibrated logistic-regression and SVM base models and print their
# individual CV log loss as a baseline for the stacked model.
# NOTE(review): log_loss is called without labels= here, unlike the search cells.
sig_clf1.fit(X_Train_onehotcoding_tfidf, Y_Train)
print("Logistic Regression :  Log Loss: %0.2f" % (log_loss(Y_CV, sig_clf1.predict_proba(X_CV_onehotcoding_tfidf))))
sig_clf2.fit(X_Train_onehotcoding_tfidf, Y_Train)
print("Support vector machines : Log Loss: %0.2f" % (log_loss(Y_CV, sig_clf2.predict_proba(X_CV_onehotcoding_tfidf))))

In [None]:
# Fit the calibrated Naive Bayes base model and print its CV log loss.
sig_clf3.fit(X_Train_onehotcoding_tfidf, Y_Train)
print("Naive Bayes : Log Loss: %0.2f" % (log_loss(Y_CV, sig_clf3.predict_proba(X_CV_onehotcoding_tfidf))))
print("-"*50)

In [None]:
# Search the meta-classifier's C for the stacked model. The values in `alpha`
# are passed to LogisticRegression as C.
alpha = [0.0001,0.001,0.01,0.1,1,10]
# NOTE: despite its name, best_alpha holds the lowest CV log loss seen so far
# (kept under this name for compatibility); best_C records which C produced it.
best_alpha = 999
best_C = None
for i in alpha:
    lr = LogisticRegression(C=i)
    sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3], meta_classifier=lr, use_probas=True)
    sclf.fit(X_Train_onehotcoding_tfidf, Y_Train)
    # Compute the CV probabilities and their log loss once and reuse the value
    # for both the report and the comparison.
    log_error = log_loss(Y_CV, sclf.predict_proba(X_CV_onehotcoding_tfidf))
    print("Stacking Classifer : for the value of alpha: %f Log Loss: %0.3f" % (i, log_error))
    if best_alpha > log_error:
        best_alpha = log_error
        best_C = i

In [None]:
# Final stacked model: the three calibrated base models feed their class
# probabilities (use_probas=True) into a logistic-regression meta-classifier.
# NOTE(review): C=0.1 is hard-coded rather than taken from the search in the
# previous cell -- confirm it matches the best value found there.
lr = LogisticRegression(C=0.1)
sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3], meta_classifier=lr, use_probas=True)
sclf.fit(X_Train_onehotcoding_tfidf, Y_Train)

log_error = log_loss(Y_Train, sclf.predict_proba(X_Train_onehotcoding_tfidf))
print("Log loss (train) on the stacking classifier :",log_error)

log_error = log_loss(Y_CV, sclf.predict_proba(X_CV_onehotcoding_tfidf))
print("Log loss (CV) on the stacking classifier :",log_error)

log_error = log_loss(Y_Test, sclf.predict_proba(X_Test_onehotcoding_tfidf))
print("Log loss (test) on the stacking classifier :",log_error)

# A non-zero (prediction - label) marks a misclassified row; dividing by the
# number of test rows makes the printed value a fraction, not a count.
# Unlike the other models, the confusion matrix here is drawn on the TEST split.
print("Number of missclassified point :", np.count_nonzero((sclf.predict(X_Test_onehotcoding_tfidf)- Y_Test))/Y_Test.shape[0])
plot_confusion_matrix(test_y=Y_Test, predict_y=sclf.predict(X_Test_onehotcoding_tfidf))

# LOGISTIC REGRESSION WITH COUNTVECTORIZER USING BIGRAM 

## CountVectorizer on Gene Features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram + bigram counts of the Gene column. The variable names keep the
# "onehotencoding" wording, but the matrices hold n-gram counts.
count_vectorizer_bigram=CountVectorizer(ngram_range=(1,2))

# The vocabulary is learned from the training split only.
X_Train_gene_Feature_onehotencoding_bigram=count_vectorizer_bigram.fit_transform(X_Train["Gene"])
print(" Train Gene Feature :" ,X_Train_gene_Feature_onehotencoding_bigram.shape)

print("="*100)

# Test and CV are only transformed with the vocabulary fitted on train.
X_Test_gene_Feature_onehotencoding_bigram=count_vectorizer_bigram.transform(X_Test["Gene"])
print(" Test Gene Feature :" ,X_Test_gene_Feature_onehotencoding_bigram.shape)

print("="*100)


X_CV_gene_Feature_onehotencoding_bigram=count_vectorizer_bigram.transform(X_CV["Gene"])
print(" CV Gene Feature :" ,X_CV_gene_Feature_onehotencoding_bigram.shape)

print("="*100)

## CountVectorizer on Variation Features

In [None]:
# Unigram + bigram counts of the Variation column, fitted on the training split only.
variation_vectorizer_bigram=CountVectorizer(ngram_range=(1,2))

X_Train_variation_Feature_onehotencoding_bigram=variation_vectorizer_bigram.fit_transform(X_Train["Variation"])
print(" Train Variation Feature :" ,X_Train_variation_Feature_onehotencoding_bigram.shape)

print("="*100)

# Test and CV are only transformed with the vocabulary fitted on train.
X_Test_variation_Feature_onehotencoding_bigram=variation_vectorizer_bigram.transform(X_Test["Variation"])
print(" Test Variation Feature :" ,X_Test_variation_Feature_onehotencoding_bigram.shape)

print("="*100)


X_CV_variation_Feature_onehotencoding_bigram=variation_vectorizer_bigram.transform(X_CV["Variation"])
print(" CV Variation Feature :" ,X_CV_variation_Feature_onehotencoding_bigram.shape)

print("="*100)

## CountVectorizer on Text Features

In [None]:
from sklearn.preprocessing import normalize

# Unigram + bigram counts of the clinical text; terms must occur in at least
# 3 training documents. No max_features cap is set here.
text_vectorizer_bigram = CountVectorizer(min_df=3,ngram_range=(1,2))
X_Train_feature_onehotencoding_bigram = text_vectorizer_bigram.fit_transform(X_Train['TEXT'])


# axis=0 rescales every column (feature) to unit norm.
# NOTE(review): train, test and CV are each normalised with their own column
# norms, so the same term ends up on a different scale in each split --
# confirm this is intended.
X_Train_feature_onehotencoding_bigram = normalize(X_Train_feature_onehotencoding_bigram, axis=0)
print("Train Text Feature :", X_Train_feature_onehotencoding_bigram.shape)

print("="*100)

# Test text is transformed with the vocabulary fitted on the training text.
X_Test_text_feature_onehotencoding_bigram = text_vectorizer_bigram.transform(X_Test['TEXT'])
print("Test Text Feature :", X_Test_text_feature_onehotencoding_bigram.shape)

print("="*100)

X_Test_text_feature_onehotencoding_bigram = normalize(X_Test_text_feature_onehotencoding_bigram, axis=0)

# Same treatment for the CV text.
X_CV_text_feature_onehotencoding_bigram = text_vectorizer_bigram.transform(X_CV['TEXT'])
print("CV Text Feature :", X_CV_text_feature_onehotencoding_bigram.shape)

print("="*100)

X_CV_text_feature_onehotencoding_bigram = normalize(X_CV_text_feature_onehotencoding_bigram, axis=0)

## STACKING THE 3 TYPES OF FEATURES

In [None]:
# Concatenate the feature blocks column-wise, per split: gene | variation first.
X_Train_gene_var_onehotencoding_bigram = hstack((X_Train_gene_Feature_onehotencoding_bigram,X_Train_variation_Feature_onehotencoding_bigram))
X_Test_gene_var_onehotencoding_bigram =  hstack((X_Test_gene_Feature_onehotencoding_bigram,X_Test_variation_Feature_onehotencoding_bigram))
X_CV_gene_var_onehotencoding_bigram = hstack((X_CV_gene_Feature_onehotencoding_bigram,X_CV_variation_Feature_onehotencoding_bigram))
  
# Then append the text block (final order: gene | variation | text) and convert to CSR.
X_Train_onehotcoding_bigram = hstack((X_Train_gene_var_onehotencoding_bigram,X_Train_feature_onehotencoding_bigram)).tocsr()
print("Train One Hot Encoding  :", X_Train_onehotcoding_bigram.shape)
print("="*100)

X_Test_onehotcoding_bigram = hstack((X_Test_gene_var_onehotencoding_bigram,X_Test_text_feature_onehotencoding_bigram)).tocsr()
print("Test One Hot Encoding  :", X_Test_onehotcoding_bigram.shape)
print("="*100)

X_CV_onehotcoding_bigram = hstack((X_CV_gene_var_onehotencoding_bigram,X_CV_text_feature_onehotencoding_bigram)).tocsr()
print("CV One Hot Encoding  :", X_CV_onehotcoding_bigram.shape)
print("="*100)

## LOGISTIC REGRESSION WITH CLASS BALANCING

In [None]:
# Regularisation strengths for class-balanced logistic regression on the
# bigram count features.
alpha = [10 ** x for x in range(-6, 3)]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(X_Train_onehotcoding_bigram, Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotcoding_bigram, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_bigram)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# The grid is log-spaced, so plot against log10(alpha); on a linear axis all
# small alphas collapse onto x ~ 0 and their annotations overlap.
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the alpha that minimised the CV log loss and report every split.
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(X_Train_onehotcoding_bigram, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_bigram, Y_Train)

for split_name, X_split, Y_split in (
        ("train", X_Train_onehotcoding_bigram, Y_Train),
        ("cross validation", X_CV_onehotcoding_bigram, Y_CV),
        ("test", X_Test_onehotcoding_bigram, Y_Test)):
    predict_y = sig_clf.predict_proba(X_split)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(Y_split, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
# Rebuild the class-balanced logistic-regression model with the selected alpha.
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
# Helper is defined earlier in the notebook; presumably it fits clf on the train
# split and plots the confusion matrix for the CV split -- TODO confirm.
predict_and_plot_confusionmatrix(X_Train_onehotcoding_bigram, Y_Train, X_CV_onehotcoding_bigram, Y_CV, clf)

## LOGISTIC REGRESSION WITHOUT CLASS BALANCING

In [None]:
# Regularisation strengths for unweighted logistic regression on the bigram
# count features.
alpha = [10 ** x for x in range(-6, 3)]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(X_Train_onehotcoding_bigram, Y_Train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_Train_onehotcoding_bigram, Y_Train)
    sig_clf_probs = sig_clf.predict_proba(X_CV_onehotcoding_bigram)
    # Score once, with explicit labels, so the stored and the printed value are
    # the same number and a class missing from Y_CV cannot raise.
    cv_loss = log_loss(Y_CV, sig_clf_probs, labels=clf.classes_, eps=1e-15)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# The grid is log-spaced, so plot against log10(alpha); on a linear axis all
# small alphas collapse onto x ~ 0 and their annotations overlap.
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the alpha that minimised the CV log loss and report every split.
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(X_Train_onehotcoding_bigram, Y_Train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_Train_onehotcoding_bigram, Y_Train)

for split_name, X_split, Y_split in (
        ("train", X_Train_onehotcoding_bigram, Y_Train),
        ("cross validation", X_CV_onehotcoding_bigram, Y_CV),
        ("test", X_Test_onehotcoding_bigram, Y_Test)):
    predict_y = sig_clf.predict_proba(X_split)
    print('For values of best alpha = ', alpha[best_alpha], "The " + split_name + " log loss is:",log_loss(Y_split, predict_y, labels=clf.classes_, eps=1e-15))

In [None]:
# Rebuild the unweighted logistic-regression model with the selected alpha.
clf = SGDClassifier( alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
# Helper is defined earlier in the notebook; presumably it fits clf on the train
# split and plots the confusion matrix for the CV split -- TODO confirm.
predict_and_plot_confusionmatrix(X_Train_onehotcoding_bigram, Y_Train, X_CV_onehotcoding_bigram, Y_CV, clf)

# CONCLUSION

In [None]:
pip install -U PTable

In [None]:
from prettytable import PrettyTable
# Summary tables. Every number below is typed in by hand, not read from the
# models above, so the tables must be updated manually after a re-run.
x= PrettyTable()
x.title = "TF-IDF Featurization"
x.field_names = ["Algorithm" , "Train loss", "CV(Log Loss)", "Test loss","%age of MissClassification"]
x.add_row(["Naive Bayes",0.81,1.24,1.21,"42.00%"])
x.add_row(["K-NN",0.97,1.26,1.28,"43.23%"])
x.add_row(["Logistic Regression(With Class Balancing)",0.36,1.05,1.05,"34.39%"])
x.add_row(["Logistic Regression(Without Class Balancing)",0.35,1.08,1.06,"35.33%"])
x.add_row(["Linear SVM",0.67,1.20,1.22,"40.03%"])
x.add_row(["Stacking Model",0.58,1.19,1.23,"39.84%"])
print(x)
y= PrettyTable()
y.title = "CountVectorizer BIGRAM"
y.field_names = ["Algorithm" , "Train loss", "CV(Log Loss)", "Test loss","%age of MissClassification"]
# These rows report the logistic-regression models trained on the bigram
# features; they were previously mislabelled "Linear Regression".
y.add_row(["Logistic Regression(With Class Balancing)",0.68,1.16,1.19,"35.33%"])
y.add_row(["Logistic Regression(Without Class Balancing)",0.63,1.19,1.19,"35.33%"])
print(y)

### BY LOOKING AT THE PRETTY TABLE, TF-IDF FEATURIZATION USING LOGISTIC REGRESSION (WITH CLASS BALANCING) WORKS BEST, HAVING 0.36 TRAINING LOSS AND 34.39% MISCLASSIFIED POINTS.
### TEST LOSS = 1.05