##    
# Prediction Of Survivability of Children undergoing HSCT - Experiment I
##   

###  
## Importing Required Libraries
###  

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

plt.style.use('dark_background')

import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

###  
## Reading CSV File (Dataset)
###  

In [None]:
# Load the dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists. Consider a path relative to the notebook.
df = pd.read_csv("C:/rishabh's space/IIIT BHAGALPUR M.TECH/Course Work Assignments and Various Files/FInal Year project/major project 1/dataset.csv")
# Preview the first rows of the loaded frame.
df.head()

## Info about dataset

### Size of Dataset

In [None]:
df.shape

### Info about features

In [None]:
df.info()

### Checking NULL values

In [None]:
df.isna().sum()

### Checking number of unique values for object type features

In [None]:
{column: len(df[column].unique()) for column in df.select_dtypes('object').columns}

### Listing of unique values for object type features

In [None]:
{column: list(df[column].unique()) for column in df.select_dtypes('object').columns}

## Pre-Processing

### Selecting features having object type as their datatype

In [None]:
columns=df.select_dtypes('object').columns

### Listing out the features having object datatype

In [None]:
columns

### Dropping the rows having '?' as their value (removing missing data)

In [None]:
# Keep only the rows in which none of the object-typed columns (`columns`)
# holds the '?' placeholder; the comparison is done for all columns at once.
df = df[(df[columns] != '?').all(axis=1)]

### Checking the new size of the dataframe

In [None]:
df.shape

### Checking number of unique values for object type features (checking removal of '?' as value)

In [None]:
{column: len(df[column].unique()) for column in df.select_dtypes('object').columns}

### Listing of unique values for object type features (checking removal of '?' as value)

In [None]:
{column: list(df[column].unique()) for column in df.select_dtypes('object').columns}

### Creating a custom function to convert datatype for features having numerical type values

In [None]:
def type_covertor(x):
    """Convert a raw cell value to a number.

    The '?' placeholder maps to the integer 0; every other value is handed
    to ``float``, which raises for input it cannot parse.
    """
    return 0 if x == '?' else float(x)
   

### Converting values from object to numerical type

In [None]:
df['recipient_body_mass'] = df['recipient_body_mass'].apply(type_covertor)

### Checking the change of datatype for the feature

In [None]:
df.info()

### Converting values from object to numerical type

In [None]:
df['CD3_x1e8_per_kg'] = df['CD3_x1e8_per_kg'].apply(type_covertor)

### Checking the change of datatype for the feature

In [None]:
df.info()

### Converting values from object to numerical type

In [None]:
df['CD3_to_CD34_ratio'] = df['CD3_to_CD34_ratio'].apply(type_covertor)

### Checking the change of datatype for the feature

In [None]:
df.info()

### Listing of unique values for object type features (checking for the remaining object type features)

In [None]:
{column: list(df[column].unique()) for column in df.select_dtypes('object').columns}

## Encoding

In [None]:
x= {column: len(df[column].unique()) == 2 for column in df.select_dtypes('object').columns}

### Selecting features having binary type values

In [None]:
# Names of the object columns that the mapping `x` flags as having exactly
# two distinct values.
binary = [name for name, is_two_valued in x.items() if is_two_valued]

In [None]:
binary

### Binary encoding

In [None]:
# Lookup table for the two-valued categorical labels: each pair maps one
# label to 0 and its counterpart to 1.
_BINARY_CODES = {
    'no': 0, 'yes': 1,
    'absent': 0, 'present': 1,
    'female': 0, 'male': 1,
    'minus': 0, 'plus': 1,
    'nonmalignant': 0, 'malignant': 1,
    'other': 0, 'female_to_male': 1,
    'mismatched': 0, 'matched': 1,
    'low': 0, 'high': 1,
    'peripheral_blood': 0, 'bone_marrow': 1,
}


def encode_binary(x):
    """Return the 0/1 code for a two-valued categorical label.

    Labels that are not in the lookup table yield ``None``.
    """
    return _BINARY_CODES.get(x)

In [None]:
for i in binary:
    df[i] = df[i].apply(encode_binary)

### Checking the change of datatype for the feature

In [None]:
df.info()

### Listing of unique values for object type features (checking for the remaining object type features)

In [None]:
{column: list(df[column].unique()) for column in df.select_dtypes('object').columns}

### Selecting features having nominal type values

In [None]:
nominal=['donor_ABO','recipient_ABO','disease','HLA_group_1']

### Selecting features having ordinal type values

In [None]:
ordinal=['recipient_age_int','HLA_match']

### Selecting features having numerical type values

In [None]:
numercical=['CMV_status','antigen','allel']

### Custom function for converting datatype to numerical type

In [None]:
def int_type_covertor(x):
    """Convert a raw cell value to an integer.

    The '?' placeholder maps to 0; every other value is handed to ``int``,
    which raises for input it cannot parse.
    """
    return 0 if x == '?' else int(x)
   

### Converting values from object to numerical type

In [None]:
for i in numercical:
    df[i] = df[i].apply(int_type_covertor)

### Checking the change of datatype for the feature

In [None]:
df.info()

### One-Hot Encoding for features having ordinal type values

In [None]:
# One-hot encode every column listed in `ordinal` in a single call; each
# dummy column is prefixed with the name of the column it came from.
df = pd.get_dummies(df, prefix=ordinal, columns=ordinal)

### Checking the new size of the dataframe (increase in the number of features due to One-Hot Encoding)

In [None]:
df.shape

### Checking the change of datatype for the feature

In [None]:
df.info()

### One-Hot Encoding for features having nominal type values

In [None]:
# One-hot encode every column listed in `nominal` in a single call; each
# dummy column is prefixed with the name of the column it came from.
df = pd.get_dummies(df, prefix=nominal, columns=nominal)

### Checking the new size of the dataframe (increase in the number of features due to One-Hot Encoding)

In [None]:
df.shape

### Checking the change of datatype for the feature

In [None]:
df.info()

###  
## Model Building
###  

In [None]:
x = df.drop(['survival_status'], axis = 1)
y = df['survival_status'].astype(int)

###  
### Splitting the dataset into train:80% and test:20%
###  

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0 , test_size = 0.2)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

###  
### Lists to hold the values to be printed
###  

In [None]:
accuracy_comp = []

In [None]:
precision_comp = []

In [None]:
recall_comp = []

In [None]:
f1score_comp = []

## Models/Algorithms

###  
## Logistic Regression
###  

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression classifier on the training split and
# predict the held-out split.
lr = LogisticRegression().fit(x_train, y_train)
predictions = lr.predict(x_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc, prec, rec, f1 = (
    accuracy_score(y_test, predictions),
    precision_score(y_test, predictions),
    recall_score(y_test, predictions),
    f1_score(y_test, predictions),
)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

###  
## Decision Tree Classifier
###  

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs give the same scores; the train/test split
# is already seeded with random_state=0, so the same seed is used here.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(x_train, y_train)
predictions = dt.predict(x_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

###  
## Random Forest Classifier
###  

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs give the same scores; the train/test split
# is already seeded with random_state=0, so the same seed is used here.
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)
predictions = rf.predict(x_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

###  
## XGBoost
###  

In [None]:
# Import the class directly so the name `xgb` is bound only to the fitted
# classifier (as before) instead of first aliasing the module and then
# overwriting that alias.
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(x_train, y_train)

# The metrics below read `predictions`; storing the XGBoost output there makes
# them score this model rather than whatever an earlier cell left behind.
predictions = xgb.predict(x_test)
pred = predictions  # kept in case another cell still reads `pred`

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

###  
## KNN
###  

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Single-nearest-neighbour classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=1).fit(x_train, y_train)
predictions = knn.predict(x_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc, prec, rec, f1 = (
    accuracy_score(y_test, predictions),
    precision_score(y_test, predictions),
    recall_score(y_test, predictions),
    f1_score(y_test, predictions),
)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

###  
## Naive Bayes - Bernoulli
###  

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings, fitted on the training split.
nbb = BernoulliNB().fit(x_train, y_train)
predictions = nbb.predict(x_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc, prec, rec, f1 = (
    accuracy_score(y_test, predictions),
    precision_score(y_test, predictions),
    recall_score(y_test, predictions),
    f1_score(y_test, predictions),
)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

## SVM
### 

In [None]:
from sklearn.svm import SVC
from sklearn import preprocessing

# Learn the standardisation statistics from the training split only and apply
# the same transform to the test split. Scaling each split independently puts
# train and test features on different scales.
svm_scaler = preprocessing.StandardScaler().fit(x_train)
X_train = svm_scaler.transform(x_train)
X_test = svm_scaler.transform(x_test)

# Linear-kernel support vector classifier on the standardised features.
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

## Perceptron
### 

In [None]:
from sklearn.linear_model import Perceptron

# Perceptron without an intercept term: exactly 10 passes over the data
# (tol=None disables early stopping) and no shuffling between passes.
clf = Perceptron(
    fit_intercept=False,
    max_iter=10,
    tol=None,
    shuffle=False,
)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc, prec, rec, f1 = (
    accuracy_score(y_test, predictions),
    precision_score(y_test, predictions),
    recall_score(y_test, predictions),
    f1_score(y_test, predictions),
)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

## Multi Layer Perceptron
### 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Standardise features using statistics learned from the training split only,
# then apply the same transform to the test split.
scaler = StandardScaler()
scaler.fit(x_train)
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

# Small multi-layer perceptron (hidden layers of 5 and 2 units) trained with
# the L-BFGS solver; random_state fixes the weight initialisation.
clf2 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf2.fit(X_train, y_train)
predictions = clf2.predict(X_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

## Naive Bayes Gaussian

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing

# Learn the standardisation statistics from the training split only and apply
# the same transform to the test split. Scaling each split independently puts
# train and test features on different scales.
gnb_scaler = preprocessing.StandardScaler().fit(x_train)
X_train = gnb_scaler.transform(x_train)
X_test = gnb_scaler.transform(x_test)

# Gaussian naive Bayes on the standardised features.
# NOTE: `nbb` is the same name the Bernoulli cell uses, so it is rebound here.
nbb = GaussianNB()
nbb.fit(X_train, y_train)
predictions = nbb.predict(X_test)

# Confusion matrix and per-class report for the test split.
print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
print('\nClassification Report:\n\n', classification_report(y_test, predictions))

# Headline metrics; accuracy is reported as a percentage rounded to 2 decimals.
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
acc_perc = np.round(acc * 100, 2)
print('\nAccuracy=', str(acc_perc) + '%')

# Record this model's scores for the comparison section.
accuracy_comp.append(acc_perc)
precision_comp.append(prec)
recall_comp.append(rec)
f1score_comp.append(f1)

###  
## RESULTS
###  

In [None]:
# Line plot of accuracy (in percent) per model. Labels are listed in the order
# in which the model cells appended their scores to `accuracy_comp`.
accuracy_plot_labels = [
    'Logistic \nRegression', 'Decision \nTree \nClassifier', 'Random \nForest \nClassifier',
    'XGBoost', 'KNN', 'Naive \nBayes-\nBernoulli', 'SVM', 'Perceptron', 'MLP',
    'Naive \nBayes-\nGaussian',
]
plt.figure(figsize=(10, 5))
plt.plot(accuracy_plot_labels, accuracy_comp, 'o-')
plt.xlabel("Models")
plt.ylabel("Accuracy %")
plt.title("Comparison Of Accuracy of the used ML models")

In [None]:
# Model names in the order in which the model cells appended their scores.
models = ['Logistic Regression', 'Decision Tree Classifier', 'Random Forest Classifier',
          'XGBoost', 'KNN', 'Naive Bayes-Bernoulli', 'SVM', 'Perceptron', 'MLP',
          'Naive Bayes-Gaussian']
print('Accuracies of models are:\n')
# Pair each name with its score instead of indexing with a hard-coded count.
for model_name, model_accuracy in zip(models, accuracy_comp):
    print(model_name, ':', str(model_accuracy) + '%')

In [None]:
# Line plot of precision per model. Labels are listed in the order in which
# the model cells appended their scores to `precision_comp`.
precision_plot_labels = [
    'Logistic \nRegression', 'Decision \nTree \nClassifier', 'Random \nForest \nClassifier',
    'XGBoost', 'KNN', 'Naive \nBayes-\nBernoulli', 'SVM', 'Perceptron', 'MLP',
    'Naive \nBayes-\nGaussian',
]
plt.figure(figsize=(10, 5))
plt.plot(precision_plot_labels, precision_comp, 'o-')
plt.xlabel("Models")
plt.ylabel("Precision")
plt.title("Comparison Of Precision of the used ML models")

In [None]:
# Model names in the order in which the model cells appended their scores.
models = ['Logistic Regression', 'Decision Tree Classifier', 'Random Forest Classifier',
          'XGBoost', 'KNN', 'Naive Bayes-Bernoulli', 'SVM', 'Perceptron', 'MLP',
          'Naive Bayes-Gaussian']
print('Precision of models are:\n')
# Pair each name with its score instead of indexing with a hard-coded count.
for model_name, model_precision in zip(models, precision_comp):
    print(model_name, ':', str(model_precision))

In [None]:
# Line plot of recall per model. Labels are listed in the order in which the
# model cells appended their scores to `recall_comp`.
recall_plot_labels = [
    'Logistic \nRegression', 'Decision \nTree \nClassifier', 'Random \nForest \nClassifier',
    'XGBoost', 'KNN', 'Naive \nBayes-\nBernoulli', 'SVM', 'Perceptron', 'MLP',
    'Naive \nBayes-\nGaussian',
]
plt.figure(figsize=(10, 5))
plt.plot(recall_plot_labels, recall_comp, 'o-')
plt.xlabel("Models")
plt.ylabel("Recall")
plt.title("Comparison Of Recall of the used ML models")

In [None]:
# Model names in the order in which the model cells appended their scores.
models = ['Logistic Regression', 'Decision Tree Classifier', 'Random Forest Classifier',
          'XGBoost', 'KNN', 'Naive Bayes-Bernoulli', 'SVM', 'Perceptron', 'MLP',
          'Naive Bayes-Gaussian']
print('Recall of models are:\n')
# Pair each name with its score instead of indexing with a hard-coded count.
for model_name, model_recall in zip(models, recall_comp):
    print(model_name, ':', str(model_recall))

In [None]:
# Line plot of F1-score per model. Labels are listed in the order in which the
# model cells appended their scores to `f1score_comp`.
f1_plot_labels = [
    'Logistic \nRegression', 'Decision \nTree \nClassifier', 'Random \nForest \nClassifier',
    'XGBoost', 'KNN', 'Naive \nBayes-\nBernoulli', 'SVM', 'Perceptron', 'MLP',
    'Naive \nBayes-\nGaussian',
]
plt.figure(figsize=(10, 5))
plt.plot(f1_plot_labels, f1score_comp, 'o-')
plt.xlabel("Models")
plt.ylabel("F1-Score")
plt.title("Comparison Of F1-Score of the used ML models")

In [None]:
# Model names in the order in which the model cells appended their scores.
models = ['Logistic Regression', 'Decision Tree Classifier', 'Random Forest Classifier',
          'XGBoost', 'KNN', 'Naive Bayes-Bernoulli', 'SVM', 'Perceptron', 'MLP',
          'Naive Bayes-Gaussian']
print('F1-Score of models are:\n')
# Pair each name with its score instead of indexing with a hard-coded count.
for model_name, model_f1 in zip(models, f1score_comp):
    print(model_name, ':', str(model_f1))

## 
# END OF PROJECT
## 