**Attribute information:**

1. **target**: DIE (1), LIVE (2)
2. **age**: 10, 20, 30, 40, 50, 60, 70, 80
3. **gender**: male (1), female (2)

           ------ no = 2,   yes = 1 ------

4. **steroid**: no, yes 
5. **antivirals**: no, yes 
6. **fatigue**: no, yes 
7. **malaise**: no, yes 
8. **anorexia**: no, yes 
9. **liverBig**: no, yes 
10. **liverFirm**: no, yes 
11. **spleen**: no, yes 
12. **spiders**: no, yes
13. **ascites**: no, yes 
14. **varices**: no, yes
15. **histology**: no, yes


16. **bili** (bilirubin): 0.39, 0.80, 1.20, 2.00, 3.00, 4.00 -- 
17. **alk**: 33, 80, 120, 160, 200, 250 ---
18. **sgot**: 13, 100, 200, 300, 400, 500, ---
19. **albu**: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0, --- 
20. **protime**: 10, 20, 30, 40, 50, 60, 70, 80, 90, --- 

        NA's are represented with "?"

## Dataset Reading and Pre-Processing steps

import required libraries

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [None]:
# Silence every warning for the rest of the session. This applies to all
# warning categories, including any emitted by pandas or scikit-learn.
import warnings
warnings.filterwarnings("ignore")

###### 1. Read the HEPATITIS dataset and check the data shapes

In [None]:
## Load "hepatitis.csv" with pandas; "?" cells are parsed as NaN
# target =  1: Die; 2: Live
data = pd.read_csv(
    "../input/hepatitis-dataset/hepatitis.csv",
    na_values="?",
)

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# Preview the first rows of the raw data
data.head()

###### 2. Check basic summary statistics of the data

In [None]:
# Summary statistics for the columns pandas treats as numeric
data.describe()

###### 3. Check for value counts in target variable

In [None]:
# Number of rows per class label in the target column
data["target"].value_counts()

#### 4. Check the datatype of each variable

In [None]:
# dtype pandas inferred for each column
data.dtypes

In [None]:
# Heuristic: columns with fewer than 5 distinct values are treated as categorical
cat_cols = data.columns[data.nunique().lt(5)]

In [None]:
# Heuristic: columns with 5 or more distinct values are treated as numeric
num_cols = data.columns[data.nunique().ge(5)]

#### 5. Drop columns which are not significant

In [None]:
# Remove the ID column in place, then refresh the numeric-column
# heuristic so that it no longer lists ID.
data.drop(columns=["ID"], inplace=True)
num_cols = data.columns[data.nunique().ge(5)]

In [None]:
# Preview again to confirm the ID column is gone
data.head()

#### 6. Identify the Categorical Columns and store them in a variable cat_cols and numerical into num_cols

In [None]:
# Explicit column groups; these replace the nunique()-based guesses above.
num_cols = [
    "age",
    "bili",
    "alk",
    "sgot",
    "albu",
    "protime",
]
cat_cols = [
    'gender',
    'steroid',
    'antivirals',
    'fatigue',
    'malaise',
    'anorexia',
    'liverBig',
    'liverFirm',
    'spleen',
    'spiders',
    'ascites',
    'varices',
    'histology',
]

#### 7. Checking the null values

In [None]:
# Count of missing values per column
data.isna().sum()

In [None]:
# Same per-column missing-value count; isnull() is pandas' alias of isna()
data.isnull().sum()

#### 8. Split the data into X and y

In [None]:
# Feature matrix: every column except the label
X = data.drop(columns=["target"])

In [None]:
# Label vector (1 = Die, 2 = Live)
y = data.loc[:, "target"]

In [None]:
# Sanity check: X and y must have the same number of rows
print(X.shape, y.shape)

#### 9. Split the data into X_train, X_test, y_train, y_test with test_size = 0.20 using sklearn

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
## Print the shape of X_train, X_test, y_train, y_test (one per line, in that order)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

#### 10. Check null values in train and test, check value_counts in y_train and y_test

In [None]:
# Share of each class among the training rows
print(y_train.value_counts() / len(X_train))

In [None]:
# Share of each class among the test rows
print(y_test.value_counts() / len(X_test))

In [None]:
# null values in train: missing-value count per feature column
X_train.isna().sum()

In [None]:
# null values in test: missing-value count per feature column
X_test.isna().sum()

#### 11. Impute the Categorical Columns with mode and Numerical columns with median

In [None]:
# Categorical part of each split, imputed separately from the numeric part
df_cat_train = X_train.loc[:, cat_cols]
df_cat_test = X_test.loc[:, cat_cols]

In [None]:
# Impute on train
# df_cat_train = df_cat_train.fillna(df_cat_train.mode().iloc[0])

# Impute on test
# df_cat_test = df_cat_test.fillna(df_cat_train.mode().iloc[0])

In [None]:
from sklearn.impute import SimpleImputer
# Mode imputation for the categorical columns. The imputer is fitted on the
# training slice only; the same fitted object is later applied to test.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(df_cat_train)

In [None]:
# Apply the fitted mode imputer to train. transform() returns an array, so the
# frame is rebuilt with the original column names (and a default 0..n-1 index).
df_cat_train = pd.DataFrame(
    data=cat_imputer.transform(df_cat_train),
    columns=cat_cols,
)

In [None]:
# Apply the train-fitted mode imputer to test. transform() returns an array, so
# the frame is rebuilt with the original column names (and a default 0..n-1 index).
df_cat_test = pd.DataFrame(
    data=cat_imputer.transform(df_cat_test),
    columns=cat_cols,
)

In [None]:
# Numeric part of each split, imputed separately from the categorical part
df_num_train = X_train.loc[:, num_cols]
df_num_test = X_test.loc[:, num_cols]

In [None]:
# Impute on train
# df_num_train = df_num_train.fillna(df_num_train.mean())

#Impute on test
# df_num_test = df_num_test.fillna(df_num_train.mean())

In [None]:
# Median imputation for the numeric columns, fitted on the training slice only.
# df_num_train already holds exactly the num_cols columns.
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(df_num_train)

In [None]:
# Apply the fitted median imputer to train and rebuild a frame with the
# numeric column names (the new frame gets a default 0..n-1 index).
df_num_train = pd.DataFrame(
    data=num_imputer.transform(df_num_train),
    columns=num_cols,
)

In [None]:
# Apply the train-fitted median imputer to test and rebuild a frame with the
# numeric column names (the new frame gets a default 0..n-1 index).
df_num_test = pd.DataFrame(
    data=num_imputer.transform(df_num_test),
    columns=num_cols,
)

In [None]:
# Combine numeric and categorical in train
X_train = pd.concat([df_num_train, df_cat_train], axis = 1)

# Combine numeric and categorical in test
X_test = pd.concat([df_num_test, df_cat_test], axis = 1)

# The imputed frames were rebuilt from plain arrays, so they carry a default
# 0..n-1 index while y_train / y_test still carry the row labels produced by
# the split. Row order is unchanged, so give the features the labels' index
# again; otherwise any label-aligned operation between X and y (concat,
# boolean masks, joins) would silently pair the wrong rows.
X_train.index = y_train.index
X_test.index = y_test.index

In [None]:
# After imputation every count here is expected to be 0
X_train.isna().sum()

In [None]:
# After imputation every count here is expected to be 0
X_test.isna().sum()

#### Convert all the categorical columns to Integer Format before dummification (2.0 as 2 etc.)

In [None]:
# Cast the categorical codes to integers so that the dummy column names
# come out as e.g. "gender_2" rather than "gender_2.0".

# Train
X_train[cat_cols] = X_train[cat_cols].astype(int)

# Test
X_test[cat_cols] = X_test[cat_cols].astype(int)

#### 12. Dummify the Categorical columns

In [None]:
## Convert Categorical Columns to Dummies
# Each split is encoded on its own, so the set of levels must be pinned first:
# otherwise a level that happens to be absent from the test split changes
# which dummy columns exist there (and which level drop_first removes), and
# the test matrix no longer lines up with the one the model was trained on.
# The levels are learned from train only; a test value never seen in train
# becomes missing and is encoded as all zeros for that feature.
level_dtypes = {
    col: pd.CategoricalDtype(categories=sorted(X_train[col].unique()))
    for col in cat_cols
}

# Train
X_train = pd.get_dummies(X_train.astype(level_dtypes), columns=cat_cols, drop_first=True)

# Test
X_test = pd.get_dummies(X_test.astype(level_dtypes), columns=cat_cols, drop_first=True)

In [None]:
# Column layout of the encoded training matrix
X_train.columns

In [None]:
# Column layout of the encoded test matrix; it should match the train layout
X_test.columns

#### 13. Scale the numeric attributes ["age", "bili", "alk", "sgot", "albu", "protime"]

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the numeric columns. The scaling statistics are learned from
# the training rows only and then reused unchanged for the test rows.
scaler = StandardScaler()

# train: learn the statistics and apply them in one step
X_train.loc[:, num_cols] = scaler.fit_transform(X_train.loc[:, num_cols])

# test: apply the statistics learned on train
X_test.loc[:, num_cols] = scaler.transform(X_test.loc[:, num_cols])

## MODEL BUILDING - SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Create a SVC classifier using a linear kernel
# C=1 is the misclassification penalty; random_state is pinned to 0
linear_svm = SVC(kernel='linear', C=1, random_state=0)

In [None]:
# Fit the linear SVM on the preprocessed training split
linear_svm.fit(X_train, y_train)

In [None]:
## Predictions of the linear SVM on both splits
train_predictions = linear_svm.predict(X_train)
test_predictions = linear_svm.predict(X_test)

from sklearn.metrics import accuracy_score,f1_score,confusion_matrix

### Train split: confusion matrix, accuracy and per-class F1
print(
    "TRAIN Conf Matrix : \n",
    confusion_matrix(y_true=y_train, y_pred=train_predictions),
)
print(
    "\nTRAIN DATA ACCURACY",
    accuracy_score(y_true=y_train, y_pred=train_predictions),
)
print(
    "\nTrain data f1-score for class '1'",
    f1_score(y_true=y_train, y_pred=train_predictions, pos_label=1),
)
print(
    "\nTrain data f1-score for class '2'",
    f1_score(y_true=y_train, y_pred=train_predictions, pos_label=2),
)

### Visual separator, then the same metrics for the test split
print("\n\n--------------------------------------\n\n")

print(
    "TEST Conf Matrix : \n",
    confusion_matrix(y_true=y_test, y_pred=test_predictions),
)
print(
    "\nTEST DATA ACCURACY",
    accuracy_score(y_true=y_test, y_pred=test_predictions),
)
print(
    "\nTest data f1-score for class '1'",
    f1_score(y_true=y_test, y_pred=test_predictions, pos_label=1),
)
print(
    "\nTest data f1-score for class '2'",
    f1_score(y_true=y_test, y_pred=test_predictions, pos_label=2),
)

####  Non Linear SVM (RBF)

Radial Basis Function is a commonly used kernel in SVC:<br>

<img src="rbf_kernel.png">

where $\lVert \mathbf{x}_i - \mathbf{x}_j \rVert^2$ is the squared Euclidean distance between two data points $\mathbf{x}_i$ and $\mathbf{x}_j$.

It is only important to know that an SVC classifier using an RBF kernel has two parameters: gamma and C.

<strong>Gamma:</strong>

- Gamma is a parameter of the RBF kernel and can be thought of as the ‘spread’ of the kernel and therefore the decision region. When gamma is low, the ‘curve’ of the decision boundary is very low and thus the decision region is very broad. When gamma is high, the ‘curve’ of the decision boundary is high, which creates islands of decision-boundaries around data points.

<strong>C:</strong>

- C is a parameter of the SVC learner and is the penalty for misclassifying a data point. When C is small, the classifier is okay with misclassified data points (high bias, low variance). When C is large, the classifier is heavily penalized for misclassified data and therefore bends over backwards to avoid any misclassified data points (low bias, high variance).


<strong>Kernel Trick:</strong><br>
Imagine you have a two-dimensional, non-linearly separable dataset that you would like to classify using an SVM. This looks impossible because the data is not linearly separable. However, if we transform the two-dimensional data to a higher dimension, say three or even ten dimensions, we would be able to find a hyperplane that separates the data.

<img src="kernel_trick.png">

The problem is, if we have a large dataset containing, say, millions of examples, the transformation will take a long time to run.<br>
To solve this problem, we actually only care about the result of the dot product (x<sub>i</sub>.x<sub>j</sub>)<br>
<br>If there is a function which could calculate the dot product and the result is the same as when we transform the data into higher dimension, it would be fantastic. This function is called a kernel function.<br>
<br>In essence, what the kernel trick does for us is to offer a more efficient and less expensive way to transform data into higher dimensions.

In [None]:
## Create an SVC object and print it to see the arguments
# RBF kernel with fixed gamma=0.01 and C=1; random_state is pinned to 0
svc = SVC(kernel='rbf', random_state=0, gamma=0.01, C=1)
svc

In [None]:
## Fit the RBF SVM on the preprocessed training split
svc.fit(X_train, y_train)

In [None]:
## Predictions of the RBF SVM on both splits
train_predictions = svc.predict(X_train)
test_predictions = svc.predict(X_test)

### Train split: confusion matrix, accuracy and per-class F1

print(
    "TRAIN Conf Matrix : \n",
    confusion_matrix(y_true=y_train, y_pred=train_predictions),
)
print(
    "\nTRAIN DATA ACCURACY",
    accuracy_score(y_true=y_train, y_pred=train_predictions),
)
print(
    "\nTrain data f1-score for class '1'",
    f1_score(y_true=y_train, y_pred=train_predictions, pos_label=1),
)
print(
    "\nTrain data f1-score for class '2'",
    f1_score(y_true=y_train, y_pred=train_predictions, pos_label=2),
)

### Visual separator, then the same metrics for the test split
print("\n\n--------------------------------------\n\n")

print(
    "TEST Conf Matrix : \n",
    confusion_matrix(y_true=y_test, y_pred=test_predictions),
)
print(
    "\nTEST DATA ACCURACY",
    accuracy_score(y_true=y_test, y_pred=test_predictions),
)
print(
    "\nTest data f1-score for class '1'",
    f1_score(y_true=y_test, y_pred=test_predictions, pos_label=1),
)
print(
    "\nTest data f1-score for class '2'",
    f1_score(y_true=y_test, y_pred=test_predictions, pos_label=2),
)

### SVM with Grid Search for Parameter Tuning

In [None]:
## Use Grid Search for parameter tuning

from sklearn.model_selection import GridSearchCV

svc_grid = SVC()

# Candidate hyper-parameters; every combination is cross-validated.
# gamma = 0 is deliberately not in the list: with gamma = 0 the RBF kernel
# exp(-gamma * ||xi - xj||^2) equals 1 for every pair of points, so the model
# cannot use the features (and some scikit-learn releases reject the value
# outright). 'auto' is scikit-learn's 1 / n_features setting.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': ['auto', 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
}

# 5-fold cross-validated search over the grid; verbose=3 logs progress per fit.
svc_cv_grid = GridSearchCV(estimator=svc_grid, param_grid=param_grid, cv=5, verbose=3)

In [None]:
## Run the cross-validated grid search on the training split
svc_cv_grid.fit(X_train, y_train)

In [None]:
# Get the best parameters: the combination with the best mean cross-validation score
svc_cv_grid.best_params_

In [None]:
# Estimator built with the best parameters found by the grid search
svc_best = svc_cv_grid.best_estimator_

In [None]:
## Predictions of the tuned SVM on both splits
train_predictions = svc_best.predict(X_train)
test_predictions = svc_best.predict(X_test)

### Train split: accuracy and per-class F1
print(
    "TRAIN DATA ACCURACY",
    accuracy_score(y_true=y_train, y_pred=train_predictions),
)
print(
    "\nTrain data f1-score for class '1'",
    f1_score(y_true=y_train, y_pred=train_predictions, pos_label=1),
)
print(
    "\nTrain data f1-score for class '2'",
    f1_score(y_true=y_train, y_pred=train_predictions, pos_label=2),
)

### Visual separator, then the same metrics for the test split
print("\n\n--------------------------------------\n\n")
print(
    "TEST DATA ACCURACY",
    accuracy_score(y_true=y_test, y_pred=test_predictions),
)
print(
    "\nTest data f1-score for class '1'",
    f1_score(y_true=y_test, y_pred=test_predictions, pos_label=1),
)
print(
    "\nTest data f1-score for class '2'",
    f1_score(y_true=y_test, y_pred=test_predictions, pos_label=2),
)