# Hepatitis dataset

In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# reading the first few lines of the dataset
df = pd.read_csv('hepatitis (copy).csv')
df

Unnamed: 0,DIE_LIVE,AGE,SEX,STEROID,ANTIVIRAL,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.00,85,18,4.0,?,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.90,135,42,3.5,?,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.70,96,32,4.0,?,1
3,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.70,46,52,4.0,80,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.00,?,200,4.0,?,1
5,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.90,95,28,4.0,75,1
6,1,51,1,1,2,1,2,1,2,2,1,1,2,2,?,?,?,?,?,1
7,2,23,1,2,2,2,2,2,2,2,2,2,2,2,1.00,?,?,?,?,1
8,2,39,1,2,2,1,2,2,2,1,2,2,2,2,0.70,?,48,4.4,?,1
9,2,30,1,2,2,2,2,2,2,2,2,2,2,2,1.00,?,120,3.9,?,1


In [4]:
df.dtypes

DIE_LIVE            int64
AGE                 int64
SEX                 int64
STEROID            object
ANTIVIRAL           int64
FATIGUE            object
MALAISE            object
ANOREXIA           object
LIVER_BIG          object
LIVER_FIRM         object
SPLEEN_PALPABLE    object
SPIDERS            object
ASCITES            object
VARICES            object
BILIRUBIN          object
ALK_PHOSPHATE      object
SGOT               object
ALBUMIN            object
PROTIME            object
HISTOLOGY           int64
dtype: object

In [5]:
#
df.describe()

Unnamed: 0,DIE_LIVE,AGE,SEX,ANTIVIRAL,HISTOLOGY
count,155.0,155.0,155.0,155.0,155.0
mean,1.793548,41.2,1.103226,1.845161,1.451613
std,0.40607,12.565878,0.30524,0.362923,0.499266
min,1.0,7.0,1.0,1.0,1.0
25%,2.0,32.0,1.0,2.0,1.0
50%,2.0,39.0,1.0,2.0,1.0
75%,2.0,50.0,1.0,2.0,2.0
max,2.0,78.0,2.0,2.0,2.0


### the cells with values only 2/1 represent either no/yes or Male/Female .

## step 1 : cleaning the data

In [6]:
#accessing the missing datacell values
df.get_value(3,'STEROID')

  


'?'

In [7]:
#accesing the value using the .iat[]
df.iat[3,3]

'?'

In [8]:
# dropping duplicated columns (if any, will keep the first entry)
df.drop_duplicates(inplace = True)

In [9]:
df.head(2)

Unnamed: 0,DIE_LIVE,AGE,SEX,STEROID,ANTIVIRAL,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,?,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1


In [10]:
# Replacing the '?' values with NAN values
df['STEROID'] = df['STEROID'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['STEROID'] = df['STEROID'].astype(object).astype(float)

In [11]:
df['STEROID'].head(4)

0    1.0
1    1.0
2    2.0
3    NaN
Name: STEROID, dtype: float64

In [12]:
# Replacing the '?' values with NAN values
df['FATIGUE'] = df['FATIGUE'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['FATIGUE'] = df['FATIGUE'].astype(object).astype(float)

In [13]:
# Replacing the '?' values with NAN values
df['MALAISE'] = df['MALAISE'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['MALAISE'] = df['MALAISE'].astype(object).astype(float)

In [14]:
# Replacing the '?' values with NAN values
df['ANOREXIA'] = df['ANOREXIA'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['ANOREXIA'] = df['ANOREXIA'].astype(object).astype(float)

In [15]:
# Replacing the '?' values with NAN values
df['LIVER_BIG'] = df['LIVER_BIG'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['LIVER_BIG'] = df['LIVER_BIG'].astype(object).astype(float)

In [16]:
# Replacing the '?' values with NAN values
df['LIVER_FIRM'] = df['LIVER_FIRM'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['LIVER_FIRM'] = df['LIVER_FIRM'].astype(object).astype(float)

In [17]:
# Replacing the '?' values with NAN values
df['SPLEEN_PALPABLE'] = df['SPLEEN_PALPABLE'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['SPLEEN_PALPABLE'] = df['SPLEEN_PALPABLE'].astype(object).astype(float)

In [18]:
# Replacing the '?' values with NAN values
df['SPIDERS'] = df['SPIDERS'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['SPIDERS'] = df['SPIDERS'].astype(object).astype(float)

In [19]:
# Replacing the '?' values with NAN values
df['ASCITES'] = df['ASCITES'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['ASCITES'] = df['ASCITES'].astype(object).astype(float)

In [20]:
# Replacing the '?' values with NAN values
df['VARICES'] = df['VARICES'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['VARICES'] = df['VARICES'].astype(object).astype(float)

In [21]:
# Replacing the '?' values with NAN values
df['BILIRUBIN'] = df['BILIRUBIN'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['BILIRUBIN'] = df['BILIRUBIN'].astype(object).astype(float)

In [22]:
# Replacing the '?' values with NAN values
df['ALK_PHOSPHATE'] = df['ALK_PHOSPHATE'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['ALK_PHOSPHATE'] = df['ALK_PHOSPHATE'].astype(object).astype(float)

In [23]:
# Replacing the '?' values with NAN values
df['SGOT'] = df['SGOT'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['SGOT'] = df['SGOT'].astype(object).astype(float)

In [24]:
# Replacing the '?' values with NAN values
df['ALBUMIN'] = df['ALBUMIN'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['ALBUMIN'] = df['ALBUMIN'].astype(object).astype(float)

In [25]:
# Replacing the '?' values with NAN values
df['PROTIME'] = df['PROTIME'].replace('?',np.NAN)

# And Changing the datatypes of the column from object to float64
df['PROTIME'] = df['PROTIME'].astype(object).astype(float)

In [26]:
df.dtypes

DIE_LIVE             int64
AGE                  int64
SEX                  int64
STEROID            float64
ANTIVIRAL            int64
FATIGUE            float64
MALAISE            float64
ANOREXIA           float64
LIVER_BIG          float64
LIVER_FIRM         float64
SPLEEN_PALPABLE    float64
SPIDERS            float64
ASCITES            float64
VARICES            float64
BILIRUBIN          float64
ALK_PHOSPHATE      float64
SGOT               float64
ALBUMIN            float64
PROTIME            float64
HISTOLOGY            int64
dtype: object

In [27]:
df.describe()

Unnamed: 0,DIE_LIVE,AGE,SEX,STEROID,ANTIVIRAL,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
count,155.0,155.0,155.0,154.0,155.0,154.0,154.0,154.0,145.0,144.0,150.0,150.0,150.0,150.0,149.0,126.0,151.0,139.0,88.0,155.0
mean,1.793548,41.2,1.103226,1.506494,1.845161,1.350649,1.603896,1.792208,1.827586,1.583333,1.8,1.66,1.866667,1.88,1.427517,105.325397,85.89404,3.817266,61.852273,1.451613
std,0.40607,12.565878,0.30524,0.501589,0.362923,0.47873,0.490682,0.407051,0.379049,0.494727,0.40134,0.475296,0.341073,0.32605,1.212149,51.508109,89.65089,0.651523,22.875244,0.499266
min,1.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.3,26.0,14.0,2.1,0.0,1.0
25%,2.0,32.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,0.7,74.25,31.5,3.4,46.0,1.0
50%,2.0,39.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,58.0,4.0,61.0,1.0
75%,2.0,50.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.5,132.25,100.5,4.2,76.25,2.0
max,2.0,78.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,8.0,295.0,648.0,6.4,100.0,2.0


In [28]:
# Exporting the dataframe to a csv file
df.to_csv('hepatitis_cleaned.csv')

## The McCulloh-pitt model

In [29]:
# changing the column values from 2/1 to 1/0

# 0 for die
# 1 for live
df['DIE_LIVE'] = df['DIE_LIVE'].replace(1,0)
df['DIE_LIVE'] = df['DIE_LIVE'].replace(2,1)
df.head(10)

Unnamed: 0,DIE_LIVE,AGE,SEX,STEROID,ANTIVIRAL,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,1,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,1,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,1,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,1,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,1,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1
5,1,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.9,95.0,28.0,4.0,75.0,1
6,0,51,1,1.0,2,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,,,,,,1
7,1,23,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,,,,1
8,1,39,1,2.0,2,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.7,,48.0,4.4,,1
9,1,30,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,120.0,3.9,,1


In [30]:
# Here goes the mcculloh pitt model
#

## The Perceptron model

In [31]:
# Had to change the 0 values to -1 as the perceptron needs to have bipolar threshold activation fucb
df['DIE_LIVE'] = df['DIE_LIVE'].replace(0,-1)

In [32]:
df.head(10)

Unnamed: 0,DIE_LIVE,AGE,SEX,STEROID,ANTIVIRAL,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,1,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,1,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,1,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,1,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,1,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1
5,1,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.9,95.0,28.0,4.0,75.0,1
6,-1,51,1,1.0,2,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,,,,,,1
7,1,23,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,,,,1
8,1,39,1,2.0,2,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.7,,48.0,4.4,,1
9,1,30,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,120.0,3.9,,1


In [33]:
y1 = []
y1 = np.array(df['DIE_LIVE'])

x1 = np.array(df.loc[:,['AGE','SEX','STEROID','ANTIVIRAL','FATIGUE','MALAISE','ANOREXIA','LIVER_BIG','LIVER_FIRM','SPLEEN_PALPABLE','SPIDERS','ASCITES','VARICES','BILIRUBIN','ALK_PHOSPHATE','SGOT','ALBUMIN','PROTIME','HISTOLOGY']])

In [34]:
#df.iat[6,0]   #check

In [36]:
# perceptron function, Is a bipolar threshold activation function
def perceptron(x):
    if x>3:
        return 1
    else :
        return -1

In [37]:
# getting the size of the features
[r,c] = x1.shape
print([r,c])

[155, 19]


In [50]:
# Training the perceptron
#it's a single neuron perceptron
lr = 0.04 # lr is the learning rate
# assigning random weight values 
w1 = np.random.rand(19)
#w
print('Prev. ',w1,'\n')
b1 = np.random.rand()   # random bias value
#b
X_train,X_test,y_train,y_test = train_test_split(x1,y1,test_size = 0.25, random_state = 31)
#X_train.shape
[r,c] = X_train.shape
epoch = 5431
e = 0
while(e<=epoch):    
    for i in range(r):
        s1 = 0
        for j in range(c):
            s1+= X_train[i,j]*w1[j]
        s1+=b1
        out = perceptron(s1)
        if(out == y_train[i]):
            ;
        else:
            for j in range(c):
                w1[j] = w1[j] + (lr*X_train[i,j]*y_train[j])
    e+=1


print('New ',w1,'\n')

Prev.  [0.8027495  0.2635215  0.80192183 0.0049209  0.94529576 0.58136957
 0.56225174 0.59200725 0.94627014 0.57454866 0.75292897 0.08273552
 0.1236063  0.15634394 0.74784444 0.52819124 0.40422737 0.25846775
 0.28931507] 

New  [-748094.23725062   21076.42352174              nan   33895.68492129
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan              nan              nan
              nan              nan   26073.88931536] 



In [47]:
# Testing the perceptron
[r,c] = X_test.shape
y_pred1 = []
for i in range(r):
    s1 = 0
    for j in range(c):
        s1+= X_test[i,j]*w1[j]
    s1+= b1
    out = perceptron(s1)
    y_pred1.append(out)
    

#print(y_pred1)    
#from sklearn.metrics import accuracy_score
#accuracy_score(y_test,y_pred)
print('The Accuracy of the perceptron model is :%.2f'%accuracy_score(y_test,y_pred1))

#for i in range(r):
    
    
print('misclassified samples: %d'%(y_test!=y_pred1).sum())#compute
#print("The accuracy of the perceptron is :",accuracy_score)


The Accuracy of the perceptron model is :0.90
misclassified samples: 4


## Back propagation algorithm in NN

> - __The data used is the same as used in the previous model, The data is cleaned and the values of DIE_LIVE are changed back to 2/1 from 1,-1.__

> - __IN this model, For DIE_LIVE column: -1 means DIE and 1 means LIVE.__

> - __This model will logsigmoid function as its acativation function and the BPA alogrithm for learning.__

> - __This model will also implement a single neuron neural network,__

In [149]:
# The activation function for the model
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [152]:
# BPA code for training the model:
lr = 0.06                     # Learning rate
w = np.random.rand(19)        # Randomly assigning the weight values.
b = np.random.rand()          # Randomly assigning the bias values. 

#changing the 'DIE_LIVE' col. values back to the previous ones
df['DIE_LIVE'] = df['DIE_LIVE'].replace(1,2)
df['DIE_LIVE'] = df['DIE_LIVE'].replace(-1,1)
print(w,'\n',b,'\n')
# assigning the features
x = np.array(df.loc[:,['AGE','SEX','STEROID','ANTIVIRAL','FATIGUE','MALAISE','ANOREXIA','LIVER_BIG','LIVER_FIRM','SPLEEN_PALPABLE','SPIDERS','ASCITES','VARICES','BILIRUBIN','ALK_PHOSPHATE','SGOT','ALBUMIN','PROTIME','HISTOLOGY']])
#assigning the outputs
y = np.array(df['DIE_LIVE'])

print(x)

# splitting the data into a set of the training and testing of 75 and 25% 
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size =0.25, random_state = 31)

#X_train.shape
[r,c] = X_train.shape
print(r,c,'\n')

epoch = 5 #epochs
e = 0
while(e<=epoch):
    for i in range(r):
        s = 0
        for j in range(c):
            s+= X_train[i,j]*w[j]  # Forward propagation
        s+= b
        #print(s,'\n')
        output = sigmoid(s)             # sigmoid function will always give error
        # Calculating the delta value
        error = output*(1-output)*(y_train[j]-output)
        # Updating the weights
        for j in range(c):
            w[j]+=(lr*error*output)
    e+=1 
    
print(w)


[0.17424195 0.24897227 0.97825654 0.85929669 0.22334034 0.96130985
 0.06390705 0.84006498 0.15818718 0.27709682 0.67227308 0.42513945
 0.55460805 0.32829564 0.61453603 0.50392639 0.6695683  0.21263526
 0.82512002] 
 0.25925383214724096 

[[30.   2.   1.  ...  4.   nan  1. ]
 [50.   1.   1.  ...  3.5  nan  1. ]
 [78.   1.   2.  ...  4.   nan  1. ]
 ...
 [61.   1.   1.  ...  4.1  nan  2. ]
 [53.   2.   1.  ...  4.1 48.   2. ]
 [43.   1.   2.  ...  3.1 42.   2. ]]
116 19 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan 

nan

In [55]:
# testing the model
[r,c] = X_test.shape
y_pred = []
for i in range(r):
    s = 0
    for j in range(c):
        s+= X_test[i,j]*w[j]
    s+=b
    out = sigmoid(s)
    y_pred.append(out)

#print(accuracy_score(y_test,y_pred))
#print('The Accuracy of the BPA model is :%.2f'%accuracy_score(y_test,y_pred))
#print('misclassified samples: %d'%(y_test!=y_pred).sum())

> __The BPA algorithm also uses the Logsigmoid function as its activation function and the expected output from the model according to the dataset needs be strictly 2.0/1.0 but because of the sigmoid function it would be quite impossible to get exactly 2.0/1.0 as output hence the model is not appropriate for the given dataset however to get it right we could either use the threshold activation function or we could just compare the predicted outputs with the expected ones to check how near could we get using the model. In this project we have employed the second method to see how close we got to the expected outputs.__

In [56]:
# Comparing the outputs
print('Predicted_output, Expected output')
for i in range(c):
    print(y_pred[i],'  ,   ',y_test[i])
    print('\n')

Predicted_output, Expected output
nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    -1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    1


nan   ,    -1


nan   ,    1


nan   ,    1




# References
 - http://shodhganga.inflibnet.ac.in/bitstream/10603/183728/12/12_chapter%20-%20v.pdf