In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Input data files are available in the "../input/" directory.
# List that directory so the exact dataset file name is visible in the cell
# output (shells out to `ls`, so this presumes a POSIX-like environment).
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.

Load the dataset and explore it.

In [None]:
# Read the raw CSV into the frame that every later cell works from.
csv_path = "../input/Indian Liver Patient Dataset (ILPD).csv"
df = pd.read_csv(csv_path)

# Column names = the available measurements plus the class label (last column).
print(df.columns)

# Summary statistics of the numeric columns (rendered as the cell output).
df.describe()

In [None]:
# Show per-column dtypes and non-null counts to spot missing values.
df.info()

In [None]:
# Replace missing 'alkphos' readings with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection df['alkphos']: the chained in-place
# form operates on an intermediate object, emits a FutureWarning in
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is active.
# NOTE(review): 0 is kept as the fill value to preserve the original
# analysis; a median/mean fill may be more appropriate - confirm.
df['alkphos'] = df['alkphos'].fillna(0)

Let us look at the statistics of each feature conditioned on whether or not someone has a liver disease, to get a feel for how useful each feature will be in discriminating between the two classes.

In [None]:
# Raw array view of the frame and its column names; the class label is the
# last column.
data = df.values
feat_names = df.columns

# Rows whose label equals 1 are treated as the "negative" class throughout
# this notebook.
# NOTE(review): confirm the label convention against the dataset
# description - which of 1/2 denotes a liver patient is not visible here.
df_neg = df[df[feat_names[-1]].eq(1)]
df_neg.describe()

In [None]:
# Rows whose label (last column) equals 2 are treated as the "positive" class.
df_pos = df[df[feat_names[-1]].eq(2)]
df_pos.describe()

Now let us look at the correlations among the different features (test measurements), conditioned on whether or not someone has a liver disease.

In [None]:
# Feature correlations within the negative class (label == 1).
# Only numeric columns are correlated: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, while older versions silently
# dropped them, so select_dtypes keeps the result identical everywhere.
ax = sns.heatmap(df_neg.select_dtypes(include=[np.number]).corr())
ax.set_title('Feature correlations - label == 1');


In [None]:
# Feature correlations within the positive class (label == 2).
# Only numeric columns are correlated: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, while older versions silently
# dropped them, so select_dtypes keeps the result identical everywhere.
ax = sns.heatmap(df_pos.select_dtypes(include=[np.number]).corr())
ax.set_title('Feature correlations - label == 2');

In [None]:
# Correlations over the whole dataset, including the class label column, so
# the last row/column shows how each measurement correlates with the label.
# Only numeric columns are correlated: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, while older versions silently
# dropped them, so select_dtypes keeps the result identical everywhere.
ax = sns.heatmap(df.select_dtypes(include=[np.number]).corr())
ax.set_title('Feature correlations - all rows (incl. label)');

Looking at the heatmap of the correlations, some tests appear to have a relatively strong positive correlation with someone having a disease. Let's see how accurately we can predict whether someone has liver disease by looking at each of these tests individually, by plotting the class-conditional histograms of the data.

In [None]:
# Number of samples in each class (shape of a single feature column).
print(df_neg.iloc[:, 2].shape)
print(df_pos.iloc[:, 2].shape)

# One figure per measurement column, overlaying the two class-conditional
# distributions (blue: label == 1, red: label == 2).
# - The figure is opened *before* plotting so every feature gets its own
#   titled figure and no empty figure is left over at the end.
# - The last column (the class label) is skipped: it is constant within
#   each class, so its class-conditional distribution is degenerate.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# it is kept here so the cell still runs on the seaborn version this
# notebook was written against.
for i in range(2, len(df.columns) - 1):
    plt.figure()
    sns.distplot(df_neg.iloc[:, i].astype(float), color='b', label='label == 1')
    sns.distplot(df_pos.iloc[:, i].astype(float), color='r', label='label == 2')
    plt.title(df.columns[i])
    plt.legend()

As seen from the figures above, it will be difficult to guess whether someone has the disease by looking at any individual test result, since there is significant overlap between the class-conditional distributions: the data cannot be separated by a threshold on any single axis.

But will a combination of these tests help make better predictions?
To answer this simplistically, let us fit the class-conditional distributions and use them to discriminate the data. A glance at the histograms hints that almost all the tests have unimodal class-conditional distributions, so let us fit a multivariate Gaussian distribution to the samples drawn from each class, and use the Mahalanobis distance to each class to tell whether someone has the disease, using the measurement features (excluding gender; note that the code below selects columns `2:-2`, i.e. it skips the first two columns and also the last measurement column before the label).

In [None]:
# --- Fit one multivariate Gaussian per class --------------------------------
# Features are selected positionally with [2:-2]: the first two columns and
# the last two columns (the final measurement and the class label) are left
# out. NOTE(review): the accompanying text says only gender is excluded -
# confirm that dropping the other columns is intended.
neg_meas = df_neg.iloc[:, 2:-2].values.astype(float)
neg_mean = neg_meas.mean(axis=0)
neg_cov = np.cov(neg_meas, rowvar=False)
neg_precision = np.linalg.inv(neg_cov)   # inverse covariance

pos_meas = df_pos.iloc[:, 2:-2].values.astype(float)
pos_mean = pos_meas.mean(axis=0)
pos_cov = np.cov(pos_meas, rowvar=False)
pos_precision = np.linalg.inv(pos_cov)   # inverse covariance

# --- Classify every row by the nearer class (Mahalanobis distance) ----------
# The feature matrix and labels are extracted once; the previous per-row
# `df.values[i]` rebuilt the whole array on every iteration.
# NOTE: the rows scored here are the same rows the Gaussians were fitted on,
# so the metrics below are optimistic (no held-out data).
all_meas = df.iloc[:, 2:-2].values.astype(float)
labels = df.iloc[:, -1].values

neg_diff = all_meas - neg_mean
pos_diff = all_meas - pos_mean
# Row-wise quadratic form d^T * P * d for every sample at once.
neg_dist = np.sqrt(np.einsum('ij,jk,ik->i', neg_diff, neg_precision, neg_diff))
pos_dist = np.sqrt(np.einsum('ij,jk,ik->i', pos_diff, pos_precision, pos_diff))

# Predict "positive" (2) when the sample is closer to the positive class.
# Comparing the distances directly is equivalent to pos_dist/neg_dist < 1
# but cannot divide by zero.
pred = np.where(pos_dist < neg_dist, 2, 1)

is_neg = labels == 1
is_pos = ~is_neg
NEG = int(is_neg.sum())
POS = int(is_pos.sum())

# Confusion counts start from zero, so they sum to exactly the number of
# rows (they were previously seeded with 1, inflating every cell and the
# printed total by one each).
TN = int(np.sum(is_neg & (pred == 1)))
FP = int(np.sum(is_neg & (pred == 2)))
TP = int(np.sum(is_pos & (pred == 2)))
FN = int(np.sum(is_pos & (pred == 1)))

# Rows: predicted (positive, negative); columns: actual (positive, negative).
conf_matrix = np.array([[TP, FP], [FN, TN]])
sns.heatmap(conf_matrix, annot=True, fmt='d',
            xticklabels=['actual pos', 'actual neg'],
            yticklabels=['pred pos', 'pred neg'])
print(conf_matrix)
print(TP+FN+TN+FP)

# Guard the ratios explicitly instead of relying on biased counters.
precision = TP / float(TP + FP) if (TP + FP) else 0.0
recall = TP / float(TP + FN) if (TP + FN) else 0.0
F_score = (2.0*precision*recall)/(precision+recall) if (precision + recall) else 0.0
print(precision)
print(recall)
print('F-Score : ' + str(F_score))
        

In [None]:
def get_fscore(cost):
    """Return the F-score of the Mahalanobis-ratio classifier at a threshold.

    A row is predicted positive (label 2) when
    ``pos_dist / neg_dist < cost``, where the two distances are the
    Mahalanobis distances of the row's features to the positive and the
    negative class Gaussians; otherwise it is predicted negative (label 1).

    Reads the module-level ``df``, ``neg_mean``, ``neg_precision``,
    ``pos_mean`` and ``pos_precision``. Features are the columns ``2:-2``
    of ``df`` and the class label is its last column (1 = negative, any
    other value = positive).

    Parameters
    ----------
    cost : float
        Decision threshold on the distance ratio. ``1`` means "pick the
        nearer class"; larger values predict positive more often.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)`` for the positive
        class, or ``0.0`` when it is undefined (no true positives).
    """
    # Extract features/labels once; indexing df.values per row rebuilt the
    # full array on every iteration.
    feats = df.iloc[:, 2:-2].values.astype(float)
    labels = df.iloc[:, -1].values

    neg_diff = feats - neg_mean
    pos_diff = feats - pos_mean
    # Row-wise quadratic form d^T * P * d for every sample at once.
    neg_dist = np.sqrt(np.einsum('ij,jk,ik->i', neg_diff, neg_precision, neg_diff))
    pos_dist = np.sqrt(np.einsum('ij,jk,ik->i', pos_diff, pos_precision, pos_diff))

    # pos_dist / neg_dist < cost, rewritten without the division so a row
    # lying exactly on the negative mean (neg_dist == 0) cannot divide by 0.
    pred_pos = pos_dist < cost * neg_dist
    actual_pos = labels != 1

    # Counters start at zero; seeding them with 1 (as before) biased the
    # score, most visibly at thresholds where nothing is predicted positive.
    TP = int(np.sum(pred_pos & actual_pos))
    FP = int(np.sum(pred_pos & ~actual_pos))
    FN = int(np.sum(~pred_pos & actual_pos))

    precision = TP / float(TP + FP) if (TP + FP) else 0.0
    recall = TP / float(TP + FN) if (TP + FN) else 0.0
    if precision + recall == 0:
        return 0.0
    return (2.0 * precision * recall) / (precision + recall)
# Sweep the decision threshold over 0.0, 0.2, ..., 9.8 and record the
# F-score obtained at each value.
cost_vec = [step * 0.2 for step in range(50)]
fscore_vec = [get_fscore(threshold) for threshold in cost_vec]

# F-score as a function of the threshold.
plt.plot(np.array(cost_vec), np.array(fscore_vec))
plt.xlabel('relative cost of error')
plt.ylabel('F-Score')


From this it looks like there is no significant improvement for any relative cost, even when we combine all the measurements in this way. So let us now try some non-linear discriminative models.