In [1]:
path = '/home/pramila/Desktop/ALL/DataSets/TumorData.csv'

In [2]:
import pandas as pd

In [3]:
data_ini = pd.read_csv(path)

In [4]:
data = data_ini.loc[:, ['diagnosis','radius_mean']]

In [5]:
data['radius_mean'] = data['radius_mean'] - data['radius_mean'].mean()

In [6]:
data['diagnosis'].replace(to_replace=['M','B'], value=[1,0], inplace=True)

### Various cases:

1. $\hat{\sigma}_B \neq \hat{\sigma}_B $ : In this case we use QDA


2. $\hat{\sigma}_B \approx \hat{\sigma}_B $ : In this case we use $\sigma_p$ which is pooled variance.

In [7]:
data['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

In [8]:
data.columns

Index(['diagnosis', 'radius_mean'], dtype='object')

$$\theta_0 = - \frac{\left(\hat{\mu}^2_0- \hat{\mu}^2_1\right)}{2 \hat{\sigma}^2}\ \ \ \ \theta_1 = \frac{\left(\hat{\mu}_0 - \hat{\mu}_1 \right)}{\hat{\sigma}^2}$$

In [9]:
sigma_m = data.loc[data['diagnosis']==1, 'radius_mean'].var()

In [10]:
sigma_b = data.loc[data['diagnosis']==0, 'radius_mean'].var()

In [11]:
print(sigma_m)
print(sigma_b)

10.265430814629347
3.1702217220438738


As we can clearly see, there is a big difference between variances of the two classes. So we will have to use QDA. But say, we still use LDA by taking pooled variance, how will this affect our results? Let's see:

In [12]:
m = data.loc[data['diagnosis']==1,:].shape[0]
b = data.loc[data['diagnosis']==0,:].shape[0]

In [13]:
pooled_variance = ((m*sigma_m) + (b*sigma_b))/(m+b)

In [14]:
pooled_variance

5.813779415590659

In [15]:
mean_m = data.loc[data['diagnosis']==1, 'radius_mean'].mean()

mean_b = data.loc[data['diagnosis']==0, 'radius_mean'].mean()

In [16]:
print(mean_m)
print(mean_b)

3.3355384487846935
-1.9807679303707428


In [17]:
theta1 = (mean_m - mean_b)/pooled_variance

theta0 = (mean_b**2 - mean_m**2)/pooled_variance

In [18]:
X = data['radius_mean'] 

In [19]:
def discriminant_function(point):
    value = theta0 + (point * theta1)
    if value > 0 :
        return 1
    else:
        return 0

In [20]:
for i in X:
    predicted_answers = discriminant_function(i)

In [21]:
actual_answers = data['diagnosis']

In [22]:
import numpy as np

In [23]:
correct_count = np.sum(np.equal(predicted_answers,actual_answers))

In [24]:
correct_count/data.shape[0]

0.6274165202108963

Our accuracy would have been more if we wouldn't have taken our variances same for both classes. Let's see that in QDA.