In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         294 non-null    int64  
 1   sex         294 non-null    int64  
 2   cp          294 non-null    int64  
 3   trestbps    294 non-null    object 
 4   chol        294 non-null    object 
 5   fbs         294 non-null    object 
 6   restecg     294 non-null    object 
 7   thalach     294 non-null    object 
 8   exang       294 non-null    object 
 9   oldpeak     294 non-null    float64
 10  slope       294 non-null    object 
 11  ca          294 non-null    object 
 12  thal        294 non-null    object 
 13  num         294 non-null    int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 32.3+ KB


In [4]:
df.isnull().sum()

age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
ca            0
thal          0
num           0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,age,sex,cp,oldpeak,num
count,294.0,294.0,294.0,294.0,294.0
mean,47.826531,0.72449,2.982993,0.586054,0.360544
std,7.811812,0.447533,0.965117,0.908648,0.480977
min,28.0,0.0,1.0,0.0,0.0
25%,42.0,0.0,2.0,0.0,0.0
50%,49.0,1.0,3.0,0.0,0.0
75%,54.0,1.0,4.0,1.0,1.0
max,66.0,1.0,4.0,5.0,1.0


In [14]:
y = df.iloc[:, -1]
df = df.iloc[:, :-1]
x = df.drop(['trestbps','chol','fbs','restecg','thalach','exang','slope','ca','thal'], axis=1)

In [16]:
x.head()

Unnamed: 0,age,sex,cp,oldpeak
0,28,1,2,0.0
1,29,1,2,0.0
2,29,1,2,0.0
3,30,0,1,0.0
4,31,0,2,0.0


In [17]:
x_norm = StandardScaler().fit_transform(x)
x_norm = pd.DataFrame(x_norm)
x_norm.head()

Unnamed: 0,0,1,2,3
0,-2.542347,0.61667,-1.020259,-0.646074
1,-2.414117,0.61667,-1.020259,-0.646074
2,-2.414117,0.61667,-1.020259,-0.646074
3,-2.285888,-1.621613,-2.05817,-0.646074
4,-2.157658,-1.621613,-1.020259,-0.646074


In [18]:
summary = sm.OLS(y, sm.add_constant(x_norm)).fit()
print(summary.summary())

                            OLS Regression Results                            
Dep. Variable:             num          R-squared:                       0.430
Model:                            OLS   Adj. R-squared:                  0.422
Method:                 Least Squares   F-statistic:                     54.56
Date:                Fri, 16 Jul 2021   Prob (F-statistic):           3.13e-34
Time:                        18:05:17   Log-Likelihood:                -118.78
No. Observations:                 294   AIC:                             247.6
Df Residuals:                     289   BIC:                             266.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3605      0.021     16.912      0.0

In [19]:
ds = x_norm.drop(columns=[0])
ds

Unnamed: 0,1,2,3
0,0.616670,-1.020259,-0.646074
1,0.616670,-1.020259,-0.646074
2,0.616670,-1.020259,-0.646074
3,-1.621613,-2.058170,-0.646074
4,-1.621613,-1.020259,-0.646074
...,...,...,...
289,0.616670,1.055562,2.109958
290,-1.621613,0.017652,-0.646074
291,0.616670,1.055562,2.661164
292,-1.621613,-1.020259,0.456339


In [20]:
x_train, x_test, y_train, y_test = train_test_split(ds, y, test_size=0.5, random_state=0)

lda = LinearDiscriminantAnalysis()
lda_ = lda.fit(x_train, y_train)

In [21]:
lda.decision_function(x_train)

array([ 1.16145329,  1.16145329, -1.97373466, -0.56648845, -3.38098086,
        1.16145329, -2.8423094 , -3.38098086,  0.61357408, -3.38098086,
       -0.56648845, -0.56648845, -3.38098086, -3.38098086,  1.16145329,
       -3.38098086,  1.16145329,  2.02542415, -4.2495556 , -2.8423094 ,
       -4.2495556 , -1.97373466,  2.88939502, -1.97373466,  2.88939502,
        2.02542415, -1.97373466,  1.80284435, -0.56648845,  0.29287855,
       -1.11436766, -3.38098086, -3.38098086, -0.56648845, -3.38098086,
        1.16145329, -2.8423094 ,  8.07322024, -1.97373466, -1.97373466,
       -2.52161386, -3.38098086, -3.38098086,  2.02082028, -1.43506319,
       -2.8423094 ,  2.02542415, -3.38098086, -3.38098086, -3.38098086,
       -2.52161386,  4.61733676, -4.2495556 , -3.38098086,  2.88939502,
       -3.38098086, -4.2495556 , -1.43506319, -0.24579292, -1.97373466,
       -0.24579292, -4.2495556 ,  1.16145329, -2.52161386,  3.75336589,
       -3.38098086, -1.65303912, -1.33234359, -0.56648845, -0.56

In [22]:
pred = lda.predict(x_test)

print(pred)

[0 0 0 1 0 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 1 1 0 0 1 0 1 0 0 0 0 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 1 0 0 1 0 0 1 0 1
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1]


In [23]:
print(classification_report(y_test,pred))
print('Accuracy :', metrics.accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86        92
           1       0.81      0.64      0.71        55

    accuracy                           0.81       147
   macro avg       0.81      0.77      0.79       147
weighted avg       0.81      0.81      0.80       147

Accuracy : 0.8095238095238095
