We will load in the processed dataset from the previous notebook.

In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [11]:
# Lift pandas' display limits: no cap on the number of columns or rows shown,
# and no truncation of the text inside a cell.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [12]:
# Read the processed heart-disease table from CSV into a DataFrame.
proc_df = pd.read_csv(
    filepath_or_buffer='/media/veracrypt3/Cloud/Datasets/Kaggle/heart_processed.csv',
)

In [13]:
# Preview the first five rows to sanity-check the loaded columns.
proc_df.head(n=5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,F,M,ASY,ATA,NAP,TA,LVH,Normal,ST,N,Y,Down,Flat,Up
0,40,140,289.0,0,172,0.0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180.0,0,156,1.0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283.0,0,98,0.0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214.0,0,108,1.5,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195.0,0,122,0.0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


# Create X and Y datasets

In [14]:
# Feature matrix: every column of proc_df except the 'HeartDisease' target,
# converted to a NumPy array.
is_feature = proc_df.columns != 'HeartDisease'
X = proc_df.loc[:, is_feature].to_numpy()
X[:5]

array([[ 40. , 140. , 289. ,   0. , 172. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 49. , 160. , 180. ,   0. , 156. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          1. ,   0. ],
       [ 37. , 130. , 283. ,   0. ,  98. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ,   0. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 48. , 138. , 214. ,   0. , 108. ,   1.5,   1. ,   0. ,   1. ,
          0. ,   0. ,   0. ,   0. ,   1. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ],
       [ 54. , 150. , 195. ,   0. , 122. ,   0. ,   0. ,   1. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          0. ,   1. ]])

In [15]:
# Target vector: the 'HeartDisease' column as a NumPy array.
Y = proc_df['HeartDisease'].to_numpy()
Y[:5]

array([0, 1, 0, 1, 0])

In [16]:
# Standardise each feature column to zero mean and unit variance.
# Requires `from sklearn import preprocessing` in the imports cell; without it
# this cell raises NameError on a fresh kernel.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test-set statistics influence the scaling -- consider
# fitting on the training rows only.
X = preprocessing.StandardScaler().fit_transform(X)
X[:5]

array([[-1.43220634,  0.41462669,  0.94076249, -0.55173333,  1.38333943,
        -0.83150225, -0.51630861,  0.51630861, -1.08542493,  2.07378351,
        -0.53152374, -0.22981048, -0.50782627,  0.81501339, -0.49078105,
         0.82431012, -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.47805725,  1.52635965, -0.99871403, -0.55173333,  0.75473573,
         0.10625149,  1.9368261 , -1.9368261 , -1.08542493, -0.48221041,
         1.88138352, -0.22981048, -0.50782627,  0.81501339, -0.49078105,
         0.82431012, -0.82431012, -0.27160724,  0.99891008, -0.86988791],
       [-1.75025603, -0.14123979,  0.83400232, -0.55173333, -1.52395266,
        -0.83150225, -0.51630861,  0.51630861, -1.08542493,  2.07378351,
        -0.53152374, -0.22981048, -0.50782627, -1.22697371,  2.0375685 ,
         0.82431012, -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.58407381,  0.30345339, -0.3937397 , -0.55173333, -1.13107535,
         0.57512835,  1.9368261 , -1.9368261 ,  

# Train/test split dataset

In [17]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)

# Report the shape of each partition as a quick sanity check.
for split_label, split_X, split_Y in (
    ('Train set:', X_train, Y_train),
    ('Test set:', X_test, Y_test),
):
    print(split_label, split_X.shape, split_Y.shape)

Train set: (733, 20) (733,)
Test set: (184, 20) (184,)


# Make polynomial model

In [18]:
# Degree-2 expansion of the training features: a bias column, the original
# features, their squares and all pairwise products.
poly = PolynomialFeatures(degree=2)
train_x_poly = poly.fit(X_train).transform(X_train)
train_x_poly

array([[ 1.        ,  0.68812497,  1.52635965, ...,  0.99782135,
        -0.86893981,  0.75670498],
       [ 1.        ,  0.15804214,  0.41462669, ...,  0.99782135,
        -0.86893981,  0.75670498],
       [ 1.        ,  0.47609184, -1.80883924, ...,  1.00218341,
        -1.1508277 ,  1.32151899],
       ...,
       [ 1.        ,  2.17235688,  0.30345339, ...,  1.00218341,
        -1.1508277 ,  1.32151899],
       [ 1.        , -0.16000755,  0.41462669, ...,  0.99782135,
        -0.86893981,  0.75670498],
       [ 1.        , -0.79610695, -1.25297276, ...,  1.00218341,
        -1.1508277 ,  1.32151899]])

In [20]:
# Fit a linear regression on the degree-2 feature matrix.
clf = linear_model.LinearRegression().fit(train_x_poly, Y_train)
# fit() returns the estimator itself, so train_y is just another reference to
# the fitted model -- it does not hold predictions.
train_y = clf

print('Coefficients:', clf.coef_)
print('Intercept:', clf.intercept_)

Coefficients: [ 9.18414251e+11  4.14270243e-02  3.38287296e-02 -2.99712294e-02
 -4.40540003e+12 -3.84229770e-02  3.08576313e-02 -1.94645484e+12
 -3.69648861e+12  2.05273384e+12 -5.65092104e+12  2.76124119e+12
  9.46899480e+12 -1.81039529e+12  3.50385863e+12 -3.90875920e+12
 -2.90480207e+12  1.57902477e+12 -7.38381977e+12 -1.11606486e+13
 -1.02349326e+13  2.34375000e-02 -5.37109375e-03  4.76074219e-03
 -2.28881836e-03  1.34277344e-02  1.01928711e-02 -7.71652523e+11
 -7.71652523e+11  5.39047432e+11  4.23208727e+11  4.48305225e+11
  2.36120171e+11 -3.89986656e+12 -4.73067813e+12 -3.82066781e+12
  5.16720050e+10  5.16720050e+10  7.94904991e+11  1.57128547e+12
  1.55614411e+12  1.22070312e-03  1.56555176e-02 -1.08642578e-02
 -2.13012695e-02  1.06201172e-02 -8.72287504e+11 -8.72287504e+11
 -4.69244640e+12 -3.68406219e+12 -3.90252901e+12 -2.05544295e+12
  2.82044232e+12  3.42129779e+12  2.76316459e+12 -1.93702608e+11
 -1.93702608e+11 -8.25005331e+11 -1.63078469e+12 -1.61506998e+12
  7.0800781

# Evaluation

In [26]:
# Expand the held-out features with the transformer that was already fitted on
# the training data: transform(), not fit_transform(), so that no preprocessing
# step is ever re-fitted on the test set.
test_x_poly = poly.transform(X_test)
test_y = clf.predict(test_x_poly)  # model predictions for the test rows

# Compare the predictions against the true labels.
print(f'Mean absolute error: {np.mean(np.absolute(test_y - Y_test)):.2f}')
# The mean of the squared residuals is the mean squared error, not a residual sum.
print(f'Mean squared error (MSE): {np.mean((test_y - Y_test) ** 2):.2f}')
print(f'R2-score: {r2_score(Y_test, test_y)}')

Mean absolute error: 0.27
Residual sum of squares (MSE): 0.15
R2-score: 0.3938756546797282


Our R² score is only 0.39, which is not very good, so degree-2 polynomial regression doesn't seem to perform as well as plain linear regression.