# Using PCA to Improve Classification Results


# 1. Loading Libraries

In [None]:
# pandas and numpy for data manipulation
import numpy as np
import pandas as pd
# importing matplotlib to support visualization
import matplotlib.pyplot as plt

# importing PCA from sklearn
from sklearn.decomposition import PCA

# importing train_test_split to create the Hold-Out environment
from sklearn.model_selection import train_test_split
# imports Decision tree classifier
from sklearn import tree 
# imports sklearn built-in metrics
from sklearn import metrics
# imports preprocessing functions from sklearn
from sklearn import preprocessing


# 2. Loading Data set

In [None]:
# Read heart.csv into a DataFrame
heart_csv = "Data sets/heart.csv"
dataset = pd.read_csv(heart_csv)
# Report column dtypes / non-null counts, then display summary statistics
dataset.info()
dataset.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


The data set has 303 rows and 14 columns: 13 indicator features plus the class variable `target`. The statistical properties (means and standard deviations) of the feature values differ a lot. Hence, it is important to first standardize the data set. 

# 3. Standardize Data set &  Creating Train and Test sets under Hold-Out Method

In [None]:
# Class labels: the column at position 13 (`target`)
My_data_target = dataset.iloc[:, 13]

# Indicator features: the first 13 columns (positions 0-12)
My_data = dataset.iloc[:, :13]

# Standardize every indicator feature with sklearn's StandardScaler
# NOTE(review): the scaler is fitted on all rows before the split, so rows
# that end up in the test set also influence the scaling statistics.
Standardize_dataset = preprocessing.StandardScaler()
My_dataset = pd.DataFrame(Standardize_dataset.fit_transform(My_data))

# Hold-out split; random_state is fixed so the same rows are picked each run
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training -
# confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    My_dataset,
    My_data_target,
    test_size=0.8,
    random_state=10,
)

print("The sample training data without target feature\n")
print(X_train.head())
print("\nThe sample with only target feature\n")
print(Y_train.head())

The sample training data without target feature

           0         1         2         3         4         5         6   \
134 -1.474158 -1.468418  0.032031 -0.321189  1.154427 -0.417635  0.898962   
143  1.393352 -1.468418 -0.938515 -1.463447 -0.449589 -0.417635  0.898962   
253  1.393352  0.681005 -0.938515 -1.806125  1.019148 -0.417635 -1.005832   
205 -0.260980  0.681005 -0.938515 -0.206964  0.168827 -0.417635  0.898962   
266  0.069886 -1.468418 -0.938515  2.762907  1.560262 -0.417635  2.803756   

           7         8         9         10        11        12  
134  0.583939 -0.696631 -0.896862  0.976352 -0.714429 -0.512922  
143 -0.334401 -0.696631 -0.638053  0.976352  1.244593 -0.512922  
253 -1.077820  1.435481 -0.120436 -0.649113  1.244593 -0.512922  
205  0.496478  1.435481 -0.896862  0.976352  0.265082  1.123029  
266 -1.427664  1.435481  2.036303 -0.649113 -0.714429 -0.512922  

The sample with only target feature

134    1
143    1
253    0
205    0
266    0
Name: tar

The feature values are standardized.  

# 4. Building Decision Tree Classifier using Train Set

In [None]:
# Create a Decision tree Classifier.
# random_state is fixed because scikit-learn randomly permutes the features
# at each split; without a seed, repeated runs can build different trees and
# therefore report different accuracies.
DTmodel_1 = tree.DecisionTreeClassifier(random_state=10)

# Train the model using the training set (fit() returns the fitted estimator)
DTfitted_1 = DTmodel_1.fit(X_train, Y_train)



# 5. Testing Model Performance on Test Set

In [None]:

# Class labels predicted by the fitted tree for the held-out test rows
DT_predictions_1 = DTfitted_1.predict(X_test)


# 6. Evaluating Model Performance

In [None]:
# Model accuracy as a whole-number percentage.
# Scale to percent first and round afterwards: rounding the fraction and then
# multiplying by 100 can print floating-point artefacts such as
# 56.99999999999999.
accuracy_pct_1 = round(metrics.accuracy_score(Y_test, DT_predictions_1) * 100, 0)
print("Accuracy:", accuracy_pct_1, "%")

print("---------------")

# Confusion matrix of true labels versus predictions
print("Confusion matrix")

print("---------------")

print(metrics.confusion_matrix(Y_test, DT_predictions_1))

# Detailed per-class report (precision, recall, f1-score, support)
target_names = ['class 0', 'class 1']

print("---------------")

print("Classification report", metrics.classification_report(Y_test, DT_predictions_1, target_names=target_names))

Accuracy: 69.0 %
---------------
Confusion matrix
---------------
[[72 43]
 [33 95]]
---------------
Classification report               precision    recall  f1-score   support

     class 0       0.69      0.63      0.65       115
     class 1       0.69      0.74      0.71       128

    accuracy                           0.69       243
   macro avg       0.69      0.68      0.68       243
weighted avg       0.69      0.69      0.69       243



Accuracy of the model has turned out to be 69%. It is a weak performance. In the next step, we will first transform the data set using PCA, and the classifier model will then be retrained on the data set transformed by PCA. 

# 7. Applying PCA

In [None]:
# A fractional n_components (0.80) asks PCA to reduce dimensionality while
# preserving 80% of the variance of the original data set.
# NOTE(review): PCA is fitted on the full standardized data set, i.e. it also
# sees the rows that later land in the test split.
pca = PCA(n_components=0.80).fit(My_dataset)
# Project the standardized data onto the principal components found above
data_new = pca.transform(My_dataset)
Data_new = pd.DataFrame(data_new)


# 8. Printing Variance Explained by each PC 

In [None]:
# One row of components_ per retained principal component
n_pcs = pca.components_.shape[0]
# Share of the total variance captured by the retained PCs, in percent
explained_pct = np.sum(pca.explained_variance_ratio_) * 100
separator = "............................."

print("Number of PC's discovered by PCA:", n_pcs)
print(separator)
print("Total variance explained by %d Pc's is  %2.2f %% \n " % (n_pcs, explained_pct))
print(separator)
print("Printing discovered PC's \n")
All_PCs = pd.DataFrame(pca.components_)
# Transposed so that rows are original features and columns are PCs
loadings = All_PCs.T
print(loadings)
print(separator)
print("Printing size of components \n")
print(loadings.shape)

Number of PC's discovered by PCA: 9
.............................
Total variance explained by 9 Pc's is  85.03 % 
 
.............................
Printing discovered PC's 

           0         1         2         3         4         5         6  \
0   0.314203  0.406149 -0.094077 -0.020662 -0.307153 -0.128296 -0.223730   
1   0.090838 -0.377792  0.554849 -0.255309  0.050704  0.054969 -0.162507   
2  -0.274607  0.297266  0.356974  0.287900  0.163179 -0.193411 -0.215390   
3   0.183920  0.438187  0.203849  0.022601  0.188138 -0.179460  0.332763   
4   0.117375  0.364514 -0.407825 -0.343410  0.320067 -0.104730  0.049329   
5   0.073640  0.317433  0.481736 -0.068605 -0.233442  0.249614  0.510818   
6  -0.127728 -0.220882 -0.089191  0.266096 -0.393667 -0.666813  0.396896   
7  -0.416498  0.077876  0.158255 -0.184125  0.323284 -0.120984  0.101473   
8   0.361267 -0.263118 -0.126356 -0.115056  0.034536  0.230699  0.449919   
9   0.419639 -0.052255  0.110343  0.326296  0.250579 -0.170080 -0.1

PCA discovered 9 PCs that explain 85% of the variance of the original data set. The resulting component matrix is of size 13 x 9, where each of the 9 discovered PCs is formed by a linear combination of the 13 features present in the data set.

# 9. Creating Train and Test set on transformed Dataset by PCA

In [None]:

# Hold-out split of the PCA-transformed data, using the same test_size and
# random_state as the split of the standardized data
X_train_1, X_test_1, Y_train_1, Y_test_1 = train_test_split(
    Data_new,
    My_data_target,
    test_size=0.8,
    random_state=10,
)

print("The sample training data without target feature\n")
print(X_train_1.head())
print("\nThe sample with only target feature\n")
print(Y_train_1.head())

The sample training data without target feature

            0         1         2         3         4         5         6  \
134 -2.225653  0.273038 -1.668147 -0.090445  0.276002 -0.415002  0.768777   
143 -0.430092  0.163350 -1.620222 -0.293148 -2.576352 -0.511867 -0.906733   
253  2.032234 -0.567301 -1.136519 -0.996070 -0.947685  1.382028 -1.317668   
205 -0.039806 -1.447947 -0.409823 -1.789855 -0.278015 -0.690443  1.056453   
266  2.353844  0.740692 -2.010684  1.921939  0.163153 -1.950980  2.769245   

            7         8  
134  0.707948  0.408110  
143  0.418220  0.179649  
253  0.779066 -0.755247  
205 -0.412408 -0.092749  
266 -0.777634  0.967585  

The sample with only target feature

134    1
143    1
253    0
205    0
266    0
Name: target, dtype: int64


# 10. Creating Decision Tree classifier on the Train Set

In [None]:
# Create a Decision tree Classifier for the PCA-transformed data.
# random_state is fixed because scikit-learn randomly permutes the features
# at each split; without a seed, repeated runs can build different trees and
# therefore report different accuracies.
DTmodel_2 = tree.DecisionTreeClassifier(random_state=10)

# Train the model using the training set (fit() returns the fitted estimator)
DTfitted_2 = DTmodel_2.fit(X_train_1, Y_train_1)

# 11. Testing Classifier on Test set

In [None]:

# Class labels predicted by the second tree for the PCA-transformed test rows
DT_predictions_2 = DTfitted_2.predict(X_test_1)

# 12. Evaluating Performance of Classifier

In [None]:
# Model accuracy as a whole-number percentage.
# Scale to percent first and round afterwards: rounding the fraction and then
# multiplying by 100 can print floating-point artefacts such as
# 56.99999999999999.
accuracy_pct_2 = round(metrics.accuracy_score(Y_test_1, DT_predictions_2) * 100, 0)
print("Accuracy:", accuracy_pct_2, "%")

print("---------------")

# Confusion matrix of true labels versus predictions
print("Confusion matrix")

print("---------------")

print(metrics.confusion_matrix(Y_test_1, DT_predictions_2))

# Detailed per-class report (precision, recall, f1-score, support)
target_names = ['class 0', 'class 1']

print("---------------")

print("Classification report", metrics.classification_report(Y_test_1, DT_predictions_2, target_names=target_names))

Accuracy: 74.0 %
---------------
Confusion matrix
---------------
[[85 30]
 [33 95]]
---------------
Classification report               precision    recall  f1-score   support

     class 0       0.72      0.74      0.73       115
     class 1       0.76      0.74      0.75       128

    accuracy                           0.74       243
   macro avg       0.74      0.74      0.74       243
weighted avg       0.74      0.74      0.74       243



As indicated by the results, the performance of the model has improved from 69% to 74% when PCA was used to transform the data set. 