<a href="https://colab.research.google.com/github/sb2356-iiitr/ML_Projects/blob/main/PrincipalComponentAnalysis/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Principal Component Analysis
Apply PCA to the Wisconsin breast cancer dataset and compare a Random Forest classifier's results with and without the dimensionality reduction.

In [None]:
# Load the Wisconsin breast cancer dataset and build the feature/target frames
from sklearn.datasets import load_breast_cancer
import pandas as pd

lbc = load_breast_cancer()

# X: one column per feature, labelled with the dataset's feature names
X = pd.DataFrame(data=lbc.data, columns=lbc.feature_names)
# Y: single-column frame named 'type' holding the target values
Y = pd.DataFrame(data=lbc.target, columns=['type'])

In [None]:
# Show the description text bundled with the dataset
print(lbc.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

### Perform prediction without PCA

In [None]:
# Hold out 30% of the rows for testing; stratify on Y so both subsets keep
# the same class proportions. The fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
    stratify=Y,
)

In [None]:
# Random Forest Classifier trained on the original (non-PCA) features
from sklearn.ensemble import RandomForestClassifier

rfc1 = RandomForestClassifier(random_state=1234)
# Y_train is a single-column DataFrame; flatten it to a 1-D array so fit()
# does not emit a DataConversionWarning about a column-vector y.
rfc1.fit(X_train, Y_train.values.ravel())
Y_predict1 = rfc1.predict(X_test)

  """


In [None]:
# Evaluate the baseline model: confusion matrix of true vs. predicted labels
# (rows are the true classes, columns the predicted classes)
from sklearn.metrics import confusion_matrix

cm1 = confusion_matrix(y_true=Y_test, y_pred=Y_predict1)
cm1

array([[ 61,   3],
       [  0, 107]])

In [None]:
# Mean accuracy of the baseline model on the held-out test rows
score1 = rfc1.score(X=X_test, y=Y_test)
score1

0.9824561403508771

### Perform PCA and compare the results

In [None]:
# Standardize the features (zero mean and unit variance per column)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Learn the scaling statistics from the training rows only, so that the test
# rows do not leak into preprocessing. Y_train keeps the original row labels,
# and every split in this notebook uses the same seed and stratification, so
# Y_train.index always identifies the training rows.
scaler.fit(X.loc[Y_train.index])
# Transform every row with the training statistics; column means over the
# full dataset are therefore close to, but not exactly, zero.
X_scaled = scaler.transform(X)

In [None]:
# Sanity check: the mean of the first scaled feature should be close to 0
X_scaled[:, 0].mean()

-3.153111437248248e-15

In [None]:
# Project the standardized features onto the first 5 principal components
from sklearn.decomposition import PCA

# Seed the estimator: depending on the scikit-learn version, the default
# 'auto' solver may choose a randomized SVD for data of this size, which would
# make the projection (and every downstream score) vary between runs.
pca = PCA(n_components=5, random_state=1234)
# Fit the components on the training rows only to avoid leaking test data.
# X and Y use the default 0..n-1 index, so the labels in Y_train.index are
# also the row positions in the X_scaled array.
pca.fit(X_scaled[Y_train.index.to_numpy()])
X_pca = pca.transform(X_scaled)

In [None]:
# Re-split using the PCA-projected features, with the same test fraction,
# seed and stratification as the earlier split
X_train, X_test, Y_train, Y_test = train_test_split(
    X_pca,
    Y,
    test_size=0.3,
    random_state=1234,
    stratify=Y,
)

In [None]:
# Default Random Forest trained on the PCA-projected features
rfc2 = RandomForestClassifier(random_state=1234)
# Y_train is a single-column DataFrame; flatten it to a 1-D array so fit()
# does not emit a DataConversionWarning about a column-vector y.
rfc2.fit(X_train, Y_train.values.ravel())
Y_predict2 = rfc2.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Score and evaluate the PCA-based model: mean accuracy on the test rows and
# the confusion matrix of true vs. predicted labels
score2 = rfc2.score(X=X_test, y=Y_test)
cm2 = confusion_matrix(y_true=Y_test, y_pred=Y_predict2)
print(cm2)
print(score2)

[[ 60   4]
 [  1 106]]
0.9707602339181286
