In [None]:
# update for tensorflow
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
import numpy as np
import seaborn as sns
import random as rn
import re
import warnings
import csv

import tensorflow as tf
# Force TensorFlow to a single thread.
# Multiple threads are a potential source of non-reproducible research results.
# NOTE(review): session_conf is only constructed here; no visible cell passes it
# to a TensorFlow session, so confirm it is actually applied elsewhere.
session_conf = tf.compat.v1.ConfigProto( intra_op_parallelism_threads=1, inter_op_parallelism_threads=1 )

# tf.compat.v1.set_random_seed() gives random number generation in the TensorFlow
# backend a well-defined initial state.
# more details: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
tf.compat.v1.set_random_seed(515)

# keras / deep learning libraries
from tensorflow import keras
from tensorflow.keras.models import Sequential # a sequence of neuronal layers
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Dense # layer representing a neuron
from tensorflow.keras.optimizers import Nadam # optimisation algorithm to find the best weights in the model
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.utils import plot_model

# callbacks
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import matplotlib.image as mpimg
import pylab as pl
from pylab import savefig
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; try the new name first and fall back for older releases.
try:
    plt.style.use('seaborn-v0_8-deep')
except OSError:
    plt.style.use('seaborn-deep')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler,MinMaxScaler
from sklearn.metrics import roc_curve, auc

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Figure Plotting libraries
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib import pyplot
import seaborn as sns
sns.set()

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# mlp for the two circles classification problem
from sklearn.datasets import make_circles
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Dense,Dropout
from keras.models import Sequential
from keras.optimizers import SGD
from keras.initializers import RandomUniform

In [None]:
# #############################################################################
# Plot results

def myplot(score,coeff, y, labels=None):
    """Draw a PCA biplot on the current matplotlib axes.

    The first two columns of ``score`` are scattered after each one is
    multiplied by the inverse of its own range. One red arrow per row of
    ``coeff`` is then drawn from the origin and annotated in green.

    Parameters
    ----------
    score : 2-D array, indexed as ``score[:, k]``; columns 0 and 1 are plotted.
    coeff : 2-D array, indexed as ``coeff[i, k]``; row ``i`` gives arrow ``i``.
    y : forwarded to ``plt.scatter`` as the colour argument ``c``.
    labels : optional sequence indexed by arrow number. When ``None`` the
        arrows are annotated "Var1", "Var2", ...
    """
    pc1 = score[:, 0]
    pc2 = score[:, 1]
    arrow_count = coeff.shape[0]

    # Rescale each axis by the inverse of its range so the points fit the
    # fixed [-1, 1] window set below.
    pc1_scale = 1.0 / (pc1.max() - pc1.min())
    pc2_scale = 1.0 / (pc2.max() - pc2.min())
    plt.scatter(pc1 * pc1_scale, pc2 * pc2_scale, c=y)

    for k in range(arrow_count):
        plt.arrow(0, 0, coeff[k, 0], coeff[k, 1], color='r', alpha=0.5)
        caption = "Var"+str(k+1) if labels is None else labels[k]
        # Place the caption slightly beyond the arrow tip.
        plt.text(coeff[k, 0] * 1.15, coeff[k, 1] * 1.15, caption,
                 color='g', ha='center', va='center')

    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()

In [None]:
# breast data: read the CSV and drop the "id" column in one step
breast_data = pd.read_csv("data/breast_data_full.csv").drop(columns=["id"])
breast_data

We have a total of 30 features.

We start by standardising the data, since PCA's output is influenced by the scale of the features.

In [None]:
from sklearn.preprocessing import StandardScaler

# Every column except the target ("diagnosis") is treated as an input feature.
features = breast_data.drop("diagnosis", axis=1).columns.tolist()

# Standardise each feature so that no feature dominates PCA purely because of
# its scale. (The original passed an undefined lowercase `x` here.)
X = breast_data.loc[:, features].values
X = StandardScaler().fit_transform(X) # normalizing the features

# putting scaled data into dataframe format
normalised_data = pd.DataFrame(X,columns=features)
# Re-attach the target taken from the source frame (the original referenced an
# undefined `diagnosis`). to_numpy() assigns by position, so the rows stay
# aligned regardless of the index carried by breast_data.
normalised_data["diagnosis"] = breast_data["diagnosis"].to_numpy()
normalised_data

In [None]:
# Sanity check on the scaled matrix: overall mean and standard deviation
X.mean(), X.std()

## PCA - Principal Component Analysis

When we have a dataset with a lot of features, there are usually many features that are either redundant or noisy. For dimensionality reduction, it is important to identify these types of features, otherwise we might end up removing important information about our data.

**In other words, we should be removing features that will not impact the patterns in data or the prediction results**

Principal Component Analysis (PCA) is an unsupervised learning method that can help us reduce the dimensionality of the space without impacting too much on the prediction results. PCA is especially important for a problem called *The Curse of Dimensionality*.

### The Curse of Dimensionality

The Curse of Dimensionality refers to various phenomena that arise when analysing and organising data in high-dimensional spaces, but do not occur in low-dimensional settings. Due to the curse of dimensionality, search algorithms suffer from an exponential decrease in performance as the dimension of the metric space increases. Here are two examples of the problem of the curse of dimensionality.

<img src="./graphics/cd2.png" width="700" />

<img src="./graphics/cd1.png" width="700" />

### Correlation Analysis

In [None]:
# Pairwise correlations between the columns of the standardised frame
corr = normalised_data.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once
mask = np.zeros_like(corr)
upper_triangle = np.triu_indices_from(mask)
mask[upper_triangle] = True

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(10, 7))
    ax = sns.heatmap(corr, cmap="YlGnBu", square=True, vmax=1.0, mask=mask)




Things to take into consideration when doing a correlation analysis:
- We prefer variables that are highly correlated with the class variable (in this case, with "diagnosis");
- We want to avoid input variables that are correlated with each other. This means that the variables share similar characteristics and for that reason are redundant. In this case, we should consider removing variables with a correlation higher than 0.6.



### A Method to Search for Important Features

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Model inputs: every column except the target, passed through get_dummies
df = pd.get_dummies(normalised_data.drop("diagnosis", axis=1))

# A fixed random_state keeps the fitted forest repeatable between runs
model = RandomForestRegressor(max_depth=10, random_state=1)
model.fit(df, normalised_data["diagnosis"])

In [None]:
NUM_FEATURES = 10

# Take the names from the frame the model was fitted on, so that position i in
# `importances` is guaranteed to describe `features[i]` (normalised_data also
# contains the target column, which the model never saw as an input).
features = df.columns
importances = model.feature_importances_

# argsort is ascending, so the last NUM_FEATURES positions are the most
# important features. (The original sorted only the first NUM_FEATURES columns,
# which ranks an arbitrary subset rather than selecting the top ones.)
indices = np.argsort(importances)[-NUM_FEATURES:]  # top features

# Ascending order puts the most important feature at the top of the chart.
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

### Dimensionality Reduction with PCA

In [None]:
# scikit-learn's PCA is used here (statsmodels.multivariate.pca.PCA is an alternative)
from sklearn.decomposition import PCA

# Without n_components every principal component is kept
pca = PCA()
pca_result = pca.fit_transform(normalised_data.drop(columns="diagnosis"))
pd.DataFrame(pca_result)  # principal components, one column per component

In [None]:
# One bar per principal component, height = explained variance ratio times 100
index = np.arange(len(pca.explained_variance_ratio_))

plt.figure(figsize=(14, 6))
plt.bar(index, pca.explained_variance_ratio_ * 100)
plt.title('Principal Component Analysis')
plt.xlabel('Principal Component', fontsize=10)
plt.ylabel('Explained Variance', fontsize=10)
plt.show()

#### Method 1: Eigenvalues

In [None]:
# Kaiser criterion: look at the eigenvalues and discard every component whose
# eigenvalue is smaller than 1.
#
# In scikit-learn the eigenvalues of the covariance matrix are exposed as
# `pca.explained_variance_`. `pca.singular_values_` are NOT eigenvalues; the two
# are related by eigenvalue = singular_value**2 / (n_samples - 1), so a cut-off
# of 1 on the singular values is not the Kaiser criterion.
# Both are shown side by side; apply the cut-off to the "eigenvalue" column.
pd.DataFrame({
    "eigenvalue": pca.explained_variance_,
    "singular_value": pca.singular_values_,
})

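Reading the singular values against a threshold of 1, we can remove three components from this data. Note that the Kaiser criterion is formally defined on the eigenvalues (`pca.explained_variance_` in scikit-learn, equal to the squared singular values divided by `n_samples - 1`), which gives a stricter cut. Maybe this is not enough, so let's look at another way.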

#### Method 2: Explanation Variance

The explained variance is a method that allows you to pick the total amount of variance in the dataset that you want the principal components to encode. Usually these cut-off values are 80% or 90%.


In [None]:
# The running total of the explained-variance ratios shows how many leading
# components are needed to reach a chosen share of the total variance
cummulative_var = pca.explained_variance_ratio_.cumsum()
cumulative_variance_explained = pd.DataFrame({'cumulative_var': cummulative_var})
print(cumulative_variance_explained)

If we were to use this method, we have:
- The first four principal components explain a bit more than 80% of the variance,
- The first six principal components would roughly explain 90% of the variance in our dataset.

We can use the first four or first six principal components, depending on the cut-off that we established. For this tutorial, let's use the first four. One can see this clearer in the following visualisation

In [None]:
plt.figure(figsize=(14, 8))

# First curve: variance explained by each component; second: its running total
for curve in (pca.explained_variance_ratio_, np.cumsum(pca.explained_variance_ratio_)):
    plt.plot(range(len(pca.components_)), curve)

plt.title("PCA - Cumulative Explained Variance vs. Component-Explained Variance ")
plt.legend(("Component - Explained Variance", "Cumulative Sum - Explained Variance"))

#### Finalising Analysis

We saw that 4 components are enough to express 80% of the variance in our data, so let's apply PCA with 4 components.

In [None]:
# Refit, keeping only the first four principal components
pca = PCA(n_components=4)
pca_result = pca.fit_transform(normalised_data.drop(columns="diagnosis"))

# Wrap the projected data in a frame with one named column per component
df_res = pd.DataFrame(
    pca_result,
    columns=[
        "principal_component_1",
        "principal_component_2",
        "principal_component_3",
        "principal_component_4",
    ],
)
df_res

Remember in PCA we lose interpretability. By projecting the data into the component with the highest variance, you lose information and consequently interpretability.

In [None]:
# interpretability
import statsmodels.multivariate.pca as st

pca2 = st.PCA( normalised_data.drop("diagnosis", axis=1) , ncomp = 4, method='eig' )
loadings = pca2.loadings
loadings

### PCA on the Web




<img src="./graphics/pca.png" width="700" />

URL here: <a href="https://setosa.io/ev/principal-component-analysis/">PCA example</a>