In [63]:
# Imports

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import numpy as np
from sklearn.feature_selection import SelectFromModel


In [64]:
## PCA
# Project the breast-cancer features onto their leading principal components
# and train a random forest on the projected data.
#
# The scaler and the principal axes are estimated from the TRAINING rows only
# and then applied to every row. Fitting them on the full dataset before
# splitting would leak test-set statistics into the model inputs.

N_COMPONENTS = 2  # number of principal components kept

# Load Raw Data

(X, Y) = load_breast_cancer(return_X_y=True)

# Decide the train/test membership first by splitting row indices; the
# indices are then used to slice both the raw and the projected data.
row_idx = np.arange(X.shape[0])
train_idx, test_idx, y_train, y_test = train_test_split(
    row_idx, Y, test_size=0.3, random_state=0)

# Dimension reduction

sc = StandardScaler()
Z_train = sc.fit_transform(X[train_idx])  # scaling statistics from training rows only
Z = sc.transform(X)                       # every row, scaled with those statistics

# Estimate the correlation matrix (training rows only)
R = np.dot(Z_train.T, Z_train) / Z_train.shape[0]
# Calculate the eigenvalues and eigenvectors (eigh: R is symmetric)
eigen_vals, eigen_vecs = np.linalg.eigh(R)

# Order the eigenpairs from largest to smallest eigenvalue. Ordering by
# argsort avoids sorting (value, vector) tuples, which raises ValueError when
# two eigenvalues tie and Python falls back to comparing the arrays.
order = np.argsort(np.abs(eigen_vals))[::-1]
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in order]

# Projection matrix: one column per retained component.
# NOTE: despite the historical names, W_3D / Z_pca3 hold N_COMPONENTS (= 2)
# components; the names are kept so other cells that use them still work.
W_3D = np.column_stack([pair[1] for pair in eigen_pairs[:N_COMPONENTS]])

Z_pca3 = Z.dot(W_3D)

X_train, X_test = Z_pca3[train_idx], Z_pca3[test_idx]

# Train a RandomForestClassifier as model
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)

y_pred = forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % accuracy)
# Accuracy divided by the number of input features used by the model.
print('Accuracy per feature: %.2f' % (accuracy / X_train.shape[1]))



Accuracy: 0.91
Accuracy per feature: 0.46


In [65]:
## Select importance
# Rank the raw features with a random forest, keep the most important ones,
# and train a second forest on just those columns.

N_SELECTED = 2  # rank of the feature whose importance is used as the cut-off

# Load Raw Data (single load; X and Y are taken from the same Bunch)

init_data = load_breast_cancer()
X, Y = init_data.data, init_data.target

X_train_full, X_test_full, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0)

# Forest on all features; used only to obtain the feature importances.
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train_full, Y_train)

importances = forest.feature_importances_

# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Keep every feature whose importance is at least that of the N_SELECTED-th
# ranked feature (ties at the threshold would keep additional features).
sfm = SelectFromModel(forest,
                      threshold=importances[indices[N_SELECTED - 1]],
                      prefit=True)
Z_forest_alt = sfm.transform(X)

# Apply the column selection to the existing split instead of re-splitting,
# so train/test membership does not depend on repeating the same seed.
X_train = sfm.transform(X_train_full)
X_test = sfm.transform(X_test_full)

# Forest trained on the selected features only.
temp = RandomForestClassifier(criterion='entropy',
                              n_estimators=200,
                              random_state=1,
                              n_jobs=2)
temp.fit(X_train, Y_train)

Y_pred = temp.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy: %.2f' % accuracy)
# Accuracy divided by the number of input features used by the model.
print('Accuracy per feature: %.2f' % (accuracy / X_train.shape[1]))




Accuracy: 0.89
Accuracy per feature: 0.44
