In [12]:
%matplotlib qt4
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
from prettytable import PrettyTable
from sklearn.decomposition import KernelPCA as kpca
from sklearn.metrics import roc_curve, auc,roc_auc_score,f1_score,confusion_matrix
import numpy as np
print (__doc__)

Automatically created module for IPython interactive environment


### Loading Data

In [13]:
filename="../preprocessed/amlodipine.csv"
f = open(filename)
f.readline()  # skip the header
data =  np.loadtxt(fname = f, delimiter = ',',dtype='double')
Y = data[:,0]
X = data[:, 1:data.shape[1]-1]  # we only take the first two features.

In [14]:
print 'X: ',X
print 'Y: ',Y

X:  [[   1.      57.     158.    ...,    1.      70.       0.416]
 [   1.      65.     154.    ...,    1.      66.       0.376]
 [   1.      54.     150.    ...,    1.      70.       0.432]
 ..., 
 [   1.      65.     154.    ...,    2.      54.      11.53 ]
 [   1.      54.     150.    ...,    4.      50.       3.174]
 [   0.      70.     153.    ...,    3.      53.       9.53 ]]
Y:  [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  3.  3.  3.  3.
  3.  3.  3.  3.  3.  3.  3.

In [15]:
#Normalizing to Zero Mean Unit Variance
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std

In [16]:
# Post Normalization
print 'Normalization gives: ',X

Normalization gives:  [[ 0.89834155 -0.08377075  0.01258515 ..., -0.89457402  0.18622843
  -0.59176057]
 [ 0.89834155  0.75292139 -0.3862672  ..., -0.89457402 -0.1275798
  -0.60886432]
 [ 0.89834155 -0.39753031 -0.78511956 ..., -0.89457402  0.18622843
  -0.58491907]
 ..., 
 [ 0.89834155  0.75292139 -0.3862672  ...,  0.14071951 -1.06900449
   4.1605154 ]
 [ 0.89834155 -0.39753031 -0.78511956 ...,  2.21130657 -1.38281272
   0.58754275]
 [-1.11316236  1.27585398 -0.48598029 ...,  1.17601304 -1.14745655
   3.30532807]]


### Principal Component Analysis of Dataset

In [17]:
# To getter a better understanding of interaction of the dimensions
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(18, 10))
ax = Axes3D(fig, elev=-150, azim=110)
# Replace mle with any number of components desired for analysis

PCA_var = PCA(n_components='mle')
X_reduced =PCA_var.fit_transform(X)
print 'The variance contributed by each PC by order is: ',PCA_var.explained_variance_ratio_
print 'MLE Method Parameters Chosen: ',PCA_var.n_components_
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=Y)
# for Default colormap: Class 0: Blue Class 1: Green Class 2: Red
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
ax.legend(loc="lower right")
plt.show()

The variance contributed by each PC by order is:  [  1.60856910e-01   8.71654525e-02   6.86349911e-02   6.19210877e-02
   5.44891206e-02   5.23773270e-02   4.35501506e-02   4.08553483e-02
   3.85279818e-02   3.39005974e-02   3.15219521e-02   2.94100296e-02
   2.77903367e-02   2.40888363e-02   2.29576225e-02   2.16936247e-02
   2.14494516e-02   1.96518774e-02   1.82620053e-02   1.59779329e-02
   1.49285534e-02   1.38873896e-02   1.35743787e-02   1.26807516e-02
   1.06615542e-02   9.87597698e-03   9.39382435e-03   8.56138894e-03
   7.86226653e-03   6.45136679e-03   6.25794763e-03   5.62896147e-03
   5.05904596e-03   9.39580391e-05]
MLE Method Parameters Chosen:  34


### Using varying kernels allows for Non-linear manifold representations

## Kernel PCA

In [18]:
fig = plt.figure(1, figsize=(18, 10))
ax = Axes3D(fig, elev=-150, azim=110)
PCA_var= kpca(n_components=14,kernel='rbf')
X_reduced =PCA_var.fit_transform(X)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=Y)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
plt.show()

### Classification

### Defining Confusion Matrix Function

In [21]:
class_labels_=np.array(['Group 1','Group 2','Group 3'])
def plot_confusion_matrix(cm,name,title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(str(title,' for Classifier: ',name))
    plt.colorbar()
    tick_marks = np.arange(len(class_labels_))
    plt.xticks(tick_marks,class_labels_, rotation=45)
    plt.yticks(tick_marks,class_labels_)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [22]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

f = open(filename)
f.readline()  # skip the header
data =  np.loadtxt(fname = f, delimiter = ',',dtype='double')
Y = data[:,0]
X = data[:, 1:data.shape[1]-1]  # we only take the first two features.
names = ["Nearest Neighbors", "RBF SVM", "Decision Tree",
         "Random Forest", "Naive Bayes"]
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GaussianNB()]

mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std
# X= PCA(n_components=4).fit_transform(X)
# preprocess dataset, split into training and test part
# standardize

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.3)

# iterate over classifiers
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_predic=clf.predict(X_test)
    print 'Classifier: ',name,' Accuracy: ',score
    cm=confusion_matrix(y_test,y_predic)
    np.set_printoptions(precision=2)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.figure()
    plot_confusion_matrix(cm_normalized,name,title='Normalized confusion matrix')
    plt.show()


Classifier:  Nearest Neighbors  Accuracy:  0.870967741935


TypeError: str() takes at most 1 argument (3 given)