In [3]:
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import dendrogram

#Pull in iris dataset.
iris_dataset_url = 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv'
iris = pd.read_csv(iris_dataset_url)

print(iris.head())
print(iris.corr().round(2))

#Feature names
ColNs = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']

X = iris[ColNs]
X_sd = iris[ColNs]

#Standardize X_sd
for n in ColNs:
    val = (X[n] - np.mean(X[n])) / np.std(X[n])
    X_sd.loc[:, n] = val

#Determine PCA directions
pca = PCA(n_components=4).fit(X_sd)
Dirs = pca.components_

#Create summary output.
CompNs = ["Comp " + str(i) for i in range(1, 5)]
IdxNs = ['Standard Deviation', 'Proportion of Variance', 'Cumulative Proportion']
Summary = pd.DataFrame(index=IdxNs, columns=CompNs)
ProjectedData = np.dot(X_sd, np.transpose(Dirs))
Sds = np.std(ProjectedData, 0)
Vars = Sds ** 2
Summary.loc['Standard Deviation', :] = Sds
Summary.loc['Proportion of Variance', :] = Vars / sum(Vars)
Summary.loc['Cumulative Proportion', :] = np.cumsum(Summary.loc['Proportion of Variance', :])

print(Summary)

#Graph of cumulative variance explained
fig1 = plt.figure(1)
plt.plot(Vars)
plt.xticks(range(4), CompNs)
plt.show()

#2D Biplot
Ns = np.unique(iris['Name'])
Cs = ['blue', 'red', 'green']
fig2, ax = plt.subplots(num=2)

for i in range(4):
    y1 = Dirs[0, i]
    y2 = Dirs[1, i]
    ax.arrow(0, 0, y1, y2, head_width=0.05)
    ax.annotate(ColNs[i], (y1, y2))

for i in range(3):
    sel = list(iris.loc[iris['Name'] == Ns[i]].index)
    m = ProjectedData[sel, :]
    ax.scatter(m[:, 0], m[:, 1], c=Cs[i], s=100)

ax.set_xlabel('Comp 1')
ax.set_ylabel('Comp 2')
plt.show(block=True)

#3D Biplot
fig3 = plt.figure(3)
ax = fig3.add_subplot(111, projection='3d')

for i in range(3):
    sel = [x for x in range(150) if iris.loc[x, 'Name'] == Ns[i]]
    m = ProjectedData[sel, :]
    ax.scatter(m[:, 0], m[:, 1], m[:, 2], c=Cs[i], s=100)
    ax.set_xlabel('Comp 1')
    ax.set_ylabel('Comp 2')
    ax.set_zlabel('Comp 3')
for i in range(4):
    x = Dirs[0, i]
    y = Dirs[1, i]
    z = Dirs[2, i]
    ax.plot([0, x], [0, y], [0, z], c='black')
    ax.text(x, y, z, ColNs[i], color='black', size=20)
plt.show(block=True)

#KMeans clustering (applied to unstandardized data)
KM3 = KMeans(n_clusters=3)
KM3.fit(X)
labels = KM3.labels_
iris['KMeans'] = labels

#3D plot of PCA-projected points classified according to KMeans
fig4 = plt.figure(4)
ax = fig4.add_subplot(111, projection='3d')
for i in range(3):
    sel = [x for x in range(150) if labels[x] == i]
    m = ProjectedData[sel, :]
    ax.scatter(m[:, 0], m[:, 1], m[:, 2], c=Cs[i], s=100)
    ax.set_xlabel('Comp 1')
    ax.set_ylabel('Comp 2')
    ax.set_zlabel('Comp 3')
ax.set_title('Clusters according to K-Means')
plt.show(block=True)

#Hierarchical clustering (applied to unstandardized data)
linkage_matrix = linkage(X, "ward")

#3D plot of PCA-projected points classified according to hierarchical clustering
fig5 = plt.figure(5)
ddata = dendrogram(linkage_matrix)
plt.show(block=True)

iris['Ward'] = fcluster(linkage_matrix, 9, 'distance')

#Function to replicate R's table function
def table(x, y):
    xuni = np.unique(x)
    yuni = np.unique(y)
    res = pd.DataFrame(index=xuni, columns=yuni)
    for x1 in xuni:
        for y1 in yuni:
            res.loc[x1, y1] = sum((x == x1) & (y == y1))
    return res

#Print tables to inspect how well classification has worked.
print(table(iris['KMeans'], iris['Name']))
print(table(iris['Ward'], iris['Name']))


   SepalLength  SepalWidth  PetalLength  PetalWidth         Name
0          5.1         3.5          1.4         0.2  Iris-setosa
1          4.9         3.0          1.4         0.2  Iris-setosa
2          4.7         3.2          1.3         0.2  Iris-setosa
3          4.6         3.1          1.5         0.2  Iris-setosa
4          5.0         3.6          1.4         0.2  Iris-setosa
             SepalLength  SepalWidth  PetalLength  PetalWidth
SepalLength         1.00       -0.11         0.87        0.82
SepalWidth         -0.11        1.00        -0.42       -0.36
PetalLength         0.87       -0.42         1.00        0.96
PetalWidth          0.82       -0.36         0.96        1.00
                          Comp 1    Comp 2     Comp 3      Comp 4
Standard Deviation       1.70611  0.959803   0.383866    0.143554
Proportion of Variance  0.727705  0.230305  0.0368383  0.00515193
Cumulative Proportion   0.727705   0.95801   0.994848           1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


  Iris-setosa Iris-versicolor Iris-virginica
0           0              48             14
1          50               0              0
2           0               2             36
  Iris-setosa Iris-versicolor Iris-virginica
1          50               0              0
2           0               1             35
3           0              49             15
