In [1]:
## necessary imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## reading the dataset
df1 = pd.read_csv('iris.csv')

In [3]:
df1.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
df1.drop('Id',axis=1,inplace=True)

In [5]:
df1.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# ## encoding the Species feature column
# d = dict()
# d['Iris-setosa'] = 0
# d['Iris-virginica'] = 1
# d['Iris-versicolor'] = 2

# df1['Species'] = df1['Species'].map(d)

In [7]:
# df1.head()

In [8]:
X = df1.drop('Species',axis=1)
y = df1['Species']

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=238, test_size=0.33)

In [10]:
## let's first train the model directly without using any dimensionality reduction technique
## here let's use the random forest classifier model for the training

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=250,n_jobs=-1,max_depth=3)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, n_estimators=250, n_jobs=-1)

In [11]:
rfc_predictions = rfc.predict(X_test)
print(rfc_predictions)

['Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa'
 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'
 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'
 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-versicolor']


In [14]:
## checking the confusion matrix of the predictions
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y_test, rfc_predictions)
print("Confusion Matrix for the model created before applying any kind of dimensionality reduction method:\n")
confusion_matrix(y_test,rfc_predictions, labels = df1['Species'].unique())

Confusion Matrix for the model created before applying any kind of dimensionality reduction method:



array([[16,  0,  0],
       [ 0, 14,  1],
       [ 0,  0, 19]], dtype=int64)

## Using the clustering as a dimesionality reduction method

**Once the dataset has been clustered, it is usually possible to measure each instance's affinity with each cluster (Affinity is  any measure of how well an instance fits into a cluster). Each instace's feature vector x can then be replaced with the vector  od its cluster affinities. If there are k clusters, then this vector is k-dimensional. This vector is typically much lower-dimensional than the original feature vector, but it can preserve enough information for further preprocessing.**

In [16]:
## let's create 3 clusters
from sklearn.cluster import KMeans
k = 3
kmeans = KMeans(n_clusters=k)
kmeans.fit(X_train)
X_train_new = kmeans.transform(X_train)

In [18]:
X_train_new

array([[3.35468965, 0.37538907, 4.76040299],
       [2.33018382, 5.62314329, 0.95309663],
       [0.60558515, 3.71828224, 1.61249258],
       [3.52594878, 0.69305752, 4.87515014],
       [0.81268339, 4.0849479 , 1.40144403],
       [1.61985884, 2.2842889 , 3.16367318],
       [3.17852363, 0.72459271, 4.52396057],
       [0.90205882, 4.03160739, 1.09033388],
       [0.65786008, 3.97333456, 0.97675246],
       [3.44963609, 0.10448423, 4.91870009],
       [3.40563669, 0.33392393, 4.90325111],
       [2.99953304, 6.02770851, 1.59664109],
       [3.66431419, 1.20331865, 5.21670829],
       [0.58088815, 2.96236778, 2.09225324],
       [3.35236652, 0.12961214, 4.80426913],
       [3.35991949, 0.2468136 , 4.8286421 ],
       [0.32988845, 3.5650723 , 1.35442983],
       [0.47347147, 3.39520066, 1.56083929],
       [0.81454132, 4.09939442, 0.99177144],
       [1.6810817 , 4.86733161, 0.62111553],
       [0.60211903, 2.84610764, 2.15651736],
       [1.07961594, 4.48504435, 0.60551966],
       [0.

In [19]:
## now using these values as a new training set
rfc_new = RandomForestClassifier(max_depth=3,n_jobs=-1,n_estimators=250)
rfc_new.fit(X_train_new,y_train)

RandomForestClassifier(max_depth=3, n_estimators=250, n_jobs=-1)

In [21]:
rfc_new_predictions = rfc_new.predict(kmeans.transform(X_test))

In [22]:
rfc_new_predictions

array(['Iris-setosa', 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor'],
      dtype=object)

In [25]:
## checking the confusion matrix of the predictions after replacing feature vectors for instances with the affinity vector
from sklearn.metrics import confusion_matrix

print("Confusion Matrix for the model created after replacing feature vectors for instances with the affinity vector:\n")
confusion_matrix(y_test,rfc_new_predictions, labels = df1['Species'].unique())

Confusion Matrix for the model created after replacing feature vectors for instances with the affinity vector:



array([[16,  0,  0],
       [ 0, 14,  1],
       [ 0,  2, 17]], dtype=int64)

**Here we can see that there are 2 more inaccurate predictions than before. This is due the fact that dimensionality reduction method losses some information. But still 2 inaccurate predictions is not bad**