In [None]:
# Mount Google Drive (Colab only) at /content/drive; the data file and the saved
# model / plot files used below are referenced under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
def preprocess(path='/content/drive/My Drive/ML_assignment_4/iris.data', test_size=0.30, random_state=2):
  """Load the iris CSV and split it into training and validation arrays.

  The file is read without a header row. Column 4 holds the class label, which
  is converted to integer category codes; the remaining columns are the features.

  Args:
    path: Location of the comma-separated data file. Defaults to the Drive
      path this notebook was written against.
    test_size: Forwarded to ``train_test_split`` (0.30 keeps 30% for validation).
    random_state: Seed forwarded to ``train_test_split`` so the split is repeatable.

  Returns:
    Tuple ``(x_train, x_valid, y_train, y_valid)`` of NumPy arrays.
  """
  raw = pd.read_csv(path, sep=',', header=None)
  # Encode the label column as integer codes (one code per distinct label value).
  labels = raw[4].astype('category').cat.codes
  # Build a separate feature frame instead of dropping in place, so `raw` is untouched.
  features = raw.drop(columns=4)
  x_train, x_valid, y_train, y_valid = train_test_split(
      features, labels, test_size=test_size, random_state=random_state)
  return x_train.to_numpy(), x_valid.to_numpy(), y_train.to_numpy(), y_valid.to_numpy()

In [4]:
def elbow_method(x_train, max_clusters=9, random_state=2, save_path=None):
  """Plot KMeans inertia against the number of clusters (elbow curve).

  A KMeans model is fitted for every cluster count from 1 up to ``max_clusters``
  and its ``inertia_`` is plotted against that count.

  Args:
    x_train: Training feature matrix, one row per sample.
    max_clusters: Largest cluster count to try (inclusive). It is capped at the
      number of samples, since KMeans cannot fit more clusters than samples.
    random_state: Seed passed to every KMeans fit so the curve is reproducible
      between runs.
    save_path: Optional file path; when given, the figure is saved there before
      being shown.
  """
  upper = min(max_clusters, len(x_train))
  cluster_counts = list(range(1, upper + 1))
  inertias = []
  for k in cluster_counts:
    fitted = KMeans(n_clusters=k, random_state=random_state).fit(x_train)
    inertias.append(fitted.inertia_)
  plt.plot(cluster_counts, inertias)
  plt.xlabel('Number of clusters')
  plt.ylabel('Sum Squared Error')
  if save_path is not None:
    plt.savefig(save_path)
  plt.show()

In [5]:
def train_model(x_train, n_clusters=3, random_state=2,
                model_path='/content/drive/My Drive/ML/models/optimal_kmeans.pkl'):
  """Fit a KMeans model on the training features and save it with joblib.

  Args:
    x_train: Training feature matrix, one row per sample.
    n_clusters: Number of clusters to fit (defaults to 3).
    random_state: Seed for KMeans initialisation, so a refit yields the same
      model instead of a different cluster numbering on every run.
    model_path: Destination of the joblib dump; other functions in this
      notebook load the model from the default location.
  """
  model = KMeans(n_clusters=n_clusters, max_iter=1000, random_state=random_state).fit(x_train)
  joblib.dump(model, model_path)

def scatter_plot(x_train):
  """Plot the cached 2-D projection of the training data, coloured by KMeans cluster.

  Both the fitted model and the projected points are loaded from joblib files on
  Drive. ``x_train`` is accepted for call compatibility but is not read here: the
  projection file is expected to already exist (the original inline notes show it
  was created with ``PCA(n_components=2).fit_transform(x_train)`` and dumped to
  ``pca_kmeans.pkl``).

  Clusters 0, 1 and 2 are drawn in yellow, blue and red. The figure is saved to
  the Drive plots folder and then shown.

  Args:
    x_train: Training feature matrix (unused, see above).
  """
  model = joblib.load('/content/drive/My Drive/ML/models/optimal_kmeans.pkl')
  points = joblib.load('/content/drive/My Drive/ML/models/pca_kmeans.pkl')
  # Only the labels that line up with the projected rows are used.
  labels = np.asarray(model.labels_)[:points.shape[0]]

  palette = ((0, 'yellow'), (1, 'blue'), (2, 'red'))
  handles = []
  names = []
  for cluster_id, colour in palette:
    members = labels == cluster_id
    if not members.any():
      # An empty cluster has nothing to draw and must not reach the legend;
      # referencing a handle that was never created used to crash here.
      continue
    # One scatter call per cluster instead of one per point.
    handles.append(plt.scatter(points[members, 0], points[members, 1], color=colour))
    names.append('Cluster %d' % cluster_id)
  plt.legend(handles, names)
  plt.xlabel('Feature 1')
  plt.ylabel('Feature 2')
  plt.savefig('/content/drive/My Drive/ML/plots/Q1_c.jpg')
  plt.show()

In [6]:
def _majority_class_by_cluster(cluster_ids, class_ids):
  """Return a dict mapping each cluster id to the most frequent class id among its members."""
  cluster_ids = np.asarray(cluster_ids)
  class_ids = np.asarray(class_ids)
  # Size the vote table from the data rather than assuming exactly three classes.
  n_classes = int(class_ids.max()) + 1 if class_ids.size else 0
  mapping = {}
  for cluster in np.unique(cluster_ids):
    votes = np.bincount(class_ids[cluster_ids == cluster], minlength=n_classes)
    # argmax returns the first maximum, so ties go to the lowest class id.
    mapping[int(cluster)] = int(votes.argmax())
  return mapping


def accuracy_calculation(x_train,x_valid,y_train,y_valid):
  """Print classification accuracy obtained by labelling each KMeans cluster with its majority class.

  The saved KMeans model is loaded from Drive. Each cluster is assigned the class
  that occurs most often among the training samples in that cluster, using
  ``model.labels_`` paired row-for-row with ``y_train`` (so the model is assumed
  to have been fitted on this same ``x_train``). Samples are then classified by
  predicting their cluster and looking up that cluster's class.

  The mapping is keyed by cluster id, so it stays correct when the ids present in
  ``model.labels_`` are not the contiguous range 0..k-1.

  Args:
    x_train: Training feature matrix.
    x_valid: Validation feature matrix.
    y_train: Integer class ids for ``x_train``.
    y_valid: Integer class ids for ``x_valid``.

  Returns:
    None. The train and validation accuracies are printed as percentages.
  """
  model = joblib.load('/content/drive/My Drive/ML/models/optimal_kmeans.pkl')
  cluster_to_class = _majority_class_by_cluster(model.labels_, y_train)

  def classify(features):
    # -1 marks a cluster with no training members; it never matches a real class.
    return np.array([cluster_to_class.get(int(c), -1) for c in model.predict(features)])

  train_accuracy = accuracy_score(y_train, classify(x_train))
  print('Accuracy Train: ',train_accuracy*100)

  valid_accuracy = accuracy_score(y_valid, classify(x_valid))
  print('Accuracy Validation: ',valid_accuracy*100)

In [None]:
# Load the data and split it into training / validation arrays; these four
# names are reused by the cells below.
x_train,x_valid,y_train,y_valid = preprocess()
# Recorded by the author: x_train.shape == (105, 4), len(x_valid) == 45, unique labels == [0, 1, 2].
# Plot KMeans inertia against the number of clusters to pick k.
elbow_method(x_train)

In [None]:
# Training is left disabled here: scatter_plot loads the previously saved model
# from Drive. Uncomment the next line to refit and overwrite that saved model.
#train_model(x_train)
scatter_plot(x_train)

In [None]:
# Print clustering accuracy on the training and validation splits using the saved KMeans model.
accuracy_calculation(x_train,x_valid,y_train,y_valid)