# PART - 1 (K-Means Clustering)

### Task - A

In [None]:
## importing necessary libraries
import numpy as np
import random
import cv2
import matplotlib.pyplot as plt

In [None]:
## loading the image from disk and converting OpenCV's BGR channel order to RGB
img = cv2.imread("/content/test.png")
if img is None:
    # cv2.imread reports a missing/unreadable file by returning None instead of raising,
    # which would otherwise only show up as an obscure error inside cvtColor
    raise FileNotFoundError("could not read image: /content/test.png")
rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
height, width, channels = rgb_image.shape
## extracting the pixels from the image
## reshape visits the pixels in the same row-major order (y outer, x inner) as a
## nested loop would, but without running Python code once per pixel
rgb_data = list(rgb_image.reshape(-1, channels))

In [None]:
len(rgb_data)  ## total no. of pixels

In [None]:
rgb_array = np.array(rgb_data)  ## converting to array

In [None]:
## function to compute the centroid of the data
def computeCentroid(X):
  """Return the mean of X taken along axis 0 (one value per column of a 2-D array)."""
  return np.mean(X, axis=0)

In [None]:
computeCentroid(rgb_array)  ## printing the centroid of the original data

### Task - B

In [None]:
## function to group the clusters
def group_clusters(X, centroids):
  """Assign every row of X to its nearest centroid (Euclidean distance).

  X is indexed as (samples, features) and centroids as (clusters, features).
  Returns an array holding one centroid index per row of X.
  """
  # work in float64: unsigned 8-bit pixel values wrap around on subtraction
  # (10 - 20 becomes 246), which silently corrupts the distances
  samples = np.asarray(X, dtype=np.float64)
  centers = np.asarray(centroids, dtype=np.float64)
  dis = np.linalg.norm(samples[:, np.newaxis] - centers, axis=2)  ## calculating the distances
  groups = np.argmin(dis, axis=1)
  return groups

## functions to assign the new centroids
def new_centroids(X,cluster_group):
    """Return the mean of the rows of X for each label present in cluster_group.

    Rows of the result follow the sorted order of the distinct labels; a label
    that does not occur in cluster_group produces no row.
    """
    present_labels = np.unique(cluster_group)  ## distinct labels, sorted
    per_cluster_means = [X[cluster_group == label].mean(axis=0) for label in present_labels]
    return np.array(per_cluster_means)

def mykmeans(X, no_of_clusters = 3, epochs = 1000):
  """Run k-means on the rows of X.

  Starts from no_of_clusters randomly chosen rows of X, then alternates between
  assigning rows to the nearest centroid and recomputing centroids, for at most
  `epochs` iterations or until the centroids stop changing.

  Returns (cluster_group, centroids): one label per row of X, and a float array
  with exactly no_of_clusters rows. cluster_group is None when epochs is 0.
  """
  ## taking the random centroids; float copy so later arithmetic against integer
  ## pixel data is done in floating point instead of wrapping unsigned values
  centroids = X[random.sample(range(0,X.shape[0]),no_of_clusters)].astype(np.float64)
  cluster_group = None
  ## iterating in epochs
  for i in range(epochs):
        cluster_group = group_clusters(X, centroids)
        old_centroids = centroids

        # new_centroids only returns a row for labels that actually occur (in sorted
        # label order). Writing those rows back by label keeps a cluster that lost
        # all its points at its previous position, so labels keep matching rows and
        # the centroid array never shrinks.
        occupied = np.unique(cluster_group)
        centroids = old_centroids.copy()
        centroids[occupied] = new_centroids(X,cluster_group)
        if (old_centroids == centroids).all():
            print('Run Completed! at epoch : ', i)
            break

  return cluster_group,centroids ## returning the cluster_group and centroids

In [None]:
y_pred,cluster_centroids = mykmeans(rgb_array,5)  ## running model for k = 5

In [None]:
cluster_centroids

In [None]:
rgb_data = rgb_array.copy()  ## working copy to recolour, so rgb_array keeps the original pixels

### Task - C

In [None]:
from PIL import Image

In [None]:
## repainting every pixel with the centroid of the cluster it was assigned to
for i in np.unique(y_pred):
    rgb_data[y_pred == i] = cluster_centroids[i]  ## centroid values are cast to rgb_data's dtype on assignment

In [None]:
def show_image(image_array):
  """Show the original image next to image_array reshaped to the image dimensions.

  Reads the module-level rgb_image, height, width and channels.
  """
  # bringing the flat pixel list back to (height, width, channels)
  img_compressed = image_array.reshape(height, width , channels)

  # one row, two panels: original on the left, processed image on the right
  fig, (left_panel, right_panel) = plt.subplots(1, 2, figsize=(10, 5))
  left_panel.imshow(rgb_image)
  left_panel.set_title('Original Image')
  right_panel.imshow(img_compressed)
  right_panel.set_title('Compressed')

  # ticks and frames carry no information for images
  left_panel.axis('off')
  right_panel.axis('off')

  plt.show()

In [None]:
show_image(rgb_data)  ## displaying the original image next to the recoloured one

In [None]:
## running k-means for different value of k
k_vals = np.arange(1, 10, 2)  ## k = 1, 3, 5, 7, 9
for i in k_vals:
    y_pred,cluster_centroids = mykmeans(rgb_array,i,250)  ## third argument caps the run at 250 iterations
    rgb_data = rgb_array.copy()  ## fresh copy each time so rgb_array keeps the original pixels
    for j in np.unique(y_pred):
        rgb_data[y_pred == j] = cluster_centroids[j]  ## recolouring cluster j with its centroid
    print(f'value of k is {i}')
    show_image(rgb_data)

In [None]:
## function to calculate the WCSS
def WCSS(rgb_array, centroids, labels_):
   """Return the within-cluster sum of squares.

   For every centroid index, sums the squared differences between that centroid
   and the rows of rgb_array whose label equals the index, then adds the
   per-cluster totals. A centroid with no matching rows contributes 0.
   """
   # float64 throughout: with unsigned 8-bit inputs both the subtraction and the
   # squaring would wrap around and give a meaningless total
   samples = np.asarray(rgb_array, dtype=np.float64)
   centers = np.asarray(centroids, dtype=np.float64)
   labels_ = np.asarray(labels_)  # so a plain list of labels also produces a row mask
   wcss = 0.0
   for centroid_idx, centroid in enumerate(centers):
       cluster_samples = samples[labels_ == centroid_idx]
       wcss += np.sum((cluster_samples - centroid) ** 2)
   return wcss

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

k_values = range(2, 10)  ## k = 2 .. 9
wcss_scores = []
# Performing the elbow method for the K-Means: one run of mykmeans per k,
# recording the within-cluster sum of squares of the resulting clustering
for k in k_values:
    y_pred,cluster_centroids = mykmeans(rgb_array,k)
    wcss = WCSS(rgb_array, cluster_centroids, y_pred)
    wcss_scores.append(wcss)

# Plotting WCSS scores
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)  # the curve is drawn in the left cell of a 1x2 grid; the right cell is not used
plt.plot(k_values, wcss_scores, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('WCSS')
plt.title('Elbow Method')

plt.tight_layout()
plt.grid(True)
plt.show()

In [None]:
## from the elbow in the graph above, "k = 6" looks like the better choice

In [None]:
## running the k-means for k = 6
y_pred,cluster_centroids = mykmeans(rgb_array,6)
## keeping this run under separate names, because y_pred and cluster_centroids are reassigned by later cells
kmeans_centroids = cluster_centroids
kmeans_y_pred  = y_pred

In [None]:
cluster_centroids

In [None]:
## calculate the percentage of colors present in compressed image
## np.bincount tallies all labels in a single pass (instead of rescanning a Python list
## once per cluster); minlength keeps a slot for clusters that own no pixels, and the
## slice guarantees exactly one entry per centroid
label_counts = np.bincount(kmeans_y_pred, minlength=len(kmeans_centroids))[:len(kmeans_centroids)]
percent = (label_counts / len(kmeans_y_pred)).tolist()  ## fraction (0 to 1) of pixels per cluster
print(percent)

In [None]:
## plotting the pie chart: one wedge per cluster, sized by its pixel share and
## filled with the cluster's own colour (centroid / 255 to get float RGB values;
## assumes the centroids are on a 0-255 scale)
plt.pie(percent,colors=np.array(kmeans_centroids/255),labels=np.arange(len(kmeans_centroids)), autopct='%1.1f%%', startangle=140)
plt.title("Dominant Colors in Compressed Image using KMeans")
plt.show()

### Task - D (Implementing Using scikit-learn)

In [None]:
from sklearn.cluster import KMeans  ## importing the KMeans

In [None]:
lst = list(np.arange(1,10, 2))

In [None]:
## running the k-means for sklearn
k_vals = np.arange(1, 10, 2)  ## k = 1, 3, 5, 7, 9
md = []  ## inertia of each fitted model, in the same order as k_vals
for i in k_vals:
    kmeans = KMeans(n_clusters = i, n_init="auto")
    s = kmeans.fit(rgb_array)
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_
    md.append(kmeans.inertia_)
    rgb_data = rgb_array.copy()  ## fresh copy so rgb_array keeps the original pixels
    for j in np.unique(labels):
        rgb_data[labels == j] = centroids[j]  ## recolouring cluster j with its centre
    print(f'value of k is {i}')
    show_image(rgb_data)

In [None]:
## plotting the inertia vs no_of_clusters
## lst holds the same k values (1, 3, 5, 7, 9) that the loop filling md iterated over
plt.plot(lst ,md)
plt.xlabel("No of Clusters")
plt.ylabel("Inertia")
plt.title("Inertia vs No of Clusters")
plt.grid(True)
plt.show()

In [None]:
## the sklearn inertia curve also suggests that "k = 6" would be a better choice

In [None]:
## plotting the pie chart of dominant colors using sklearn
kmeans=KMeans(n_clusters=6, n_init="auto")
kmeans.fit(rgb_array)
labels=kmeans.labels_
centroid=kmeans.cluster_centers_
## np.bincount tallies all labels in one pass instead of converting them to a Python
## list and rescanning it once per cluster; minlength keeps one entry per centre
percent = (np.bincount(labels, minlength=len(centroid)) / len(labels)).tolist()
## one wedge per cluster, filled with the cluster colour (centre / 255 for float RGB)
plt.pie(percent,colors=np.array(centroid/255),labels=np.arange(len(centroid)), autopct='%1.1f%%', startangle=140)
plt.title("Dominant Colors in the Compressed Image Using Sklearn")
plt.show()

In [None]:
kmeans.cluster_centers_

### Task - E

In [None]:
from PIL import Image
import numpy as np
import random

"""
Code to extract the pixels along with the spatial coordinates
"""
# NOTE(review): relative path, while the first cell reads "/content/test.png" - confirm both point at the same file
image_path = 'test.png'
image = Image.open(image_path)

image = image.convert('RGB')

# note: this rebinds the module-level width/height that show_image reads
width, height = image.size

# one entry per pixel, visited row by row: colour first, then the (row, column) position
rgb_values = []
for y in range(height):
    for x in range(width):
        r, g, b = image.getpixel((x, y))
        rgb_values.append((r, g, b, y, x))
rgb_array_spatial = np.array(rgb_values)  # columns: r, g, b, y, x
print(rgb_array_spatial.shape)



def assign_clusters_sp_coh(X, centroids, spatial_weight=0.5):
    """Label each row of X with the centroid of lowest combined colour + position cost.

    The first three columns are compared as colour, the remaining columns as
    position. The cost is the colour distance plus spatial_weight times the
    position distance (both Euclidean). Returns one centroid index per row.
    """
    colour_gap = X[:, np.newaxis, :3] - centroids[:, :3]
    position_gap = X[:, np.newaxis, 3:] - centroids[:, 3:]
    cost = np.linalg.norm(colour_gap, axis=2) + spatial_weight * np.linalg.norm(position_gap, axis=2)
    return np.argmin(cost, axis=1)

def move_centroids_sp_coh(X, cluster_group):
    """Return the column-wise mean of the rows of X for each label in cluster_group.

    Rows of the result follow the sorted order of the distinct labels; a label
    that never occurs in cluster_group produces no row.
    """
    return np.array([
        X[cluster_group == label].mean(axis=0)
        for label in np.unique(cluster_group)
    ])

def mykmeans_sp_coh(X, n_clusters, spatial_weight=0.5, max_itr=500):
    """Run k-means on rows that carry colour and position columns.

    Starts from n_clusters randomly chosen rows of X, then alternates between
    assign_clusters_sp_coh (using spatial_weight) and move_centroids_sp_coh for
    at most max_itr iterations or until the centroids stop changing.

    Returns (cluster_group, centroids): one label per row of X, and a float
    array with exactly n_clusters rows. cluster_group is None when max_itr is 0.
    """
    centroids = X[random.sample(range(0, X.shape[0]), n_clusters)].astype(np.float64)
    cluster_group = None

    for _ in range(max_itr):
        # assigning clusters
        cluster_group = assign_clusters_sp_coh(X, centroids, spatial_weight)
        old_centroids = centroids

        # moving centroids: the helper only returns rows for labels that occur
        # (sorted by label), so write them back by label. A cluster that lost
        # all its points keeps its previous position, which keeps labels and
        # centroid rows aligned and the array at n_clusters rows.
        occupied = np.unique(cluster_group)
        centroids = old_centroids.copy()
        centroids[occupied] = move_centroids_sp_coh(X, cluster_group)

        # checking convergence
        if (old_centroids == centroids).all():
            break

    return cluster_group, centroids

In [None]:
labels, group = mykmeans_sp_coh(rgb_array_spatial, 5)

In [None]:
rgb_data_2 = rgb_array_spatial[:,:3].copy()  # colour columns only; the position columns are dropped
for j in np.unique(labels):
    rgb_data_2[labels == j] = group[:,:3][j]  # colour part of centroid j (group holds the centroids)
show_image(rgb_data_2)

In [None]:
def show_image_coherence(image_array0, image_array1, image_array2):
  """Show three images side by side: original, k-means result, spatially coherent result.

  Each argument is reshaped to the module-level (height, width, channels).
  """
  panels = [
      (image_array0.reshape(height, width , channels), 'Original Image'),
      (image_array1.reshape(height, width , channels), 'KMeans Compressed'),
      (image_array2.reshape(height, width , channels), 'KMeans with Coherence Compressed'),
  ]

  # one row with three panels, filled left to right
  fig, axes = plt.subplots(1, 3,  figsize=(10, 5))
  for ax, (picture, caption) in zip(axes, panels):
      ax.imshow(picture)
      ax.set_title(caption)
      ax.axis('off')  # ticks carry no information for images

  plt.show()

In [None]:
## comparing sklearn k-means with the spatially coherent variant for k = 2, 4, 6, 8
k_vals = np.arange(2, 10, 2)
md = []
for i in k_vals:
    ## plain colour-only clustering with sklearn
    kmeans = KMeans(n_clusters = i, n_init="auto")
    s = kmeans.fit(rgb_array)
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_
    md.append(kmeans.inertia_)
    rgb_data = rgb_array.copy()

    ## clustering on colour + position, then recolouring with the colour part of each centroid
    labels1, group = mykmeans_sp_coh(rgb_array_spatial, i)
    rgb_data_2 = rgb_array_spatial[:,:3].copy()
    for j in np.unique(labels1):
        rgb_data_2[labels1 == j] = group[:,:3][j]
    # show_image(rgb_data_2)

    for j in np.unique(labels):
        rgb_data[labels == j] = centroids[j]
    print(f'value of k is {i}')
    show_image_coherence(rgb_array, rgb_data, rgb_data_2)

# PART - 2 (SUPPORT VECTOR MACHINE)

### Task - 1(a)

In [None]:
## importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
## loading the datasets
from sklearn import datasets
iris = datasets.load_iris(as_frame=True)

In [None]:
## extracting the data out from iris
X = iris.data
y = iris.target

In [None]:
iris.target_names  ## printing the target names

In [None]:
print("Shape of y : ",y.shape) ## printing the shape of y

In [None]:
print("Shape of X : ",X.shape) ## printing the shape of X

In [None]:
## converting X and y to array
y = np.array(y)
X = np.array(X)
y = y.reshape(-1,1)  ## reshaping y into a single column so it can be joined to X column-wise

In [None]:
data = np.concatenate((X,y), axis=1)  ## concatenating X and y

In [None]:
df = pd.DataFrame(data, columns=["sepal length (cm)", "sepal width (cm)","petal length (cm)", "petal width (cm)", "target"])  ## creating a dataframe

In [None]:
df.info() ## printing the info about datasets

In [None]:
## keeping only the rows whose target is 0 or 1, which leaves a two-class problem
new_df1 = df[df["target"] == 0.00]
new_df2 = df[df["target"] == 1.00]
new_df = pd.concat((new_df1, new_df2))

In [None]:
new_df.head() ## new_df after merging the two above

In [None]:
final_df = new_df[["petal length (cm)", "petal width (cm)", "target"]]  ## final dataset after selecting only two features

In [None]:
final_df.head()

In [None]:
## separating the two feature columns from the target column, both as plain arrays
X = final_df.drop(columns=["target"]).values
y = final_df["target"].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## splitting the data in the ratio of 80 : 20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## scaling AFTER the split: the scaler learns its mean and variance from the training
## rows only, and the test rows are transformed with those same statistics, so no
## information from the test set leaks into preprocessing
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Task - 1(b)

In [None]:
from sklearn.svm import LinearSVC  ## importing the linear SVM

In [None]:
## training the linear SVM
svc = LinearSVC()
svc.fit(X_train, y_train)

In [None]:
y_pred = svc.predict(X_test)  ## making predictions

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
## accuracy_score is declared as (y_true, y_pred); passing them in that order
print("Accuracy : ",accuracy_score(y_test, y_pred))

In [None]:
## defining the parameter to plot the decision boundary
## the boundary is w[0]*x + w[1]*y + b = 0, i.e. y = -(w[0]/w[1]) * x - b / w[1]
w = svc.coef_[0]
a = -w[0] / w[1]  ## slope of the line
xx = np.linspace(-2, 2)  ## x positions at which the line is drawn
yy = a * xx - (svc.intercept_[0])/w[1]

In [None]:
from matplotlib.colors import ListedColormap

In [None]:
## Plotting the decision boundary for training and test data
## (the same line xx, yy is drawn on both plots; points are coloured by their true label)
## note: the plotted features went through StandardScaler, so the axes are in scaled units

## Training Data
plt.plot(xx, yy)
plt.scatter(X_train[:,0], X_train[:,1], c=y_train)
plt.xlabel("petal length (cm)")
plt.ylabel("petal width (cm)")
plt.title("Decision Boundary on Training Data")
plt.grid(True)
plt.show()

## Test Data
plt.plot(xx, yy)
plt.scatter(X_test[:,0], X_test[:,1], c=y_test)
plt.xlabel("petal length (cm)")
plt.ylabel("petal width (cm)")
plt.title("Decision Boundary on Test Data")
plt.grid(True)
plt.show()

### Task - 2(a)

In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=500, noise=0.05, random_state=42)  ## generating dataset using make_moons()

## adding some percentage of misclassifications points (5% of the labels are flipped)
## a seeded generator is used so the flipped labels are reproducible, like the
## dataset and the train/test split; otherwise every run tunes on different data
num_noise_points = int(0.05 * len(X))
rng = np.random.default_rng(42)
random_indices = rng.choice(len(X), num_noise_points, replace=False)
y[random_indices] = 1 - y[random_indices] ## flipping the label points

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
## plotting the dataset
plt.scatter(X[:,0], X[:,1], c=y)
plt.grid(True)
plt.title("Make Moons Dataset")
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

### Task - 2(b)

In [None]:
from sklearn.svm import SVC

In [None]:
## initializing the different kernel models
svm_linear = SVC(kernel='linear', random_state=42)
svm_poly = SVC(kernel='poly', degree=5, random_state=42)
svm_rbf = SVC(kernel='rbf', gamma=0.9, random_state=42)

## fitting the different SVM models
## note: these are fitted on the full X, y rather than on the X_train / y_train split
svm_linear.fit(X, y)
svm_poly.fit(X, y)
svm_rbf.fit(X, y)

In [None]:
## plotting the decision boundary for different SVMs
def plot_decision_boundary(svm_model, title):
    """Scatter the module-level X, y and shade the regions predicted by svm_model.

    The model is evaluated on a 100 x 100 grid spanning the axis limits that
    the scatter plot produced; `title` becomes the plot title.
    """
    plt.scatter(X[:, 0], X[:, 1], c=y)
    # reuse the limits matplotlib picked for the scatter so the shading fills the view
    axes = plt.gca()
    x_lo, x_hi = axes.get_xlim()
    y_lo, y_hi = axes.get_ylim()
    grid_x, grid_y = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                                 np.linspace(y_lo, y_hi, 100))
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    predicted = svm_model.predict(grid_points)
    plt.contourf(grid_x, grid_y, predicted.reshape(grid_x.shape), alpha=0.3)
    plt.title(title)
    plt.show()


plot_decision_boundary(svm_linear, 'Linear Kernel Plot')
plot_decision_boundary(svm_poly, 'Polynomial Kernel Plot')
plot_decision_boundary(svm_rbf, 'RBF Kernel Plot')

### Task - 2(c)

In [None]:
## Performing the GridSearchCV and RandomizedSearchCV for the "RBF" kernels

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import reciprocal, uniform
## defining the parameters
c_lst = [0.1, 0.5, 0.9, 1, 5, 10, 20, 30, 50, 100]
g_lst = [0.001, 0.05, 0.07, 0.1, 0.5, 0.7, 0.9, 1]
param_grid = {'C': c_lst, 'gamma': g_lst}  ## every (C, gamma) pair from the two lists is tried
## initializing the SVM using "rbf"
svm_rbf_kernel = SVC(kernel='rbf')

## performing the GridSearchCV (5-fold cross-validation on the training split, scored by accuracy)
grid_search = GridSearchCV(svm_rbf_kernel, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

## printing the best parameters
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

## evaluating the model accuracy on the held-out test split
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test set accuracy:", test_score)

In [None]:
## defining the parameters using reciprocal distribution
## (C is sampled from [0.1, 100] and gamma from [0.001, 1])
param_dist = {
    'C': reciprocal(0.1, 100),
    'gamma': reciprocal(0.001, 1)
}

# initializing the SVM with RBF kernel
svm = SVC(kernel='rbf')

# performing the random search: 10 sampled parameter settings, 5-fold cross-validation each
random_search = RandomizedSearchCV(svm, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# printing the best parameters
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# evaluating the model and printing accuracy on the held-out test split
best_model = random_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test set accuracy:", test_score)

### Task - 2(d)

In [None]:
## best parameters from the GridSearchCV
## read from the fitted search object instead of copying numbers by hand, so the
## plot always reflects the search that was actually run
best_params = grid_search.best_params_

# training the svm for best parameters
best_svm = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma'])
best_svm.fit(X_train, y_train)

# Plotting decision boundary
def plot_decision_boundary(model, X, y):
    """Shade the regions predicted by `model` and overlay the points X coloured by y.

    The grid covers the range of each of the first two columns of X, padded by 1
    on both sides, with a step of 0.02. Draws on the current figure; does not call show.
    """
    step = 0.02
    x_lo, x_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_lo, y_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step),
                                 np.arange(y_lo, y_hi, step))
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    predicted = model.predict(grid_points).reshape(grid_x.shape)
    plt.contourf(grid_x, grid_y, predicted, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolors='k')

plt.figure(figsize=(8, 4))
plot_decision_boundary(best_svm, X_train, y_train)
## the plotted data is the make_moons set, so the axes are its two generic features
## (x1, x2, as in the dataset plot), not iris petal measurements
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Plot Using Best Parameters')
plt.show()

In [None]:
## best parameters from the RandomizedSearchCV
## read from the fitted search object instead of copying numbers by hand, so the
## plot always reflects the search that was actually run
best_params = random_search.best_params_

# training the svm for best parameters
best_svm = SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma'])
best_svm.fit(X_train, y_train)

# Plotting decision boundary
def plot_decision_boundary(model, X, y):
    """Draw the prediction regions of `model` with the points X (coloured by y) on top.

    Predictions are taken on a grid with spacing 0.02 that extends 1 unit beyond
    the min and max of the first two columns of X. Uses the current figure and
    leaves showing it to the caller.
    """
    first, second = X[:, 0], X[:, 1]
    horizontal = np.arange(first.min() - 1, first.max() + 1, 0.02)
    vertical = np.arange(second.min() - 1, second.max() + 1, 0.02)
    grid_x, grid_y = np.meshgrid(horizontal, vertical)
    flat_grid = np.c_[grid_x.ravel(), grid_y.ravel()]
    regions = model.predict(flat_grid)
    regions = regions.reshape(grid_x.shape)
    plt.contourf(grid_x, grid_y, regions, alpha=0.8)
    plt.scatter(first, second, c=y, s=20, edgecolors='k')

plt.figure(figsize=(8, 4))
plot_decision_boundary(best_svm, X_train, y_train)
## the plotted data is the make_moons set, so the axes are its two generic features
## (x1, x2, as in the dataset plot), not iris petal measurements
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Plot Using Best Parameters')
plt.show()