<a href="https://colab.research.google.com/github/sadidoll/Machine-Learning/blob/main/Homework4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math
import random
import time
import numpy as np
from scipy.spatial import distance
from tkinter import *

######################################################################
# This section contains functions for loading CSV (comma separated values)
# files and converting them to a dataset of instances.
# Each instance is a tuple of attributes. The entire dataset is a list
# of tuples.
######################################################################

def loadCSV(fileName):
    """Load a CSV file into a list of tuples, ignoring the header row.

    Every remaining non-blank line is converted with lineToTuple, so numeric
    attributes become floats and nominal attributes stay strings.

    Parameters:
      fileName: name of the CSV file to be read
    Returns: a list of tuples, one per data row; an empty list when the file
      is empty or holds only the header
    """
    # The context manager closes the file even if reading raises.
    with open(fileName, "rt") as fileHandler:
        lines = fileHandler.readlines()
    # Slicing drops the header and, unlike `del lines[0]`, does not raise
    # IndexError on an empty file. Blank lines (e.g. a trailing newline at
    # the end of the file) are skipped so they do not become ('',) instances.
    return [lineToTuple(line) for line in lines[1:] if line.strip()]

def lineToTuple(line):
    """Convert a comma separated string into a tuple of fields.

    Parameters:
      line: a string
    Returns: a tuple; fields that stringsToNumbers recognizes as numbers are
      floats, every other field is a string
    """
    # Trim leading/trailing whitespace and newlines, remove every double
    # quote, then split into fields on commas.
    fields = line.strip().replace('"', '').split(",")
    # In-place conversion of the numeric-looking fields.
    stringsToNumbers(fields)
    return tuple(fields)

def stringsToNumbers(myList):
    """Replace, in place, every element that represents a number by a float.

    Elements for which isValidNumberString returns False are left untouched.

    Parameters:
      myList: a list of strings
    Returns None
    """
    for position, item in enumerate(myList):
        if isValidNumberString(item):
            myList[position] = float(item)

def isValidNumberString(s):
    """Check whether a string can be safely converted into a float.

    Accepted strings consist of decimal digits with at most one decimal
    point and an optional single leading minus sign, e.g. "42", "-3.5",
    ".5" or "5.".

    Parameters:
      s: the string to be checked
    Returns: True if float(s) is guaranteed to succeed for this plain decimal
      notation, False otherwise
    """
    if len(s) == 0:
        return False
    # A lone "-" is not a number, so the sign is only stripped when something
    # follows it.
    digits = s[1:] if len(s) > 1 and s[0] == "-" else s
    if any(c not in "0123456789." for c in digits):
        return False
    # Only digits and dots remain at this point. float() rejects more than
    # one decimal point ("1.2.3") and a dot with no digit at all (".").
    return digits.count(".") <= 1 and digits != "."

######################################################################
# This section contains functions for clustering a dataset
# using the k-means algorithm.
######################################################################

def distance(instance1, instance2, distance_type = 'Euclidean'):
    """Compute the dissimilarity between two instances.

    Index 0 of an instance holds its name; the attributes at index 1 and
    beyond are the coordinates that get compared.

    Parameters:
      instance1, instance2: tuples of the form (name, attr1, attr2, ...),
        or None (e.g. the centroid of an empty cluster)
      distance_type: one of
        'Euclidean' - sum of squared differences (no square root is taken,
                      so the total over a cluster is a sum of squares)
        'Manhattan' - sum of absolute differences
        'Cosine'    - 1 - cosine similarity
        'Jaccard'   - scipy.spatial.distance.jaccard of the attributes
        None        - always yields infinity
    Returns: the distance as a number; float("inf") when either instance or
      distance_type is None
    Raises: ValueError for an unknown distance_type
    """
    if distance_type is None:
        return float("inf")
    if distance_type not in ('Euclidean', 'Manhattan', 'Cosine', 'Jaccard'):
        # Previously an unknown metric silently returned None, which only
        # failed later when the result was compared or summed.
        raise ValueError("Unknown distance_type: %r" % (distance_type,))
    if instance1 is None or instance2 is None:
        return float("inf")

    # Drop the name stored at index 0.
    point1 = instance1[1:]
    point2 = instance2[1:]

    # Explicit accumulation loops are used instead of sum() so that the
    # floating point results stay identical across Python versions.
    if distance_type == 'Euclidean':
        sumOfSquares = 0
        for a, b in zip(point1, point2):
            sumOfSquares += (a - b)**2
        return sumOfSquares

    if distance_type == 'Manhattan':
        sumOfAbs = 0
        for a, b in zip(point1, point2):
            sumOfAbs += abs(a - b)
        return sumOfAbs

    if distance_type == 'Cosine':
        dotProduct = 0
        squaredNorm1 = 0
        squaredNorm2 = 0
        for a, b in zip(point1, point2):
            dotProduct += a * b
            squaredNorm1 += a * a
            squaredNorm2 += b * b
        normProduct = math.sqrt(squaredNorm1) * math.sqrt(squaredNorm2)
        if normProduct == 0:
            # Cosine similarity is undefined for a zero vector; by convention
            # here such a pair is treated as having similarity 0.
            return 1.0
        # Convert the similarity into a distance so that, as for the other
        # metrics, smaller means closer.
        return 1.0 - dotProduct / normProduct

    # distance_type == 'Jaccard'
    # This function shadows the module-level name `distance`, so scipy's
    # module has to be imported under a different local name.
    # NOTE(review): scipy documents jaccard for boolean vectors; confirm it is
    # the intended measure for real-valued attributes.
    from scipy.spatial import distance as scipy_distance
    return scipy_distance.jaccard(point1, point2)

def meanInstance(name, instanceList):
    """Build the attribute-wise mean of a list of instances.

    Parameters:
      name: value stored at index 0 of the resulting tuple
      instanceList: list of tuples; index 0 of each tuple is skipped and the
        number of attributes is taken from the first instance
    Returns: a tuple (name, mean1, mean2, ...), or None when instanceList is
      empty
    """
    count = len(instanceList)
    if count == 0:
        return
    width = len(instanceList[0])
    averaged = [name]
    for column in range(1, width):
        total = 0
        for row in instanceList:
            total += row[column]
        averaged.append(total / float(count))
    return tuple(averaged)

def assign(instance, centroids, distance_type = 'Euclidean' ):
    """Find the centroid closest to an instance.

    Parameters:
      instance: the instance to be assigned
      centroids: non-empty list of centroids
      distance_type: metric name passed through to distance
    Returns: the index of the nearest centroid; on ties the lowest index
      wins because only a strictly smaller distance replaces the current best
    """
    closest = 0
    smallest = distance(instance, centroids[0], distance_type)
    for position in range(1, len(centroids)):
        candidate = distance(instance, centroids[position], distance_type)
        if not (candidate < smallest):
            continue
        smallest = candidate
        closest = position
    return closest

def createEmptyListOfLists(numSubLists):
    """Create a list holding numSubLists distinct empty lists.

    Parameters:
      numSubLists: how many empty sublists to create
    Returns: a list of independent empty lists
    """
    # A comprehension creates a fresh list object for every slot.
    return [[] for _ in range(numSubLists)]

def assignAll(instances, centroids, distance_type = 'Euclidean'):
    """Partition the instances into clusters around the given centroids.

    Parameters:
      instances: iterable of instances
      centroids: list of centroids; cluster i belongs to centroids[i]
      distance_type: metric name passed through to assign
    Returns: a list with one list of instances per centroid (possibly empty)
    """
    buckets = [[] for _ in range(len(centroids))]
    for member in instances:
        buckets[assign(member, centroids, distance_type)].append(member)
    return buckets

def computeCentroids(clusters):
    """Compute one centroid per cluster as the mean of its instances.

    Parameters:
      clusters: list of lists of instances
    Returns: a list of centroids named "centroid0", "centroid1", ...; the
      entry for an empty cluster is None (what meanInstance returns for it)
    """
    return [
        meanInstance("centroid" + str(index), members)
        for index, members in enumerate(clusters)
    ]

def kmeans(instances, k, initCentroids=None, distance_type = 'Euclidean', maxIterations=None):
    """Cluster the instances with the k-means algorithm.

    Alternates between assigning every instance to its nearest centroid and
    recomputing the centroids, until the centroids stop changing or
    maxIterations is reached.

    Parameters:
      instances: list of instances (tuples whose index 0 is the name)
      k: number of clusters
      initCentroids: optional list of at least k initial centroids; when it
        is None or shorter than k, k instances are sampled at random instead
      distance_type: metric name passed through to the distance computations
      maxIterations: optional cap on the number of assign/update rounds
        (e.g. 1 for a single iteration); None means run until convergence
    Returns: a dict with the keys "clusters", "centroids" and "withinss".
      When no round is executed (k == 0 or maxIterations == 0) the clusters
      are an empty list and withinss is 0.
    Raises: ValueError from random.sample when k exceeds len(instances) and
      random initial centroids are needed
    """
    if initCentroids is None or len(initCentroids) < k:
        # randomly select k initial centroids
        random.seed(time.time())
        centroids = random.sample(instances, k)
    else:
        centroids = initCentroids
    prevCentroids = []
    # Defined up front so the result is well-formed even if the loop body
    # never runs; previously that case raised UnboundLocalError.
    clusters = []
    withinss = 0
    iteration = 0
    while centroids != prevCentroids:
        if maxIterations is not None and iteration >= maxIterations:
            break
        iteration += 1
        clusters = assignAll(instances, centroids, distance_type)
        prevCentroids = centroids
        centroids = computeCentroids(clusters)
        withinss = computeWithinss(clusters, centroids, distance_type)
    return {
        "clusters": clusters,
        "centroids": centroids,
        "withinss": withinss,
    }

def computeWithinss(clusters, centroids, distance_type = 'Euclidean'):
    """Sum the distances between every instance and its cluster's centroid.

    Parameters:
      clusters: list of lists of instances; clusters[i] pairs with centroids[i]
      centroids: list of centroids
      distance_type: metric name passed through to distance
    Returns: the accumulated distance over all clusters (0 when every
      cluster is empty)
    """
    total = 0
    for index, center in enumerate(centroids):
        for member in clusters[index]:
            total += distance(center, member, distance_type)
    return total

def repeatedKMeans(instances, k, n, initCentroids=None,distance_type = 'Euclidean'):
    """Repeat k-means clustering n times and keep the best clustering.

    Note that kmeans only picks random initial centroids when initCentroids
    is None or has fewer than k entries; with a full initCentroids list every
    trial starts from the same centroids.

    Parameters:
      instances: list of instances
      k: number of clusters
      n: number of trials
      initCentroids: optional initial centroids passed through to kmeans
      distance_type: metric name passed through to kmeans
    Returns: the clustering (dict produced by kmeans) with the smallest
      withinss; {"withinss": inf} when no trial produced a finite withinss
      (e.g. n == 0)
    """
    bestClustering = {"withinss": float("inf")}
    # Stays None when no trial beats infinity; previously the final print
    # raised UnboundLocalError in that case.
    minWithinssTrial = None
    for i in range(1, n+1):
        # end=" " keeps the trial number and its withinss on one line; the
        # old trailing comma was a Python 2 idiom that has no effect here.
        print("k-means trial %d," % i, end=" ")
        trialClustering = kmeans(instances, k, initCentroids, distance_type)
        print("withinss: %.1f" % trialClustering["withinss"])
        if trialClustering["withinss"] < bestClustering["withinss"]:
            bestClustering = trialClustering
            minWithinssTrial = i
    print("Trial with minimum withinss:", minWithinssTrial)
    return bestClustering




In [None]:
# Upload the data file into the Colab runtime (the output below reports the
# name the file was saved under).
from google.colab import files
uploaded = files.upload()

Saving Team.csv to Team (5).csv


In [None]:
# Load the team data as a list of tuples; loadCSV drops the header row and
# converts numeric fields to floats.
# NOTE(review): the upload output above says the file was saved as
# "Team (5).csv", so "Team.csv" presumably refers to an earlier upload —
# confirm both files have the same contents.
dataset = loadCSV("Team.csv")

##Task 1: (1) Initialize two centroids, (4, 6) and (5, 4), and use Manhattan distance as the distance metric. Perform one iteration of the K-means algorithm and report the coordinates of the resulting centroids. Also use K-means to find two clusters:

In [None]:

# Initial centroids A = (4, 6) and B = (5, 4); index 0 of each tuple is the name.
centroids = [('centroidA',4,6),('centroidB',5,4)]
# kmeans keeps iterating until the centroids stop changing.
cluster_1 = kmeans(dataset, 2, initCentroids = centroids, distance_type = 'Manhattan')

#print("The two clusters using k-means to find clusters: ")
# NOTE(review): initCentroids is supplied, so each of the 100 trials starts
# from the same centroids; the output below shows the same withinss every time.
cluster_2 = repeatedKMeans(dataset, 2, 100,centroids, distance_type = 'Manhattan')
# NOTE(review): printTable is not defined in the visible part of this
# notebook — confirm it is defined elsewhere.
printTable(cluster_1['centroids'])
# Presumably only the last bare expression is displayed (the output below
# shows a single list of clusters).
cluster_1["clusters"]
cluster_2["clusters"]
#clusters_1["clusters"]

k-means trial 1,
withinss: 27.0
k-means trial 2,
withinss: 27.0
k-means trial 3,
withinss: 27.0
k-means trial 4,
withinss: 27.0
k-means trial 5,
withinss: 27.0
k-means trial 6,
withinss: 27.0
k-means trial 7,
withinss: 27.0
k-means trial 8,
withinss: 27.0
k-means trial 9,
withinss: 27.0
k-means trial 10,
withinss: 27.0
k-means trial 11,
withinss: 27.0
k-means trial 12,
withinss: 27.0
k-means trial 13,
withinss: 27.0
k-means trial 14,
withinss: 27.0
k-means trial 15,
withinss: 27.0
k-means trial 16,
withinss: 27.0
k-means trial 17,
withinss: 27.0
k-means trial 18,
withinss: 27.0
k-means trial 19,
withinss: 27.0
k-means trial 20,
withinss: 27.0
k-means trial 21,
withinss: 27.0
k-means trial 22,
withinss: 27.0
k-means trial 23,
withinss: 27.0
k-means trial 24,
withinss: 27.0
k-means trial 25,
withinss: 27.0
k-means trial 26,
withinss: 27.0
k-means trial 27,
withinss: 27.0
k-means trial 28,
withinss: 27.0
k-means trial 29,
withinss: 27.0
k-means trial 30,
withinss: 27.0
k-means trial 31,
w

[[('X1', 3.0, 5.0), ('X3', 2.0, 8.0), ('X10', 7.0, 6.0)],
 [('X2', 3.0, 4.0),
  ('X4', 2.0, 3.0),
  ('X5', 6.0, 2.0),
  ('X6', 6.0, 4.0),
  ('X7', 7.0, 3.0),
  ('X8', 7.0, 4.0),
  ('X9', 8.0, 5.0)]]

##(2) Initialize two centroids, (4, 6) and (5, 4), and use Euclidean distance as the distance metric. Perform one iteration of the K-means algorithm and report the coordinates of the resulting centroids. Also use K-means to find two clusters:

In [None]:

# Initial centroids A = (4, 6) and B = (5, 4); index 0 of each tuple is the name.
centroids_2 = [('centroidA',4,6),('centroidB',5,4)]
# kmeans keeps iterating until the centroids stop changing.
cluster_12 = kmeans(dataset, 2, initCentroids = centroids_2, distance_type = 'Euclidean')

#print("The two clusters using k-means to find clusters: ")
# NOTE(review): initCentroids is supplied, so each of the 100 trials starts
# from the same centroids; the output below shows the same withinss every time.
cluster_22 = repeatedKMeans(dataset, 2, 100,centroids_2, distance_type = 'Euclidean')
# NOTE(review): printTable is not defined in the visible part of this
# notebook — confirm it is defined elsewhere.
printTable(cluster_12['centroids'])
# Presumably only the last bare expression is displayed (the output below
# shows a single list of clusters).
cluster_12["clusters"]
cluster_22["clusters"]
#clusters_1["clusters"]


k-means trial 1,
withinss: 27.8
k-means trial 2,
withinss: 27.8
k-means trial 3,
withinss: 27.8
k-means trial 4,
withinss: 27.8
k-means trial 5,
withinss: 27.8
k-means trial 6,
withinss: 27.8
k-means trial 7,
withinss: 27.8
k-means trial 8,
withinss: 27.8
k-means trial 9,
withinss: 27.8
k-means trial 10,
withinss: 27.8
k-means trial 11,
withinss: 27.8
k-means trial 12,
withinss: 27.8
k-means trial 13,
withinss: 27.8
k-means trial 14,
withinss: 27.8
k-means trial 15,
withinss: 27.8
k-means trial 16,
withinss: 27.8
k-means trial 17,
withinss: 27.8
k-means trial 18,
withinss: 27.8
k-means trial 19,
withinss: 27.8
k-means trial 20,
withinss: 27.8
k-means trial 21,
withinss: 27.8
k-means trial 22,
withinss: 27.8
k-means trial 23,
withinss: 27.8
k-means trial 24,
withinss: 27.8
k-means trial 25,
withinss: 27.8
k-means trial 26,
withinss: 27.8
k-means trial 27,
withinss: 27.8
k-means trial 28,
withinss: 27.8
k-means trial 29,
withinss: 27.8
k-means trial 30,
withinss: 27.8
k-means trial 31,
w

[[('X1', 3.0, 5.0), ('X2', 3.0, 4.0), ('X3', 2.0, 8.0), ('X4', 2.0, 3.0)],
 [('X5', 6.0, 2.0),
  ('X6', 6.0, 4.0),
  ('X7', 7.0, 3.0),
  ('X8', 7.0, 4.0),
  ('X9', 8.0, 5.0),
  ('X10', 7.0, 6.0)]]

##(3) Initialize two centroids, (3, 3) and (8, 3), and use Manhattan distance as the distance metric. Perform one iteration of the K-means algorithm and report the coordinates of the resulting centroids. Also use K-means to find two clusters:

In [None]:

# Initial centroids A = (3, 3) and B = (8, 3); index 0 of each tuple is the name.
centroids_3 = [('centroidA',3,3),('centroidB',8,3)]
# kmeans keeps iterating until the centroids stop changing.
cluster_13 = kmeans(dataset, 2, initCentroids = centroids_3, distance_type = 'Manhattan')

#print("The two clusters using k-means to find clusters: ")
# NOTE(review): initCentroids is supplied, so each of the 100 trials starts
# from the same centroids; the output below shows the same withinss every time.
cluster_23 = repeatedKMeans(dataset, 2, 100,centroids_3,distance_type = 'Manhattan')
# NOTE(review): printTable is not defined in the visible part of this
# notebook — confirm it is defined elsewhere.
printTable(cluster_13['centroids'])
# Presumably only the last bare expression is displayed (the output below
# shows a single list of clusters).
cluster_13["clusters"]
cluster_23["clusters"]
#clusters_1["clusters"]


k-means trial 1,
withinss: 17.3
k-means trial 2,
withinss: 17.3
k-means trial 3,
withinss: 17.3
k-means trial 4,
withinss: 17.3
k-means trial 5,
withinss: 17.3
k-means trial 6,
withinss: 17.3
k-means trial 7,
withinss: 17.3
k-means trial 8,
withinss: 17.3
k-means trial 9,
withinss: 17.3
k-means trial 10,
withinss: 17.3
k-means trial 11,
withinss: 17.3
k-means trial 12,
withinss: 17.3
k-means trial 13,
withinss: 17.3
k-means trial 14,
withinss: 17.3
k-means trial 15,
withinss: 17.3
k-means trial 16,
withinss: 17.3
k-means trial 17,
withinss: 17.3
k-means trial 18,
withinss: 17.3
k-means trial 19,
withinss: 17.3
k-means trial 20,
withinss: 17.3
k-means trial 21,
withinss: 17.3
k-means trial 22,
withinss: 17.3
k-means trial 23,
withinss: 17.3
k-means trial 24,
withinss: 17.3
k-means trial 25,
withinss: 17.3
k-means trial 26,
withinss: 17.3
k-means trial 27,
withinss: 17.3
k-means trial 28,
withinss: 17.3
k-means trial 29,
withinss: 17.3
k-means trial 30,
withinss: 17.3
k-means trial 31,
w

[[('X1', 3.0, 5.0), ('X2', 3.0, 4.0), ('X3', 2.0, 8.0), ('X4', 2.0, 3.0)],
 [('X5', 6.0, 2.0),
  ('X6', 6.0, 4.0),
  ('X7', 7.0, 3.0),
  ('X8', 7.0, 4.0),
  ('X9', 8.0, 5.0),
  ('X10', 7.0, 6.0)]]

##(4) Initialize two centroids, (3, 2) and (4, 8), and use Manhattan distance as the distance metric. Perform one iteration of the K-means algorithm and report the coordinates of the resulting centroids. Also use K-means to find two clusters:

In [None]:
# Initial centroids A = (3, 2) and B = (4, 8); index 0 of each tuple is the name.
centroids_4 = [('centroidA',3,2),('centroidB',4,8)]
# kmeans keeps iterating until the centroids stop changing.
cluster_14 = kmeans(dataset, 2, initCentroids = centroids_4, distance_type = 'Manhattan')

#print("The two clusters using k-means to find clusters: ")
# NOTE(review): initCentroids is supplied, so each of the 100 trials starts
# from the same centroids; the output below shows the same withinss every time.
cluster_24 = repeatedKMeans(dataset, 2, 100,centroids_4, distance_type = 'Manhattan')
# NOTE(review): printTable is not defined in the visible part of this
# notebook — confirm it is defined elsewhere.
printTable(cluster_14['centroids'])
# Presumably only the last bare expression is displayed (the output below
# shows a single list of clusters).
cluster_14["clusters"]
cluster_24["clusters"]


k-means trial 1,
withinss: 29.2
k-means trial 2,
withinss: 29.2
k-means trial 3,
withinss: 29.2
k-means trial 4,
withinss: 29.2
k-means trial 5,
withinss: 29.2
k-means trial 6,
withinss: 29.2
k-means trial 7,
withinss: 29.2
k-means trial 8,
withinss: 29.2
k-means trial 9,
withinss: 29.2
k-means trial 10,
withinss: 29.2
k-means trial 11,
withinss: 29.2
k-means trial 12,
withinss: 29.2
k-means trial 13,
withinss: 29.2
k-means trial 14,
withinss: 29.2
k-means trial 15,
withinss: 29.2
k-means trial 16,
withinss: 29.2
k-means trial 17,
withinss: 29.2
k-means trial 18,
withinss: 29.2
k-means trial 19,
withinss: 29.2
k-means trial 20,
withinss: 29.2
k-means trial 21,
withinss: 29.2
k-means trial 22,
withinss: 29.2
k-means trial 23,
withinss: 29.2
k-means trial 24,
withinss: 29.2
k-means trial 25,
withinss: 29.2
k-means trial 26,
withinss: 29.2
k-means trial 27,
withinss: 29.2
k-means trial 28,
withinss: 29.2
k-means trial 29,
withinss: 29.2
k-means trial 30,
withinss: 29.2
k-means trial 31,
w

[[('X1', 3.0, 5.0),
  ('X2', 3.0, 4.0),
  ('X4', 2.0, 3.0),
  ('X5', 6.0, 2.0),
  ('X6', 6.0, 4.0),
  ('X7', 7.0, 3.0),
  ('X8', 7.0, 4.0)],
 [('X3', 2.0, 8.0), ('X9', 8.0, 5.0), ('X10', 7.0, 6.0)]]

#**Task 2**

##Q1: Run K-means clustering with Euclidean, Cosine and Jaccard similarity. Specify K = the number of categorical values of y (the label variable). Compare the SSEs of Euclidean-K-means, Cosine-K-means and Jaccard-K-means. Which method is better?

In [None]:
# Load the iris dataset through scikit-learn's datasets module for Task 2.
from sklearn import datasets
iris = datasets.load_iris()

In [None]:
# Display the loaded dataset object (its 'data' array appears in the output below).
iris

 'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

##Task 3

In [9]:
## Quick check of the manual calculations for Task 3

from itertools import combinations
import numpy as np
# The eight 2-D points whose pairwise distances are examined.
members = [(4.7,3.2),(4.9,3.1),(5.0,3.0),(4.6,2.9),(5.9,3.2),(6.7,3.1),(6.0,3.0),(6.2,2.8)]

# Every unordered pair of members.
combiningall = list(combinations(members,2))

# Euclidean distance of each pair, rounded to 4 decimals, as a NumPy array.
distances = np.array([
    round(np.linalg.norm(np.array(first) - np.array(second)), 4)
    for first, second in combiningall
])


##What is the distance between the two farthest members?

In [10]:
# Largest of the rounded pairwise distances.
distances.max()

2.1095

## What is the distance between the two closest members?

In [11]:
# Smallest of the rounded pairwise distances.
distances.min()

0.1414

##What is the average distance between all pairs?

In [12]:
# Mean of the rounded pairwise distances over all pairs.
distances.mean()

0.9829749999999999

##The average distance : 0.982975
