In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import altair as alt

In [2]:
# Raw-content GitHub URL of the CSV that pd.read_csv loads in the next cell.
file_url = 'https://raw.githubusercontent.com/sedeba19/Chapter-5/main/data/DataSet_taxstats2015.csv'

In [4]:
# Load only the postcode and the two business-average columns used in this notebook.
df = pd.read_csv(
    file_url,
    usecols=[
        'Postcode',
        'Average total business income',
        'Average total business expenses',
    ],
)

In [5]:
# Feature table: the two business-average columns, without the postcode.
X = df[
    ['Average total business income', 'Average total business expenses']
]

In [7]:
# Smallest and largest observed value of each feature; these bound the
# random centroid draws made further down.
business_income_min, business_income_max = (
    df['Average total business income'].min(),
    df['Average total business income'].max(),
)

business_expenses_min, business_expenses_max = (
    df['Average total business expenses'].min(),
    df['Average total business expenses'].max(),
)

In [8]:
# One value per line: income min, income max, expenses min, expenses max.
print(business_income_min,
      business_income_max,
      business_expenses_min,
      business_expenses_max,
      sep='\n')

0
876324
0
884659


In [9]:
import random
# Seed Python's RNG so the random.sample draws that pick the initial
# centroid coordinates give the same values on every run.
random.seed(42)

In [10]:
# Start from an empty frame; the centroid coordinate columns and the
# cluster id column are added one at a time in the following cells.
centroids = pd.DataFrame()

In [11]:
# Draw 4 distinct integers from [income min, income max) as the income
# coordinate of the starting centroids.
centroids['Average total business income'] = random.sample(
    range(business_income_min, business_income_max),
    k=4,
)

In [12]:
# Draw 4 distinct integers from [expenses min, expenses max) as the
# expenses coordinate of the starting centroids.
centroids['Average total business expenses'] = random.sample(
    range(business_expenses_min, business_expenses_max),
    k=4,
)

In [13]:
# Use each centroid's row label as its cluster id, then display the table.
centroids['cluster'] = centroids.index
centroids

Unnamed: 0,Average total business income,Average total business expenses,cluster
0,670487,288389,0
1,116739,256787,1
2,26225,234053,2
3,777572,146316,3


In [15]:
# Create a scatter plot of the data points (only the df.head() rows are drawn).
chart1 = (
    alt.Chart(df.head())
    .mark_circle()
    .encode(
        x='Average total business income',
        y='Average total business expenses',
        color=alt.value('orange'),
        tooltip=[
            'Postcode',
            'Average total business income',
            'Average total business expenses',
        ],
    )
    .interactive()
)
chart1

In [17]:
# Plot the centroids as larger black circles so they stand out from the data points.
chart2 = (
    alt.Chart(centroids)
    .mark_circle(size=100)
    .encode(
        x='Average total business income',
        y='Average total business expenses',
        color=alt.value('black'),
        tooltip=[
            'cluster',
            'Average total business income',
            'Average total business expenses',
        ],
    )
    .interactive()
)
chart2

In [19]:
# Overlay the centroid markers on the data-point scatter plot.
alt.layer(chart1, chart2)

In [20]:
def squared_euclidean(data_x, data_y,
                      centroid_x, centroid_y):
    """Return the squared Euclidean distance between a point and a centroid.

    Args:
        data_x: x-coordinate of the data point.
        data_y: y-coordinate of the data point.
        centroid_x: x-coordinate of the centroid.
        centroid_y: y-coordinate of the centroid.

    Returns:
        (data_x - centroid_x)**2 + (data_y - centroid_y)**2. No square root
        is taken.
    """
    delta_x = data_x - centroid_x
    delta_y = data_y - centroid_y
    return delta_x ** 2 + delta_y ** 2

In [21]:
# Coordinates of the first observation (row label 0).
data_x, data_y = (
    df.at[0, 'Average total business income'],
    df.at[0, 'Average total business expenses'],
)

In [22]:
# Squared distance from the first observation to each of the 4 centroids,
# listed in centroid order.
distances = [
    squared_euclidean(
        data_x,
        data_y,
        centroids.at[centroid_id, 'Average total business income'],
        centroids.at[centroid_id, 'Average total business expenses'],
    )
    for centroid_id in range(4)
]
distances

[215601466600, 10063365460, 34245932020, 326873037866]

In [24]:
# Position of the smallest distance = id of the nearest centroid.
# list.index returns the first match, so a tie goes to the lowest position.
cluster_index = distances.index(min(distances))
cluster_index

1

In [25]:
# Record the nearest centroid for row 0. This is the first write to the
# 'cluster' column, so the other rows have no value yet (see the df.head()
# output that follows, where the id is shown as 1.0).
df.at[0, 'cluster'] = cluster_index

In [26]:
# Inspect the first rows: only row 0 has a cluster assigned at this point.
df.head()

Unnamed: 0,Postcode,Average total business income,Average total business expenses,cluster
0,2000,210901,222191,1.0
1,2006,69983,48971,
2,2007,575099,639499,
3,2008,53329,32173,
4,2009,237539,222993,


In [27]:
# Assign rows 1-4 (the rest of df.head()) to their nearest centroid.
# This replaces four copy-pasted versions of the same computation with one
# loop; the values written to df and the final `distances` are the same.
for row_label in range(1, 5):
    # Squared distance from this row to every centroid, in centroid order.
    distances = [
        squared_euclidean(
            df.at[row_label, 'Average total business income'],
            df.at[row_label, 'Average total business expenses'],
            centroids.at[centroid_id, 'Average total business income'],
            centroids.at[centroid_id, 'Average total business expenses'],
        )
        # Sized from the centroid table instead of a hard-coded 4.
        for centroid_id in range(len(centroids))
    ]
    # Position of the smallest distance is the nearest centroid's id
    # (ties resolve to the lowest position).
    df.at[row_label, 'cluster'] = distances.index(min(distances))

df.head()

Unnamed: 0,Postcode,Average total business income,Average total business expenses,cluster
0,2000,210901,222191,1.0
1,2006,69983,48971,2.0
2,2007,575099,639499,0.0
3,2008,53329,32173,2.0
4,2009,237539,222993,1.0
