# Method 2: Geodemographics

Now that we have synthetic granular data representing BSA results for the entire adult population of York, we can set about using this data to create clusters to represent voting habits in York.

### Step 1: Importing packages

In [None]:
import pandas as pd
import geopandas as gpd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import copy

### Step 2: Importing our CSV file and our shapefile, and merging them
In this step, we take our aggregate dataframe and our shapefile containing electoral wards in York, and we combine them into one geodataframe, which we then export as a shapefile. The export only needs to be run once, so that line is now commented out; the merged file is included with the data package to reproduce this paper.

In [None]:
york_shapefile = gpd.read_file('data/York_boundaries/york_electoral_boundaries.shp')
york_BSA = pd.read_csv('data/BSA_agg.csv', low_memory=False)

In [None]:
# Left-join the ward boundaries onto the aggregate table so every BSA row is kept.
BSA_geom = york_BSA.merge(york_shapefile, on='geo_code', how='left')
# The merge starts from a plain pandas DataFrame, so the result is rebuilt as a
# GeoDataFrame here. The shapefile's CRS is passed explicitly rather than relying
# on it surviving the round trip through pandas.
BSA_geom = gpd.GeoDataFrame(BSA_geom, geometry="geometry", crs=york_shapefile.crs)

In [None]:
#BSA_geom.to_file('data/geodemographics shapefile/BSA_agg_geom.shp')

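Therefore, when you want to re-run this notebook, you can skip the merge cells above and instead uncomment and run the line below to import the merged file, saving on processing time. (Note: the shapefile format truncates field names to 10 characters, so column names read back from this file may be shortened and need renaming before the later cells will run.)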

In [None]:
#BSA_geom = gpd.read_file('data/geodemographics shapefile/BSA_agg_geom.shp')

In [None]:
BSA_geom.columns

In [None]:
BSA_geom.head()

### Step 3: Selecting our variables

Creating histograms to justify variable selection

In [None]:
# Party-support count columns whose distributions we want to inspect.
attributes_to_plot = [
    'support_conservative',
    'support_labour',
    'support_libdem',
    'support_green',
    'support_ukip',
    'support_brexit',
    'support_other',
    'support_none',
]

# One histogram (with a KDE overlay) per column, laid out on a 4 x 2 grid.
fig = plt.figure(figsize=(12, 12))
for position, column in enumerate(attributes_to_plot, start=1):
    panel = fig.add_subplot(4, 2, position)
    sns.histplot(BSA_geom[column], kde=True, ax=panel)
    panel.set_title(column)

fig.tight_layout()
plt.show()

In [None]:
# Vote count columns whose distributions we want to inspect.
attributes_to_plot = [
    'voted_conservative',
    'voted_labour',
    'voted_libdem',
    'voted_green',
    'voted_UKIP',
    'voted_other',
]

# Same 4 x 2 grid as the support histograms; only the first six panels are used.
fig = plt.figure(figsize=(12, 12))
for position, column in enumerate(attributes_to_plot, start=1):
    panel = fig.add_subplot(4, 2, position)
    sns.histplot(BSA_geom[column], kde=True, ax=panel)
    panel.set_title(column)

fig.tight_layout()
plt.show()

### Step 4: Subsetting the data

In this step we create a new dataframe of only the variables we need, and transform the aggregate counts into percentages of the total population. We know from our <a href="cleaning.ipynb#Step-4:-Validation-and-checking-the-data-to-ensure-we've-aggregated-it-correctly">cleaning</a> that within each category of data (age, sex, race, who they voted for), the entire population of York is represented across the columns of that category. Therefore we calculate all percentages using the 'total_pop' column, and subset the results into a new dataframe containing just the relevant percentages.

After this, we rename the columns to make referring to particular columns easier later on.

In [None]:
def percentages(dataframe, value_columns): # function to calculate percentages
    """Express each listed count column as a percentage of 'total_pop'.

    Parameters
    ----------
    dataframe : DataFrame holding the count columns and a 'total_pop' column.
    value_columns : iterable of column names to convert.

    Returns
    -------
    A new DataFrame with one '<name>_percentage' column per requested column,
    in the order given; the input frame is left untouched.
    """
    shares = {
        f"{column}_percentage": (dataframe[column] / dataframe['total_pop']) * 100
        for column in value_columns
    }
    return pd.DataFrame(shares)

# List of the columns we're going to standardise
value_cols = ['age1864', 'age65',
              'female', 'male',
              'support_yes', 'support_no',
              'political_interest_yes', 'political_interest_no',
              'welfare_pro', 'welfare_anti',
              'wealth_redist_pro', 'wealth_redist_anti',
              'lean_left','lean_right', 'lean_centrist',
              'rel_christian', 'rel_nonChristian', 'rel_none', 'rel_other',
              'nat_british', 'nat_english', 'nat_european', 'nat_irish',
              'nat_scottish', 'nat_welsh', 'nat_other_none',
              'race_white', 'race_asian', 'race_mixed', 'race_other',
              'disab_affect', 'disab_no_affect', 'disab_none',
              'voted_conservative', 'voted_labour','voted_libdem',
              'voted_green', 'voted_UKIP', 'voted_other'
             ]


percentage_results = percentages(BSA_geom, value_cols)

In [None]:
percentage_results.shape

In [None]:
# join it to the merged dataframe!
BSA_geom_perc = pd.concat([BSA_geom, percentage_results], axis=1, ignore_index=False)
BSA_geom_perc.head()

In [None]:
BSA_geom_perc.columns

In [None]:
keep = ['geo_code',
        'geometry',
        'age1864_percentage',
        'age65_percentage',
        'female_percentage',
        'male_percentage',
        'political_interest_yes_percentage',
        'political_interest_no_percentage',
        'welfare_pro_percentage',
        'welfare_anti_percentage',
        'wealth_redist_pro_percentage',
        'wealth_redist_anti_percentage',
        'lean_left_percentage',
        'lean_right_percentage', 
        'lean_centrist_percentage',
        'rel_christian_percentage',
        'rel_nonChristian_percentage',
        'rel_none_percentage',
        'rel_other_percentage', 
        'nat_british_percentage', 
        'nat_english_percentage', 
        'nat_european_percentage', 
        'nat_irish_percentage',
        'nat_scottish_percentage', 
        'nat_welsh_percentage', 
        'nat_other_none_percentage',
        'race_white_percentage',
        'race_asian_percentage', 
        'race_mixed_percentage',
        'race_other_percentage', 
        'disab_affect_percentage',
        'disab_no_affect_percentage', 
        'disab_none_percentage',

       ]

bsa_perc_sub = BSA_geom_perc[keep]

In [None]:
bsa_perc_sub.head()

We now rename our subset of percentage columns for easier analysis: a lot of these names are quite long, so we give them shorter names to make them easier to work with.

In [None]:
rename_columns = { 
        'age1864_percentage': 'adults18_64',
        'age65_percentage': 'adults65pl',
        'female_percentage': 'female',
        'male_percentage': 'male',
        'political_interest_yes_percentage': 'polInter_yes',
        'political_interest_no_percentage': 'polInter_no',
        'welfare_pro_percentage': 'welf_pro',
        'welfare_anti_percentage': 'welf_anti',
        'wealth_redist_pro_percentage': 'redist_pro',
        'wealth_redist_anti_percentage': 'redist_anti',
        'lean_left_percentage': 'lean_left',
        'lean_right_percentage': 'lean_right',
        'lean_centrist_percentage': 'lean_centr',
        'rel_christian_percentage': 'rel_christ',
        'rel_nonChristian_percentage': 'rel_nonchrist',
        'rel_none_percentage': 'rel_non',
        'rel_other_percentage': 'rel_other',
        'nat_british_percentage': 'nat_british',
        'nat_english_percentage': 'nat_english',
        'nat_european_percentage': 'nat_euro',
        'nat_irish_percentage': 'nat_irish',
        'nat_scottish_percentage': 'nat_scottish',
        'nat_welsh_percentage': 'nat_welsh',
        'nat_other_none_percentage': 'nat_other',
        'race_white_percentage': 'race_white',
        'race_asian_percentage': 'race_asian',
        'race_mixed_percentage': 'race_mixed',
        'race_other_percentage': 'race_other',
        'disab_affect_percentage': 'disab_aff',
        'disab_no_affect_percentage': 'disab_noAff',
        'disab_none_percentage': 'disab_None',

                 }

bsa_perc_sub = bsa_perc_sub.rename(columns=rename_columns)

In [None]:
bsa_perc_sub.head()

In [None]:
bsa_perc_sub.columns

### Step 5: Z-scores and measuring for association
To finish standardising variables, we next calculate z-scores for each column, saving these into a new dataframe ('bsa_z'). Then we use these to measure for association.

In [None]:
# Only the float64 columns are standardised; other dtypes are left out.
numeric_columns = bsa_perc_sub.select_dtypes(include='float64')

# z = (x - mean) / sd, using the population standard deviation (ddof=0).
column_means = numeric_columns.mean()
column_sds = numeric_columns.std(ddof=0)
bsa_z = numeric_columns.sub(column_means).div(column_sds)
bsa_z.head()

In [None]:
corr_colourmap = bsa_z.corr()
corr_colourmap.style.background_gradient(cmap='coolwarm')

In [None]:
threshold = 0.8

highly_correlated = (corr_colourmap.abs() > threshold) & (corr_colourmap.abs() < 1.0)

plt.figure(figsize=(10, 8))
sns.heatmap(highly_correlated, cmap='coolwarm', cbar=False, annot=True)

plt.title('Highly Correlated Variables')
plt.savefig("data/images/matrix1.png") # saving the output as it's too large to screenshot
plt.show()


As we can see from this graph, there are a lot of highly correlated variables. As discussed within the methods of the report, this can be expected with synthetic data, therefore we will just select the variables we want to include in our clustering regardless of correlations.

In [None]:
# Variables excluded from the clustering. Each label appears once (the original
# list named 'disab_noAff' twice). 'race_other' and 'lean_right' are deliberately
# NOT in this list: they stay in as clustering variables.
dropped_for_clustering = [
    # disability
    'disab_None',
    'disab_noAff',
    # national identity
    'nat_euro',
    'nat_irish',
    'nat_scottish',
    'nat_welsh',
    'nat_other',
    # 'no' / 'anti' counterparts of variables that are kept
    'polInter_no',
    'welf_anti',
    'redist_anti',
    # religion
    'rel_other',
    # age and sex
    'adults18_64',
    'adults65pl',
    'female',
    'male',
]

bsa_z.drop(columns=dropped_for_clustering, inplace=True)



In [None]:
corr_colourmap2 = bsa_z.corr()
corr_colourmap2.style.background_gradient(cmap='coolwarm')

In [None]:
highly_correlated2 = (corr_colourmap2.abs() > threshold) & (corr_colourmap2.abs() < 1.0)

plt.figure(figsize=(10, 8))
sns.heatmap(highly_correlated2, cmap='coolwarm', cbar=False, annot=True)

plt.title('Highly Correlated Variables')
plt.savefig("data/images/matrix2.png") # saving the output as it's too large to screenshot
plt.show()

## Step 6: k-Means clustering
The first step in calculating our clusters is to work out how many clusters would fit our data. We can do this with the elbow method, as shown below:

In [None]:
K_range = range(1, 10)  # candidate numbers of clusters: 1 to 9

# Inertia (within-cluster sum of squared distances) of a seeded k-means fit for each k.
Sum_of_squared_distances = [
    KMeans(n_clusters=k, random_state=0).fit(bsa_z).inertia_
    for k in K_range
]

plt.plot(K_range, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

The elbow method shows that the number of clusters that would best fit our data is 2 clusters, maybe up to 5, as this is where the 'elbow' of the line graph is. We also calculate the between-cluster sum of squares to compare results.

In [None]:
def elbow(dataframe, n): # defining a function for the elbow method
    """Plot the between-cluster sum of squares for k = 1 .. n-1.

    Parameters
    ----------
    dataframe : numeric DataFrame, one row per observation.
    n : exclusive upper bound on k; k-means is fitted for every k in range(1, n).

    Returns
    -------
    None. The function only draws and shows the plot.
    """
    data = dataframe.values
    ks = list(range(1, n))

    # One seeded k-means fit per candidate k.
    fitted = [KMeans(n_clusters=k, random_state=0).fit(data) for k in ks]

    # Within-cluster sum of squares: squared distance from each point to its
    # nearest centre, summed over all points.
    wcss = np.array([
        np.sum(np.min(cdist(data, model.cluster_centers_), axis=1) ** 2)
        for model in fitted
    ])

    # Total sum of squares from the pairwise squared distances divided by the
    # number of rows; the between-cluster part is what the clusters explain.
    tss = np.sum(pdist(data) ** 2) / data.shape[0]
    bss = tss - wcss

    # Plot against the real k values. Plotting bss on its own uses positions
    # 0..n-2 on the x-axis, which shifts every point one cluster to the left.
    plt.plot(ks, bss)
    plt.xlabel('k')
    plt.ylabel('Between-cluster sum of squares')
    plt.title('Elbow Method For Optimal k')
    plt.show()
 
elbow(bsa_z,10)

Comparing the sum of squared distances to the results of the between-cluster sum of squares, the between-cluster sum appears to suggest that around 4 clusters is the peak number. In our first attempt we will go forward with 3 clusters as a first test, and adjust for more tests.

In [None]:
# making a deep copy of the bsa_z dataframe to use to test the use of three clusters
bsa_z3 = bsa_z.copy() # making a deep copy

In [None]:
# Three clusters, with a fixed seed so the run is repeatable.
kmeans3 = KMeans(n_clusters=3, random_state=0).fit(bsa_z3)
labels = kmeans3.predict(bsa_z3)
cluster_centres3 = kmeans3.cluster_centers_

# Store each row's cluster id alongside its z-scores.
bsa_z3['Cluster'] = kmeans3.labels_

In [None]:
bsa_z3.to_csv("data/bsa_z3k.csv", index=False) # saving this dataset with 3 clusters

Next we need to evaluate our cluster groups using PCA

In [None]:
clusters3 = kmeans3.fit_predict(bsa_z3)

bsa_z3['Cluster'] = clusters3

scaler3 = StandardScaler()
stand_data_scaled3 = scaler3.fit_transform(bsa_z3)

# PCA analysis
pca3 = PCA(n_components=2).fit(stand_data_scaled3)
pca_result3 = pca3.transform(stand_data_scaled3)

# calculating percentage variance
variance_ratio3 = pca3.explained_variance_ratio_

In [None]:
# printing a scatter plot for results
plt.figure(figsize=(12, 8))
fig = px.scatter(x=pca_result3[:, 0], y=pca_result3[:, 1], color=clusters3,
                 labels={'color': 'Cluster'},
                 title='York BSA Synthetic Dataset: Cluster Plot against first two Principal Components',
                 opacity=0.7,
                 width=800, 
                 height=800)

plt.tight_layout()
fig.show()

print(f"These two components explain {(variance_ratio3.sum()*100):.2f}% of the point variability.")


In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_result3[:, 0], y=pca_result3[:, 1], hue=clusters3, palette='viridis', s=50, alpha=0.7)
plt.title('Cluster Plot against 1st 2 Principal Components')
plt.xlabel(f'Principal Component 1 variation: {variance_ratio3[0]*100:.2f}%')
plt.ylabel(f'Principal Component 2 variation: {variance_ratio3[1]*100:.2f}%')
plt.legend(title='Clusters')
plt.show()

Interpreting our cluster centres - creating a dataframe with the coordinates of cluster centres

In [None]:
cluster_centers_k3 = pd.DataFrame(kmeans3.cluster_centers_, columns=bsa_z3.columns, )
cluster_centers_k3.to_csv("data/cluster_centers_k3.csv", index=False)
cluster_centers_k3.head()

### Step 7: Looking at the characteristics of our clusters

The plots below are for the three-cluster solution fitted above; we will also test two, four and five clusters to see if we can get a better fit.

In [None]:
# Radar plot of the first k=3 cluster centre (row 0 of cluster_centers_k3).
first_row_centers_k3 = cluster_centers_k3.iloc[0, :]

# saving the length of features
num_features_k3 = len(first_row_centers_k3)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k3 = np.linspace(0, 2 * np.pi, num_features_k3, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k3, first_row_centers_k3, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k3, np.zeros_like(first_row_centers_k3), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k3)
ax.set_xticklabels(cluster_centers_k3.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 1")
plt.show()

In [None]:
# Radar plot of the second k=3 cluster centre (row 1 of cluster_centers_k3).
second_row_centers_k3 = cluster_centers_k3.iloc[1, :]

# saving the length of features
num_features_k3 = len(second_row_centers_k3)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k3 = np.linspace(0, 2 * np.pi, num_features_k3, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k3, second_row_centers_k3, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k3, np.zeros_like(second_row_centers_k3), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k3)
ax.set_xticklabels(cluster_centers_k3.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 2")
plt.show()

In [None]:
# Radar plot of the third k=3 cluster centre (row 2 of cluster_centers_k3).
third_row_centers_k3 = cluster_centers_k3.iloc[2, :]

# saving the length of features
num_features_k3 = len(third_row_centers_k3)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k3 = np.linspace(0, 2 * np.pi, num_features_k3, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k3, third_row_centers_k3, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k3, np.zeros_like(third_row_centers_k3), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k3)
ax.set_xticklabels(cluster_centers_k3.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 3")
plt.show()

## Step 7: Testing different numbers of clusters.

### Test 1: Two Clusters

In [None]:
bsa_z2 = bsa_z.copy() # making a deep copy

In [None]:
# Two clusters, with a fixed seed so the run is repeatable.
kmeans2 = KMeans(n_clusters=2, random_state=0).fit(bsa_z2)
labels = kmeans2.predict(bsa_z2)
cluster_centres2 = kmeans2.cluster_centers_

# Store each row's cluster id alongside its z-scores.
bsa_z2['Cluster'] = kmeans2.labels_

In [None]:
bsa_z2.to_csv("data/bsa_z2k.csv", index=False) # saving this dataset with 2 clusters

In [None]:
clusters2 = kmeans2.fit_predict(bsa_z2)

bsa_z2['Cluster'] = clusters2

# Standardize the data for PCA
scaler2 = StandardScaler()
stand_data_scaled2 = scaler2.fit_transform(bsa_z2)

pca2 = PCA(n_components=2).fit(stand_data_scaled2)
pca_result2 = pca2.transform(stand_data_scaled2)

# calculating percentage variance
variance_ratio2 = pca2.explained_variance_ratio_

In [None]:
# printing a scatter plot for results
plt.figure(figsize=(12, 8))
fig = px.scatter(x=pca_result2[:, 0], y=pca_result2[:, 1], color=clusters2,
                 labels={'color': 'Cluster'},
                 title='York BSA Synthetic Dataset: Cluster Plot against first two Principal Components',
                 opacity=0.7,
                 width=800, 
                 height=800)

plt.tight_layout()
fig.show()

print(f"These two components explain {(variance_ratio2.sum()*100):.2f}% of the point variability.")

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_result2[:, 0], y=pca_result2[:, 1], hue=clusters2, palette='viridis', s=50, alpha=0.7)
plt.title('Cluster Plot against 1st 2 Principal Components')
plt.xlabel(f'Principal Component 1 variation: {variance_ratio2[0]*100:.2f}%')
plt.ylabel(f'Principal Component 2 variation: {variance_ratio2[1]*100:.2f}%')
plt.legend(title='Clusters')
plt.show()

In [None]:
cluster_centers_k2 = pd.DataFrame(kmeans2.cluster_centers_, columns=bsa_z2.columns, )
cluster_centers_k2.to_csv("data/cluster_centers_k2.csv", index=False)
cluster_centers_k2.head()

In [None]:
# Radar plot of the first k=2 cluster centre (row 0 of cluster_centers_k2).
first_row_centers_k2 = cluster_centers_k2.iloc[0, :]

# saving the length of features
num_features_k2 = len(first_row_centers_k2)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k2 = np.linspace(0, 2 * np.pi, num_features_k2, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k2, first_row_centers_k2, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k2, np.zeros_like(first_row_centers_k2), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k2)
ax.set_xticklabels(cluster_centers_k2.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 1")
plt.show()

In [None]:
# Radar plot of the second k=2 cluster centre (row 1 of cluster_centers_k2).
second_row_centers_k2 = cluster_centers_k2.iloc[1, :]

# saving the length of features
num_features_k2 = len(second_row_centers_k2)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k2 = np.linspace(0, 2 * np.pi, num_features_k2, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k2, second_row_centers_k2, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k2, np.zeros_like(second_row_centers_k2), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k2)
ax.set_xticklabels(cluster_centers_k2.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 2")
plt.show()

### Test 2: Four Clusters

In [None]:
bsa_z4 = bsa_z.copy() # making a deep copy

In [None]:
# Four clusters, with a fixed seed so the run is repeatable.
kmeans4 = KMeans(n_clusters=4, random_state=0).fit(bsa_z4)
labels = kmeans4.predict(bsa_z4)
cluster_centres4 = kmeans4.cluster_centers_

# Store each row's cluster id alongside its z-scores.
bsa_z4['Cluster'] = kmeans4.labels_

In [None]:
bsa_z4.to_csv("data/bsa_z4k.csv", index=False) # saving this dataset with 4 clusters

In [None]:
clusters4 = kmeans4.fit_predict(bsa_z4)

bsa_z4['Cluster'] = clusters4

# Standardize the data for PCA
scaler4 = StandardScaler()
stand_data_scaled4 = scaler4.fit_transform(bsa_z4)

pca4 = PCA(n_components=2).fit(stand_data_scaled4)
pca_result4 = pca4.transform(stand_data_scaled4)

# calculating percentage variance
variance_ratio4 = pca4.explained_variance_ratio_

In [None]:
# printing a scatter plot for results
plt.figure(figsize=(12, 8))
fig = px.scatter(x=pca_result4[:, 0], y=pca_result4[:, 1], color=clusters4,
                 labels={'color': 'Cluster'},
                 title='York BSA Synthetic Dataset: Cluster Plot against first two Principal Components',
                 opacity=0.7,
                 width=800, 
                 height=800)

plt.tight_layout()
fig.show()

print(f"These two components explain {(variance_ratio4.sum()*100):.2f}% of the point variability.")

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_result4[:, 0], y=pca_result4[:, 1], hue=clusters4, palette='viridis', s=50, alpha=0.7)
plt.title('Cluster Plot against 1st 2 Principal Components')
plt.xlabel(f'Principal Component 1 variation: {variance_ratio4[0]*100:.2f}%')
plt.ylabel(f'Principal Component 2 variation: {variance_ratio4[1]*100:.2f}%')
plt.legend(title='Clusters')
plt.show()

In [None]:
cluster_centers_k4 = pd.DataFrame(kmeans4.cluster_centers_, columns=bsa_z4.columns, )
cluster_centers_k4.to_csv("data/cluster_centers_k4.csv", index=False)
cluster_centers_k4.head()

In [None]:
# looking at the characteristics of the first cluster
first_row_centers_k4 = cluster_centers_k4.iloc[0, :]

# saving the length of features
num_features_k4 = len(first_row_centers_k4)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k4 = np.linspace(0, 2 * np.pi, num_features_k4, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k4, first_row_centers_k4, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k4, np.zeros_like(first_row_centers_k4), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k4)
ax.set_xticklabels(cluster_centers_k4.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 1")
plt.show()

In [None]:
# looking at the characteristics of the second cluster
second_row_centers_k4 = cluster_centers_k4.iloc[1, :]

# saving the length of features
num_features_k4 = len(second_row_centers_k4)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k4 = np.linspace(0, 2 * np.pi, num_features_k4, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k4, second_row_centers_k4, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k4, np.zeros_like(second_row_centers_k4), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k4)
ax.set_xticklabels(cluster_centers_k4.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 2")
plt.show()

In [None]:
# looking at the characteristics of the third cluster
third_row_centers_k4 = cluster_centers_k4.iloc[2, :]

# saving the length of features
num_features_k4 = len(third_row_centers_k4)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k4 = np.linspace(0, 2 * np.pi, num_features_k4, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k4, third_row_centers_k4, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k4, np.zeros_like(third_row_centers_k4), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k4)
ax.set_xticklabels(cluster_centers_k4.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 3")
plt.show()

In [None]:
# looking at the characteristics of the fourth cluster
forth_row_centers_k4 = cluster_centers_k4.iloc[3, :]

# saving the length of features
num_features_k4 = len(forth_row_centers_k4)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
theta_k4 = np.linspace(0, 2 * np.pi, num_features_k4, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k4, forth_row_centers_k4, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
# (built from this cell's own row; the original borrowed the third cluster's row,
# so the cell failed unless the previous cell had been run first)
ax.plot(theta_k4, np.zeros_like(forth_row_centers_k4), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k4)
ax.set_xticklabels(cluster_centers_k4.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 4")
plt.show()

### Test 3: Five Clusters

Carrying this out after testing clustering without age and sex included.

In [None]:
# Independent copy of the z-scores for the five-cluster run.
bsa_z5 = bsa_z.copy(deep=True)

# Five clusters, with a fixed seed so the run is repeatable.
kmeans5 = KMeans(n_clusters=5, random_state=0).fit(bsa_z5)
labels = kmeans5.predict(bsa_z5)
cluster_centres5 = kmeans5.cluster_centers_

# Store each row's cluster id alongside its z-scores, then save the result.
bsa_z5['Cluster'] = kmeans5.labels_
bsa_z5.to_csv("data/bsa_z5k.csv", index=False)

In [None]:
clusters5 = kmeans5.fit_predict(bsa_z5)

bsa_z5['Cluster'] = clusters5

# Standardize the data for PCA
scaler5 = StandardScaler()
stand_data_scaled5 = scaler5.fit_transform(bsa_z5)

pca5 = PCA(n_components=2).fit(stand_data_scaled5)
pca_result5 = pca5.transform(stand_data_scaled5)

# calculating percentage variance
variance_ratio5 = pca5.explained_variance_ratio_

In [None]:
# printing a scatter plot for results
plt.figure(figsize=(12, 8))
fig = px.scatter(x=pca_result5[:, 0], y=pca_result5[:, 1], color=clusters5,
                 labels={'color': 'Cluster'},
                 title='York BSA Synthetic Dataset: Cluster Plot against first two Principal Components',
                 opacity=0.7,
                 width=800, 
                 height=800)

plt.tight_layout()
fig.show()

print(f"These two components explain {(variance_ratio5.sum()*100):.2f}% of the point variability.")

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_result5[:, 0], y=pca_result5[:, 1], hue=clusters5, palette='viridis', s=50, alpha=0.7)
plt.title('Cluster Plot against 1st 2 Principal Components')
plt.xlabel(f'Principal Component 1 variation: {variance_ratio5[0]*100:.2f}%')
plt.ylabel(f'Principal Component 2 variation: {variance_ratio5[1]*100:.2f}%')
plt.legend(title='Clusters')
plt.show()

In [None]:
cluster_centers_k5 = pd.DataFrame(kmeans5.cluster_centers_, columns=bsa_z5.columns, )
cluster_centers_k5.to_csv("data/cluster_centers_k5.csv", index=False)
cluster_centers_k5.head()

In [None]:
# looking at the characteristics of the first cluster
first_row_centers_k5 = cluster_centers_k5.iloc[0, :]

# saving the length of features
num_features_k5 = len(first_row_centers_k5)

# getting the polar coordinates: one evenly spaced spoke per column.
# endpoint=False keeps the last spoke off 2*pi, which points the same way as the
# first spoke at 0 and would draw two columns (and their labels) on top of each other.
# theta_k5 is reused by the remaining five-cluster radar cells.
theta_k5 = np.linspace(0, 2 * np.pi, num_features_k5, endpoint=False)

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
# plots the centres
ax.plot(theta_k5, first_row_centers_k5, linewidth=1, color='blue', marker='o', label='Centers')

# adding the red line into the plot where the value is 0.0
ax.plot(theta_k5, np.zeros_like(first_row_centers_k5), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k5)
ax.set_xticklabels(cluster_centers_k5.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 1")
plt.show()

In [None]:
# looking at the characteristics of the second cluster
second_row_centers_k5 = cluster_centers_k5.iloc[1, :]

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
ax.plot(theta_k5, second_row_centers_k5, linewidth=1, color='blue', marker='o', label='Centers')
ax.plot(theta_k5, np.zeros_like(second_row_centers_k5), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k5)
ax.set_xticklabels(cluster_centers_k5.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 2")
plt.show()

In [None]:
# looking at the characteristics of the third cluster
third_row_centers_k5 = cluster_centers_k5.iloc[2, :]

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
ax.plot(theta_k5, third_row_centers_k5, linewidth=1, color='blue', marker='o', label='Centers')
ax.plot(theta_k5, np.zeros_like(third_row_centers_k5), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k5)
ax.set_xticklabels(cluster_centers_k5.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 3")
plt.show()

In [None]:
# looking at the characteristics of the forth cluster
forth_row_centers_k5 = cluster_centers_k5.iloc[3, :]

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
ax.plot(theta_k5, forth_row_centers_k5, linewidth=1, color='blue', marker='o', label='Centers')
ax.plot(theta_k5, np.zeros_like(forth_row_centers_k5), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k5)
ax.set_xticklabels(cluster_centers_k5.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 4")
plt.show()

In [None]:
# looking at the characteristics of the fifth cluster
fifth_row_centers_k5 = cluster_centers_k5.iloc[4, :]

fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
ax.plot(theta_k5, fifth_row_centers_k5, linewidth=1, color='blue', marker='o', label='Centers')
ax.plot(theta_k5, np.zeros_like(fifth_row_centers_k5), color='red', linestyle='--', label='Avarage')

ax.set_xticks(theta_k5)
ax.set_xticklabels(cluster_centers_k5.columns, rotation=45, ha='right')
plt.title("Characteristics of Cluster 5")
plt.show()

## Step 8: Mapping the clusters

From our testing we've settled on two clusters, which group areas into 'right-leaning' and 'left-leaning'. The final step of our geodemographic classification is then mapping these clusters. To do this we copy the z-score dataframe from the two-cluster solution, remove all of the z-score columns so that only the cluster label remains, and join that label back onto the aggregate data we started with.

In [None]:
bsa_zscores = bsa_z2.copy()

In [None]:
bsa_zscores.columns

In [None]:
# Keep only the cluster label. Selecting it by name avoids hard-coding the list of
# z-score columns, which raises KeyError as soon as the clustering variables change.
bsa_zscores = bsa_zscores[['Cluster']].copy()

# String labels, so they can be replaced with descriptive names and mapped as categories.
bsa_zscores['Cluster'] = bsa_zscores['Cluster'].astype(str)
bsa_zscores.info()

In [None]:
final_BSA = pd.concat([BSA_geom, bsa_zscores], axis=1, ignore_index=False)
final_BSA.head()

In [None]:
def rename_column(x):
    """Translate a two-cluster label into its descriptive name.

    '0' becomes 'Left-leaning' and '1' becomes 'Right-leaning or Centrist'.
    Matching is on the whole value, so a label that merely contains one of
    those digits (e.g. '10') is not rewritten. Any other value, including a
    missing one, is returned unchanged instead of raising.
    """
    cluster_names = {
        '0': 'Left-leaning',
        '1': 'Right-leaning or Centrist',
    }
    return cluster_names.get(x, x)

final_BSA['Cluster'] = final_BSA['Cluster'].apply(rename_column)

In [None]:
final_BSA.head()

In [None]:
final_BSA.explore(column='Cluster', cmap='bwr', tiles='OpenStreetMap')