# GEOG5990M Final Assignment (Template)

Student ID number: 201746746

In [15]:
# read in required packages
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

First, load the occupation data, the fuel poverty data and the Leeds boundary file. Then keep only the rows for the Leeds area and drop any rows with missing values.

In [16]:
# Load the raw inputs: census occupation counts, fuel poverty statistics and
# the Leeds boundary geometries.
occupation_data = pd.read_csv('census2021_ts063_lsoa.csv')
fuel_poverty_data = pd.read_csv('Fuel_poverty_by_LSOA.csv')
leeds_geo = gpd.read_file('Leeds.geojson')

# Data preview. DataFrame.info() prints its summary itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
occupation_data.info()
fuel_poverty_data.info()
leeds_geo.info()

# Data cleaning: keep only rows whose area name contains "Leeds" (rows with a
# missing name are excluded by na=False) and drop rows with any missing value.
# The chain ends with .copy() so each result is an independent frame; adding
# columns to it later does not trigger SettingWithCopyWarning, and the raw
# frames above are left untouched.
leeds_occupation_data = (
    occupation_data[occupation_data['geography'].str.contains("Leeds", na=False)]
    .dropna()
    .copy()
)
leeds_fuel_poverty_data = (
    fuel_poverty_data[fuel_poverty_data['LSOA Name'].str.contains("Leeds", na=False)]
    .dropna()
    .copy()
)

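Calculate the percentage of employed residents in each LSOA who work in low-income occupations (sales and customer service; process, plant and machine operatives; elementary occupations).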

In [None]:
# Occupation groups treated as "low income" in this analysis.
low_income_occupation_columns = [
    'Occupation (current): 7. Sales and customer service occupations',
    'Occupation (current): 8. Process, plant and machine operatives',
    'Occupation (current): 9. Elementary occupations'
]

# Denominator column: the total used to turn each occupation count into a percentage.
total_employed_column = 'Occupation (current): Total: All usual residents aged 16 years and over in employment the week before the census'

# Work on an explicit copy so the new columns are written to an independent
# frame rather than to a filtered slice of the raw census table (which would
# raise SettingWithCopyWarning).
leeds_occupation_data = leeds_occupation_data.copy()

# Convert each occupation count into a percentage of the total.
total_employed = leeds_occupation_data[total_employed_column]
percentage_columns = []
for column in low_income_occupation_columns:
    percentage_column = column + ' (%)'
    leeds_occupation_data[percentage_column] = leeds_occupation_data[column] / total_employed * 100
    percentage_columns.append(percentage_column)

# Combined share: row-wise sum of the three occupation percentages.
leeds_occupation_data['Low Income Occupations (%)'] = leeds_occupation_data[percentage_columns].sum(axis=1)

The occupation data and fuel poverty data are merged on LSOA code. A combined index is then calculated as the product of the percentage of low-income occupations and the percentage of fuel-poor households, and min-max normalised to the range 0–1.

In [None]:
# Join the occupation and fuel poverty tables on their LSOA code columns.
merged_data = leeds_occupation_data.merge(
    leeds_fuel_poverty_data,
    left_on='geography code',
    right_on='LSOA Code',
)

# Combined index: fuel poverty percentage multiplied by low-income occupation percentage.
fuel_poor_share = merged_data['Proportion of households fuel poor (%)']
low_income_share = merged_data['Low Income Occupations (%)']
merged_data['Poverty and Fuel Poverty Index'] = fuel_poor_share * low_income_share

# Rescale the index with MinMaxScaler; the scaler expects a 2-D input, hence
# the double brackets when selecting the column.
scaler = MinMaxScaler()
raw_index = merged_data[['Poverty and Fuel Poverty Index']]
merged_data['Normalized Poverty and Fuel Poverty Index'] = scaler.fit_transform(raw_index)

# Preview the raw and normalized index for the first few rows.
preview_columns = [
    'geography',
    'Poverty and Fuel Poverty Index',
    'Normalized Poverty and Fuel Poverty Index',
]
print(merged_data[preview_columns].head())

K-means clustering was used to classify the data into three categories, based on the proportion of low-income occupations and the proportion of fuel poverty.

In [None]:
# K-means clustering analysis on the two percentage variables.
cluster_feature_columns = [
    'Low Income Occupations (%)',
    'Proportion of households fuel poor (%)',
]

# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4, so leaving it implicit makes the cluster assignment depend on
# the installed version (and emits a FutureWarning on 1.2-1.3). random_state
# fixes the centroid initialisation so the labels are reproducible.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
merged_data['Cluster'] = kmeans.fit_predict(merged_data[cluster_feature_columns])

This is an example block of markdown text I want to reference <a href="#ref1">[1]</a>. I might need to add some more citations <a href="#ref2">[2]</a><a href="#ref3">[3]</a>.



Data Visualisation
- Don't forget to present your final two data visualisations (one spatial and one non-spatial) and the justifications for the decisions you made whilst preparing and visualising the data.

Non-spatial visualization: Use scatter plots to display K-means clustering results.

In [None]:
# Non-spatial visualization of K-means clustering results:
# one point per row of merged_data, coloured by its assigned cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged_data,
    x='Low Income Occupations (%)',
    y='Proportion of households fuel poor (%)',
    hue='Cluster',
    palette='viridis',
    ax=ax,
)
ax.set_xlabel('Low Income Occupations (%)')
ax.set_ylabel('Proportion of Households Fuel Poor (%)')
ax.set_title('K-means Clustering Results: Low Income Occupations vs. Fuel Poverty')
ax.legend(title='Cluster')
plt.show()

Spatial visualization: Create three maps showing the normalized index, the proportion of low-income occupations and the proportion of fuel poverty.

In [None]:
def plot_leeds_choropleth(geo_frame, column, legend_label, title):
    """Draw one choropleth map of `column` from `geo_frame` and show it.

    Parameters
    ----------
    geo_frame : geopandas.GeoDataFrame
        Geometries together with the attribute to be mapped.
    column : str
        Name of the column whose values colour the polygons.
    legend_label : str
        Text shown next to the horizontal colour bar.
    title : str
        Title placed above the map.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))
    geo_frame.plot(column=column, ax=ax, legend=True,
                   legend_kwds={'label': legend_label,
                                'orientation': "horizontal"})
    ax.set_title(title)
    plt.show()


# Merge the combined occupation / fuel poverty table onto the Leeds geometries.
# The result is stored under a new name instead of overwriting `leeds_geo`, so
# the cell can be re-run safely: merging an already-merged frame again would
# create `_x` / `_y` suffixed columns and the plots below would fail.
leeds_geo_merged = leeds_geo.merge(merged_data, left_on='LSOA21CD', right_on='LSOA Code')

# (column to map, colour-bar label, map title) for each of the three maps:
# the normalized index, the low-income occupation share and the fuel poverty share.
map_specs = [
    ('Normalized Poverty and Fuel Poverty Index',
     "Normalized Poverty and Fuel Poverty Index",
     'Spatial Distribution of Normalized Poverty and Fuel Poverty Index in Leeds'),
    ('Low Income Occupations (%)',
     "Low Income Occupations (%)",
     'Spatial Distribution of Low Income Occupations (%) in Leeds'),
    ('Proportion of households fuel poor (%)',
     "Proportion of Households Fuel Poor (%)",
     'Spatial Distribution of Fuel Poverty Proportion in Leeds'),
]

for map_column, map_legend_label, map_title in map_specs:
    plot_leeds_choropleth(leeds_geo_merged, map_column, map_legend_label, map_title)

## References
<p id="ref1"><a href="add_url_here">[1]</a> Footnote citation goes here</p>
<p id="ref2"><a href="https://github.com/FrancescaPontin/GEOG5990">[2]</a> GEOG5990M course materials</p>
<p id="ref3"><a href="add_url_here">[3]</a> Citation 3</p>