-
Notifications
You must be signed in to change notification settings - Fork 0
/
Clustering model.py
255 lines (206 loc) · 9.6 KB
/
Clustering model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# -*- coding: utf-8 -*-
"""
Last modified on 31 July 2020
@author: Dr Sen Hu (UCD)
This file corresponds to SOME PARTS OF the results concerning ONLY Irish census data in the paper:
Hu et al (2020), "A spatial machine learning model for analyzing customers' lapse behaviour in life insurance", Annals of Actuarial Science.
Note that only data set of Dublin is considered for the analysis.
"""
#%%
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#%%
"""
Load the filtered census data set (for Dublin)
Omitted data directory here, fill in your own directory for the data set
And explore the data
"""
census = pd.read_csv('NewCensusData_final_Dublin.csv')
census = census.drop(census.columns[0], axis=1)
census.head()
census.shape
census.columns
census = census.dropna(axis=0, how='any')
census["Age0_4"].describe()
census["Age5_14"].describe()
census["Age25_44"].describe()
census["Age65over"].describe()
census["EU_National"].describe()
census["ROW_National"].describe()
#%%
"""
***********
* Model 2 *
***********
"""
#%%
"""
Phase 1 of Model 2: custering census data using k-medoids
No need to re-scale the census data since all variables are of percnetage unit.
Use the elbow method (elbow plot) to decide the optimum value of K (number of clusters).
"""
from sklearn_extra.cluster import KMedoids
pc_data = census.iloc[:, 0:69]
pc_data.head()
pc_data.describe()
sse = {}
for k in range(1, 21):
kmedoids = KMedoids(n_clusters=k, random_state=10)
kmedoids.fit(pc_data)
sse[k] = kmedoids.inertia_
plt.xlabel('Number of clustering components')
plt.ylabel('Sum of Distance')
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
"""
K=8 is chosen based on the elbow plot.
Note that detailed analysis reasoning is included in the paper.
"""
kmedoids = KMedoids(n_clusters=8, random_state=10)
kmedoids.fit(pc_data)
cluster_labels = kmedoids.labels_
census["cluster1"] = cluster_labels+1
census["cluster1"].value_counts()
census.head()
#%%
"""
Now 8 census clusters are found
We can profile these clusters i.e. finding key characteristics of each cluster.
The first way to profile them is to compare their census summary information across.
"""
# set the color pallette for plotting, consistent with those in the paper.
my_color = {1:"thistle", 2:"burlywood",3:"palegoldenrod",4:"lightpink",5:"paleturquoise",6:"lightgrey",7:"lightsteelblue",8:"darkseagreen"}
# Demographic information
sns.boxplot(x="cluster1", y="Age5_14", data=census, palette=my_color).set(xlabel='Cluster',ylabel='Age 5-14 population percentage')
sns.boxplot(x="cluster1", y="Age25_44", data=census, palette=my_color).set(xlabel='Cluster',ylabel='Age 25-44 population percentage')
sns.boxplot(x="cluster1", y="Age65over", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Age 65 and over population percentage")
sns.boxplot(x="cluster1", y="Born_outside_Ireland", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Born outside Ireland population percentage")
# Household decomposition
sns.boxplot(x="cluster1", y="HouseShare", data=census, palette=my_color).set(xlabel='Cluster',ylabel='House share percentage')
sns.boxplot(x="cluster1", y="NonDependentKids", data=census, palette=my_color).set(xlabel="Cluster",ylabel='Family with non-dependent children percentage') #
sns.boxplot(x="cluster1", y="Dink", data=census, palette=my_color).set(xlabel="Cluster", ylabel='Couple with no children percentage')
sns.boxplot(x="cluster1", y="Married", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Married couple percentage")
# Housing
sns.boxplot(x="cluster1", y="Flats", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Flat dwelling percentage")
sns.boxplot(x="cluster1", y="RentPublic", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Public rent percentage")
sns.boxplot(x="cluster1", y="RentPrivate", data=census, palette=my_color).set(xlabel="Cluster",ylabel="Private rent percentage")
sns.boxplot(x="cluster1", y="Owned", data=census, palette=my_color).set(xlabel="Cluster",ylabel="Housing owned outright percentage")
# Socio-economic information
sns.boxplot(x="cluster1", y="HE", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Third level and above educated percentage")
sns.boxplot(x="cluster1", y="Employed", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Employment percentage")
sns.boxplot(x="cluster1", y="TwoCars", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Ownership of 2 or moe cars percentage")
sns.boxplot(x="cluster1", y="SC_professional", data=census, palette=my_color).set(xlabel="Cluster", ylabel="Professional social class percentage")
# Employment
sns.boxplot(x="cluster1", y="Unemployed", data=census, palette=my_color).set(xlabel="Cluster",ylabel="Unemployment percentage")
# Misc
sns.boxplot(x="cluster1", y="Internet", data=census, palette=my_color).set(xlabel="Cluster",ylabel="Broadband percentage")
#%%
"""
Another way to profile the census clusters is to plot them on a Dublin map
First download the census small area boundary shape files
Then load the shape files
Omitted data directory here, fill in your own directory
"""
import shapefile as shp
sf = shp.Reader("Census2016_Small_Areas_generalised20m")
# explore the data
len(sf.shapes())
sf.shape()
sf.shapes()
dir(sf.shapes()[1])
sf.shapeRecord(1)
sf.record(1)[0:8]
sf.records()
"""
The shapefiles store all information about each small area,
including descriptive information and boundary geometry information
We need to extract such information properly before using it
by reading a shapefile into a pandas dataframe with a coords column holding the boundary geometry information.
"""
def read_shapefile(sf):
fields = [x[0] for x in sf.fields][1:]
records = sf.records()
shps = [s.points for s in sf.shapes()]
df = pd.DataFrame(columns=fields, data=records)
df = df.assign(coords=shps)
return df
Ireland = read_shapefile(sf)
Ireland.shape
Ireland.head()
Ireland.iloc[1,]
Ireland["coords"][1]
Ireland["COUNTYNAME"].value_counts()
# extract only Dublin information
Dublin = Ireland[Ireland['COUNTYNAME'].isin(['Fingal','Dublin City','South Dublin','Dún Laoghaire-Rathdown'])]
Dublin.shape
Dublin.columns
Dublin["SMALL_AREA"]
Dublin["GEOGID"]
"""
First we can plot all the small area boundary data
"""
def plot_map(dat, x_lim = None, y_lim = None, figsize = (9,11)):
plt.figure(figsize = figsize)
for shape in dat["coords"]:
x = [ i[0] for i in shape ]
y = [ i[1] for i in shape ]
plt.plot(x, y, 'k:', alpha=.9)
if (x_lim != None) & (y_lim != None):
plt.xlim(x_lim)
plt.ylim(y_lim)
plot_map(Dublin, figsize=(9,11))
plot_map(Dublin, figsize=(9,11), x_lim=(-6.34, -6.11), y_lim=(53.26,53.41))
"""
Combine the census clustering infomation with the boundary shape data
Then we can fill in colours for each small area depending on its cluster membership
"""
census.SAID
census.SMALL_AREA
census_sub = census[["SAID", "cluster1"]]
plotdat = pd.merge(left=Dublin, right=census_sub, how='left', left_on="SMALL_AREA", right_on="SAID")
plotdat.shape
plotdat["COUNTYNAME"].value_counts()
SouthDublin = plotdat[plotdat["COUNTYNAME"].isin(["South Dublin"])]
DublinCity = plotdat[plotdat["COUNTYNAME"].isin(["Dublin City"])]
Fingal = plotdat[plotdat["COUNTYNAME"].isin(["Fingal"])]
DunLR = plotdat[plotdat["COUNTYNAME"].isin(["Dún Laoghaire-Rathdown"])]
# set the color palette, consistent with the plots in the paper
my_color = ['thistle', 'burlywood', 'palegoldenrod', 'lightpink', 'paleturquoise', 'lightgrey', 'lightsteelblue', 'darkseagreen']
def plot_map_fill_cluster_colour(dat, x_lim = None, y_lim = None,
figsize = (12,15),
legend = True,
color = my_color):
plt.figure(figsize = figsize)
fig, ax = plt.subplots(figsize = figsize)
for shape in dat["coords"]:
x = [ i[0] for i in shape ]
y = [ i[1] for i in shape ]
plt.plot(x, y, 'k--',alpha=0.07)
for c in range(1,9):
#print(c)
tempdat = dat.loc[ dat["cluster1"] == c , ]
for id in range(len(tempdat)):
shape_ex = tempdat["coords"].iloc[id]
x_lon = np.zeros((len(shape_ex),1))
y_lat = np.zeros((len(shape_ex),1))
for ip in range(len(shape_ex)):
x_lon[ip] = shape_ex[ip][0]
y_lat[ip] = shape_ex[ip][1]
ax.fill(x_lon,y_lat, color[(c-1)])
if (legend == True):
f = lambda m,c: ax.plot([],[],marker=m, color=c, ls="none")[0]
handles = [f("s", color[i]) for i in range(8)]
labels = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4','Cluster 5','Cluster 6','Cluster 7','Cluster 8']
ax.legend(handles,labels, loc='lower right',frameon=False,
markerscale=4, fontsize='xx-large', bbox_to_anchor=(1.3,0))
if (x_lim != None) & (y_lim != None):
plt.xlim(x_lim)
plt.ylim(y_lim)
plot_map_fill_cluster_colour(plotdat, figsize=(9, 11), legend=True)
plot_map_fill_cluster_colour(plotdat, x_lim=(-6.34, -6.11), y_lim=(53.26,53.41), figsize=(9,11),legend=False)
plot_map_fill_cluster_colour(SouthDublin, figsize=(9,9),legend=False)
plot_map_fill_cluster_colour(DublinCity, figsize=(9,9),legend=False)
plot_map_fill_cluster_colour(Fingal, figsize=(9,9),legend=False)
plot_map_fill_cluster_colour(DunLR, figsize=(9,9),legend=False)