-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_wimd_msoa.R
149 lines (111 loc) · 6.54 KB
/
generate_wimd_msoa.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
library(tidyverse)
lsoa_wimd <- readODS::read_ods("data/wimd-2019-index-and-domain-scores-by-small-area_0.ods", sheet=2, skip=2) %>% tibble() %>%
mutate(`WIMD 2019 Decile` = ntile(`WIMD 2019`, 10))
lsoa_pops <- readxl::read_xlsx("data/SAPE21DT1a-mid-2018-on-2019-LA-lsoa-syoa-estimates-formatted.xlsx", sheet=4, skip=3) %>%
filter(`Area Codes` %>% str_detect("^W01")) %>%
rename(`LSOA Code` = `Area Codes`)
lsoa_wimd <- lsoa_wimd %>%
left_join(lsoa_pops %>% select(`LSOA Code`, `All Ages`), by="LSOA Code")
stopifnot(lsoa_wimd %>% filter(is.na(`All Ages`)) %>% nrow() == 0 )
lsoa_to_msoa <- readODS::read_ods("data/Geography lookups - match LSOA to different geography groups.ods", sheet=2) %>%
tibble() %>%
rename(
`LSOA Code` = `Lower Layer Super Output Area (LSOA) Code`,
`MSOA Code` = `Middle Layer Super Output Area (MSOA) Code`,
`MSOA Name` = `Middle Layer Super Output Area (MSOA) Name`,
)
print("Source data loaded.")
lsoa_wimd <- lsoa_wimd %>%
left_join(lsoa_to_msoa %>% select(`LSOA Code`, `MSOA Code`, `MSOA Name`), by="LSOA Code")
stopifnot(lsoa_wimd %>% filter(is.na(`MSOA Code`)) %>% nrow() == 0 )
msoa_wimd <- lsoa_wimd %>%
mutate(Pop.weighted.WIMD = `WIMD 2019` * `All Ages`) %>%
group_by(`MSOA Code`, `MSOA Name`) %>%
summarise(
`Total population` = sum(`All Ages`),
`Pop-weighted WIMD score 2019` = sum(Pop.weighted.WIMD)/(`Total population`),
`Local Authority English Name` = first(`Local Authority Name`),
`Proportion of Population in 10% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 10))/(`Total population`),
`Proportion of Population in 20% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 9))/(`Total population`),
`Proportion of Population in 30% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 8))/(`Total population`),
`Proportion of Population in 40% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 7))/(`Total population`),
`Proportion of Population in 50% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 6))/(`Total population`),
`Proportion of Population in 60% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 5))/(`Total population`),
`Proportion of Population in 70% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 4))/(`Total population`),
`Proportion of Population in 80% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 3))/(`Total population`),
`Proportion of Population in 90% Most Deprived LSOAs` = sum(`All Ages`*(`WIMD 2019 Decile` >= 2))/(`Total population`)
) %>%
ungroup()
#
# Repeated re-ranking
#
msoa_count <- msoa_wimd %>% nrow()
weight = msoa_count/10
msoa_wimd <- msoa_wimd %>% mutate(
CombinedRankScore = rank(`Proportion of Population in 50% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count/weight * CombinedRankScore) + rank(`Proportion of Population in 30% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count/weight * CombinedRankScore) + rank(`Proportion of Population in 70% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count/weight * CombinedRankScore) + rank(`Proportion of Population in 20% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count/weight * CombinedRankScore) + rank(`Proportion of Population in 80% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count/weight * CombinedRankScore) + rank(`Proportion of Population in 10% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count/weight * CombinedRankScore) + rank(`Proportion of Population in 70% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count/weight * CombinedRankScore) + rank(`Proportion of Population in 40% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count/weight * CombinedRankScore) + rank(`Proportion of Population in 60% Most Deprived LSOAs`, ties.method='min'),
CombinedRankScore = rank(CombinedRankScore, ties.method='min'),
CombinedRankScore = (msoa_count * CombinedRankScore) + rank(`Pop-weighted WIMD score 2019`, ties.method='min'),
`Pseudo-WIMD 2019 rank` = rank(CombinedRankScore),
CombinedRankScore = NULL,
)
#
# Ntile comparisons
#
ntiles_n = 5
for_ntile_comparisons <- msoa_wimd %>%
mutate(`Pseudo-WIMD 2019 rank ntile` = ntile(`Pseudo-WIMD 2019 rank`, ntiles_n)) %>%
pivot_longer(starts_with('Proportion of Population')) %>%
group_by(name) %>%
mutate(
value_ntile = ntile(value, ntiles_n)
) %>%
ungroup() %>%
filter(!value %in% c(0,1)) %>%
mutate(value = value_ntile, value_ntile = NULL) %>%
group_by(`Pseudo-WIMD 2019 rank ntile`, name, value) %>% count() %>%
mutate(
name = name %>% str_replace_all("Proportion of Population in", "…"),
discrepancy = abs(value-`Pseudo-WIMD 2019 rank ntile`)
)
ntile_comparison_plot <- ggplot(for_ntile_comparisons, aes(x=`Pseudo-WIMD 2019 rank ntile`, y=`value`, size=n, colour=discrepancy)) +
facet_wrap(.~name) +
geom_point() + theme_minimal() +
scale_x_continuous(breaks=1:ntiles_n, minor_breaks=c(), expand=expansion(add=0.5)) +
scale_y_continuous(breaks=1:ntiles_n, minor_breaks=c(), expand=expansion(add=0.5)) +
labs(
x = "Pseudo-WIMD 2019 rank ntile",
y = "MSOA ntile based on % population resident in …"
) + scale_color_gradient(low='blue', high='red')
ggsave('output/ntile_comparison.png', ntile_comparison_plot, width=7.5, height=7, units='in')
#
# End ntile comparison
#
msoa_hocl_names <- read_csv("data/MSOA-Names-v1.1.0.csv") %>%
filter(msoa11cd %>% str_detect("^W")) %>%
rename(
`MSOA Code` = msoa11cd,
`MSOA HoCL English Name` = msoa11hclnm,
`MSOA HoCl Welsh Name` = msoa11hclnmw
)
msoa_wimd <- msoa_wimd %>%
left_join(
msoa_hocl_names %>% select(`MSOA Code`, `MSOA HoCL English Name`, `MSOA HoCl Welsh Name`),
by = "MSOA Code"
)
msoa_wimd %>% mutate(`Pseudo-WIMD 2019 rank` = msoa_count-`Pseudo-WIMD 2019 rank`+1) %>% arrange(`Pseudo-WIMD 2019 rank`) %>% write_csv("output/wimd_msoa.csv")