## Preliminaries

### Paths

In [1]:
import os
import pathlib
import sys

In [2]:
# Outside Google Colab, make the parent of the current working directory importable.
if 'google.colab' not in str(get_ipython()):

    notebooks = os.getcwd()
    parent = str(pathlib.Path(notebooks).parent)

    # Guard the append so that re-running this cell does not pile duplicate
    # entries onto sys.path.
    if parent not in sys.path:
        sys.path.append(parent)

<br>
<br>

### Libraries

In [3]:
%matplotlib inline

import logging
import collections

import numpy as np
import pandas as pd
import xlrd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns


<br>

### Logging

In [4]:
logging.basicConfig(level=logging.INFO,
                    format='\n%(message)s\n%(asctime)s.%(msecs)03d\n',
                    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)

<br>

## Data

<br>

Exploring the single-year-of-age population estimates for the year 2012.

In [5]:
# Year of the population data to explore; kept as text because it is
# substituted into the data file's name.
year = '2012'

<br>

### Population

In [6]:
# Location of the single-year-of-age populations file for the chosen year.
uri = '../../warehouse/populations/msoa/single/{year}.csv'.format(year=year)

# pandas.read_csv reports a missing or unreadable file through OSError
# subclasses (e.g. FileNotFoundError), and empty or malformed content through
# ValueError subclasses (EmptyDataError, ParserError, UnicodeDecodeError); it
# does not raise RuntimeError. The original exception is chained so that the
# traceback still shows the underlying cause.
try:
    populations = pd.read_csv(filepath_or_buffer=uri, header=0, encoding='utf-8')
except (OSError, ValueError) as err:
    raise Exception('Unable to read the populations file {uri}'.format(uri=uri)) from err

populations.head()

Unnamed: 0,msoa,ltla,sex,0,1,2,3,4,5,6,...,81,82,83,84,85,86,87,88,89,90+
0,E02004297,E06000047,female,48,37,32,44,44,40,39,...,29,19,16,23,17,14,14,13,9,29
1,E02004290,E06000047,female,34,30,38,26,33,40,28,...,12,11,7,8,3,8,6,4,5,16
2,E02004298,E06000047,female,42,50,49,54,48,53,48,...,34,27,26,16,32,21,18,24,10,73
3,E02004299,E06000047,female,52,47,42,40,57,44,38,...,39,22,32,16,13,27,26,24,16,55
4,E02004291,E06000047,female,38,45,49,35,41,32,29,...,25,16,25,21,13,15,8,11,7,42


<br>
<br>

## Aggregates: Uneven

Ages 12 $\rightarrow$ 24, in age groups of unequal width


### *function*

In [7]:
def uneven(populations: pd.DataFrame) -> pd.DataFrame:
    """
    Sums the single-year age columns '12' ... '24' into the unequal-width
    age groups 12-15, 16-17 and 18-24.

    :param populations: A frame that includes one column per single year of age;
                        the columns named '12', '13', ..., '24' are read.
    :return: A new frame with the columns '12-15', '16-17', '18-24', one row per
             row of `populations`, and the same row index. `populations` is
             not modified.
    :raises KeyError: If any of the columns '12' ... '24' is missing.
    """

    agegroups = ['12-15', '16-17', '18-24']

    # Plain Python integers, so that the logged lists read [12, 13, ...] rather
    # than a list of NumPy scalar representations.
    ages = list(range(12, 25))
    logger.info(ages)

    agestext = [str(age) for age in ages]
    logger.info(agestext)

    # Each age's position within `agegroups`: 0 -> 12-15, 1 -> 16-17, 2 -> 18-24
    labels = [0 if age < 16 else (2 if age > 17 else 1) for age in ages]
    logger.info(labels)

    # Row-wise sum of each group's columns. This avoids groupby(..., axis=1),
    # which is deprecated in recent pandas, and set_axis(..., inplace=True),
    # whose `inplace` option no longer exists in pandas 2.
    groups = pd.DataFrame(
        data={
            agegroup: populations[
                [text for text, label in zip(agestext, labels) if label == code]
            ].sum(axis=1)
            for code, agegroup in enumerate(agegroups)
        }
    )

    return groups
    
    

<br>

### *aggregates*

In [8]:
# Age groups built from the single-year columns for ages 12 to 24
initial = uneven(populations=populations)
initial.head()


[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
2022-01-10 16:03:05.651


['12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24']
2022-01-10 16:03:05.664


[0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2]
2022-01-10 16:03:05.666



Unnamed: 0,12-15,16-17,18-24
0,151,92,260
1,123,63,212
2,223,115,274
3,146,88,340
4,163,74,292


<br>

## Aggregates: Standard

Ages 25 $\rightarrow$ 89


### *function*

In [9]:
def standard(populations: pd.DataFrame) -> pd.DataFrame:
    """
    Sums the single-year age columns '25' ... '89' into consecutive five-year
    age groups: 25-29, 30-34, ..., 85-89.

    :param populations: A frame that includes one column per single year of age;
                        the columns named '25', '26', ..., '89' are read.
    :return: A new frame with one column per five-year age group, one row per
             row of `populations`, and the same row index. `populations` is
             not modified.
    :raises KeyError: If any of the columns '25' ... '89' is missing.
    """

    agegroups = ['25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64',
                 '65-69', '70-74', '75-79', '80-84', '85-89']
    agegrouplength = 5

    # Plain Python integers, so that the logged lists read [25, 26, ...] rather
    # than a list of NumPy scalar representations.
    ages = list(range(25, 90))
    logger.info(ages)

    agestext = [str(age) for age in ages]
    logger.info(agestext)

    # Each age's position within `agegroups`: every run of `agegrouplength`
    # consecutive ages shares one label.
    labels = [index // agegrouplength for index in range(len(agestext))]
    logger.info(labels)

    # Row-wise sum of each group's columns. This avoids groupby(..., axis=1),
    # which is deprecated in recent pandas, and set_axis(..., inplace=True),
    # whose `inplace` option no longer exists in pandas 2.
    groups = pd.DataFrame(
        data={
            agegroup: populations[
                [text for text, label in zip(agestext, labels) if label == code]
            ].sum(axis=1)
            for code, agegroup in enumerate(agegroups)
        }
    )

    return groups

<br>

### *aggregates*

In [10]:
# Five-year age groups built from the single-year columns for ages 25 to 89
inbetween = standard(populations=populations)
inbetween.head()


[25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
2022-01-10 16:03:05.712


['25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
2022-01-10 16:03:05.713


[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12]
2022-01-10 16:03:05.714



Unnamed: 0,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85-89
0,234,255,246,286,345,321,256,269,250,178,152,117,67
1,161,175,186,224,282,214,231,220,196,133,107,52,26
2,214,242,314,366,441,378,358,380,291,250,208,134,105
3,246,273,219,330,308,315,295,268,266,180,154,136,106
4,204,210,185,251,303,244,216,204,205,179,133,107,54


<br>
<br>

## Merge

In [11]:
# Place the identifying fields, the two sets of aggregated age groups, and the
# open-ended 90+ column side by side; the pieces are aligned on the row index.
reference = pd.concat(
    [
        populations[['msoa', 'ltla', 'sex']],
        initial,
        inbetween,
        populations[['90+']],
    ],
    axis=1,
    ignore_index=False,
)
reference.head()

Unnamed: 0,msoa,ltla,sex,12-15,16-17,18-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85-89,90+
0,E02004297,E06000047,female,151,92,260,234,255,246,286,345,321,256,269,250,178,152,117,67,29
1,E02004290,E06000047,female,123,63,212,161,175,186,224,282,214,231,220,196,133,107,52,26,16
2,E02004298,E06000047,female,223,115,274,214,242,314,366,441,378,358,380,291,250,208,134,105,73
3,E02004299,E06000047,female,146,88,340,246,273,219,330,308,315,295,268,266,180,154,136,106,55
4,E02004291,E06000047,female,163,74,292,204,210,185,251,303,244,216,204,205,179,133,107,54,42
