**Libraries**

In [None]:
import sqlite3
import pandas as pd
import numpy as np

<br>
<br>

## Reading the Data


### Database connection instance

In [None]:
class SQLITE:
    """Thin wrapper that opens connections to a single SQLite database file."""

    def __init__(self, databaseuri: str):
        """
        Remember which database file the connections should point at.

        :param databaseuri: The URI of a database file
        """

        self.databaseuri = databaseuri

    def connecting(self) -> sqlite3.Connection:
        """
        Open a new connection to the database; the caller is responsible for closing it.

        :return: database connection instance
        :raises sqlite3.OperationalError: if the database cannot be opened; the
            underlying sqlite3 error is chained as ``__cause__``
        """

        try:
            return sqlite3.connect(self.databaseuri)
        except sqlite3.Error as err:
            # sqlite3 signals failures with sqlite3.Error subclasses, never with
            # ConnectionError. Re-raise inside the same hierarchy (so existing
            # `except sqlite3.Error` handlers keep working) and name the file.
            raise sqlite3.OperationalError(
                f"Unable to connect to '{self.databaseuri}': {err}") from err

In [None]:
# Database wrapper for the gpinhours.sqlite file; the relative path resolves
# against the current working directory.
sqlite = SQLITE(databaseuri='../data/gpinhours.sqlite')

<br>
<br>

### Data reading

In [None]:
# Load the whole `inhours` table into a DataFrame.
query = 'SELECT * FROM inhours'
connection = sqlite.connecting()
try:
    inhours = pd.read_sql_query(query, connection)
finally:
    # Release the connection even when the query fails.
    connection.close()

In [None]:
# Summarise the columns, non-null counts and dtypes of the loaded table.
inhours.info()

<br>
<br>

## Data Cleaning

In [None]:
# Preview the first rows before any cleaning is applied.
inhours.head()

<br>
<br>

### Fix the population count

<br>

**Create function**

In [None]:
def as_number_or_nan(x):
    """
    Convert the input to numeric values.

    :param x: the value(s) to convert; handed straight to ``pandas.to_numeric``
    :return: the numeric result, with entries that cannot be parsed set to NaN
    """

    converted = pd.to_numeric(arg=x, errors='coerce')
    return converted
    

In [None]:
# Numeric copy of TOTAL_POP; values that cannot be parsed become NaN.
inhours['POP'] = as_number_or_nan(x=inhours['TOTAL_POP'])

In [None]:
# Preview the first rows, now including the POP column.
inhours.head()

<br>

**Missing values**

In [None]:
# Number of rows per NAME whose POP is NaN.
inhours.loc[inhours['POP'].isna(), 'NAME'].value_counts()

<br>
<br>

### Fix the date

In [None]:
# `lastdate` is read as a count of days since the Unix epoch (1970-01-01).
inhours['date'] = pd.to_datetime(inhours['lastdate'], unit='D', origin='unix')

In [None]:
# Summarise the date range. `datetime_is_numeric` exists only in pandas 1.1-1.5;
# pandas 2.0 removed it (datetimes are always summarised numerically there) and
# raises TypeError when it is passed, so fall back to the plain call.
try:
    date_summary = inhours['date'].describe(datetime_is_numeric=True)
except TypeError:
    date_summary = inhours['date'].describe()
date_summary

<br>
<br>

### Trim the years

In [None]:
# Calendar year of each record, taken from the parsed date.
inhours['year'] = inhours['date'].dt.year

In [None]:
# Number of rows per year.
inhours['year'].value_counts()

In [None]:
# (rows, columns) before the 2020 rows are removed.
inhours.shape

<br>

Reduce: drop the records dated 2020.

In [None]:
# Keep every row whose year is not 2020.
inhours = inhours[inhours['year'].ne(2020)]

In [None]:
# Rows per year after the filter; 2020 should no longer appear.
inhours['year'].value_counts()

In [None]:
# (rows, columns) after the 2020 rows are removed.
inhours.shape

<br>
<br>

### Drop zero and missing populations

In [None]:
# Discard rows with a population of exactly zero (NaN rows are still kept here).
inhours = inhours[inhours['POP'].ne(0)]

In [None]:
# Discard rows whose population is NaN.
inhours = inhours[inhours['POP'].notna()]

<br>
<br>

### Final clean data

In [None]:
# Display the frame as it stands after the year and population filters.
inhours

<br>
<br>

## North South Divide

<br>

### Read the table

In [None]:
# Load the whole `localauth` table into a DataFrame.
query = 'SELECT * FROM localauth'
connection = sqlite.connecting()
try:
    localauth = pd.read_sql_query(query, connection)
finally:
    # Release the connection even when the query fails.
    connection.close()

In [None]:
# Summarise the columns, non-null counts and dtypes of the lookup table.
localauth.info()

In [None]:
# Preview the first rows of the lookup table.
localauth.head()

<br>

### Merge with GP data

In [None]:
# Left join: every row of `inhours` is kept and gains the `localauth` columns
# that share its NAME.
# NOTE(review): a NAME that occurs more than once in `localauth` would duplicate
# the matching `inhours` rows - confirm NAME is unique there.
inhoursNS = pd.merge(inhours, localauth, how='left', on='NAME')

In [None]:
# Display the merged frame.
inhoursNS

<br>
<br>

### Group and aggregate

In [None]:
# Grouping keys first, then the columns that get summed.
fields = [
    'NS', 'year',
    'POP',
    'Flu_OBS', 'Vom_OBS', 'Diarr_OBS', 'Gastro_OBS',
]

In [None]:
# Sum the population and every observation count within each (NS, year) group.
#
# Either
# inhoursNS[fields].groupby(by=['NS', 'year']).agg('sum')
#
# Or, with named aggregation, so that the summed population is called `Total`.
# The aggregation is spelled as the string 'sum': passing the builtin `sum`
# is deprecated in pandas >= 2.1 and emits a FutureWarning.
aggNS = inhoursNS[fields].groupby(by=['NS', 'year']).agg(
    Total=('POP', 'sum'),
    Flu_OBS=('Flu_OBS', 'sum'),
    Vom_OBS=('Vom_OBS', 'sum'),
    Diarr_OBS=('Diarr_OBS', 'sum'),
    Gastro_OBS=('Gastro_OBS', 'sum'),
)
aggNS

<br>
<br>

### Consultation rates