**Libraries**

In [None]:
import sqlite3
import pandas as pd
import numpy as np

import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns

<br>

**Settings**

In [None]:
# seaborn's default theme at the standard font scale
sns.set(font_scale=1)
# switch matplotlib to ArviZ's dark-grid style sheet
az.style.use('arviz-darkgrid')

<br>
<br>

## Reading the Data


### Database connection instance

In [None]:
class SQLITE:
    """
    Small helper that opens connections to one SQLite database file.
    """

    def __init__(self, databaseuri: str):
        """

        :param databaseuri: The URI of a database file
        """

        self.databaseuri = databaseuri

    def connecting(self):
        """
        Opens a new connection to the database at ``self.databaseuri``.  The caller
        owns the connection and is responsible for closing it.

        :return: database connection instance
        :raises sqlite3.OperationalError: if the database cannot be opened; the message
            names the database and the underlying sqlite3 error is chained as the cause
        """

        try:
            return sqlite3.connect(self.databaseuri)
        except sqlite3.Error as err:
            # sqlite3 reports failures via sqlite3.Error (not ConnectionError); re-raise
            # within the same exception family so existing handlers keep working, and
            # chain with `from` so the original traceback is preserved.
            raise sqlite3.OperationalError(
                'Unable to connect to {}: {}'.format(self.databaseuri, err)) from err

In [None]:
# connection helper for the project's SQLite file; the path is relative to the working directory
sqlite = SQLITE(databaseuri='../data/gpinhours.sqlite')

<br>
<br>

### Data reading

In [None]:
# read the whole `inhours` table into a data frame
query = 'SELECT * FROM inhours'
connection = sqlite.connecting()
try:
    inhours = pd.read_sql_query(query, connection)
finally:
    # release the database handle even if the query fails
    connection.close()

In [None]:
# column names, non-null counts and dtypes of the table just read
inhours.info()

<br>
<br>

## Data Cleaning

In [None]:
# first five rows, before any cleaning step
inhours.head()

<br>
<br>

### Fix the population count

<br>

**Create function**

In [None]:
def as_number_or_nan(x):
    """
    
    :param x: a scalar or list-like of values to convert
    :return: the numeric form of x; values that cannot be parsed as numbers become NaN
    """
    
    converted = pd.to_numeric(x, errors='coerce')
    
    return converted
    

In [None]:
# numeric population count; unparseable TOTAL_POP entries become NaN
inhours.loc[:, 'POP'] = as_number_or_nan(x=inhours['TOTAL_POP'])

In [None]:
# first five rows, now including the numeric POP column
inhours.head()

<br>

**Missing values**

In [None]:
# per NAME, the number of records whose population count is missing
inhours.loc[inhours['POP'].isna(), 'NAME'].value_counts()

<br>
<br>

### Fix the date

In [None]:
# `lastdate` is read as a count of days since the Unix epoch (1970-01-01)
inhours.loc[:, 'date'] = pd.to_datetime(inhours['lastdate'], origin='unix', unit='D')

In [None]:
# `datetime_is_numeric` exists only in pandas 1.1-1.5.  From pandas 2.0 the argument
# is gone (datetimes are always summarised numerically) and passing it raises a
# TypeError, so fall back to the bare call there.
try:
    date_summary = inhours.date.describe(datetime_is_numeric=True)
except TypeError:
    date_summary = inhours.date.describe()
date_summary

<br>
<br>

### Trim the years

In [None]:
# calendar year of each record's date
inhours.loc[:, 'year'] = inhours['date'].dt.year

In [None]:
# number of records per calendar year
inhours.year.value_counts()

In [None]:
# (rows, columns) before the 2020 records are excluded
inhours.shape

<br>

Exclude the records of the year 2020

In [None]:
# keep every record whose year is not 2020
inhours = inhours.loc[inhours['year'].ne(2020), :]

In [None]:
# records per year after the filter; 2020 should no longer be listed
inhours.year.value_counts()

In [None]:
# (rows, columns) after the 2020 records are excluded
inhours.shape

<br>
<br>

### Drop zero populations

In [None]:
# a zero population cannot serve as a rate denominator
inhours = inhours.loc[inhours['POP'].ne(0), :]

In [None]:
# likewise discard the records whose population count is missing
inhours = inhours.loc[inhours['POP'].notna(), :]

<br>
<br>

### Final clean data

In [None]:
# the cleaned table: no 2020 records, no zero or missing populations
inhours

<br>
<br>

## North South Divide


### Read the table

In [None]:
# read the whole `localauth` table into a data frame
query = 'SELECT * FROM localauth'
connection = sqlite.connecting()
try:
    localauth = pd.read_sql_query(query, connection)
finally:
    # release the database handle even if the query fails
    connection.close()

In [None]:
# column names, non-null counts and dtypes of the local authority table
localauth.info()

In [None]:
# first five rows of the local authority table
localauth.head()

<br>

### Merge with GP data

In [None]:
# left join: every GP record is kept, and gains the local authority columns matched on NAME
inhoursNS = pd.merge(left=inhours, right=localauth, on='NAME', how='left')

In [None]:
# the GP records with the local authority columns attached
inhoursNS

<br>
<br>

### Group and aggregate

In [None]:
# grouping keys, the population count, and the four observation counts
fields = [
    'NS', 'year',
    'POP',
    'Flu_OBS', 'Vom_OBS', 'Diarr_OBS', 'Gastro_OBS',
]

In [None]:
# Totals per (NS, year).  The aggregations are named by string: passing the builtin
# `sum` is deprecated in recent pandas releases, and a future release will call it
# directly instead of dispatching to the NaN-skipping groupby sum.
aggNS = inhoursNS[fields].groupby(by=['NS', 'year']).agg(Total=('POP', 'sum'),
                                                         Flu_OBS=('Flu_OBS', 'sum'),
                                                         Vom_OBS=('Vom_OBS', 'sum'),
                                                         Diarr_OBS=('Diarr_OBS', 'sum'),
                                                         Gastro_OBS=('Gastro_OBS', 'sum'))
aggNS

<br>
<br>

### Consultation rates

In [None]:
classes = ['Flu_OBS', 'Vom_OBS', 'Diarr_OBS', 'Gastro_OBS']

# Rates per 100,000: divide each observation column by the Total of the same row.
# `.div(..., axis='index')` matches on the row index; a NumPy ufunc applied to two
# data frames aligns their column labels in pandas >= 2.0, which would give all-NaN here.
rates = 100000 * aggNS[classes].div(aggNS['Total'], axis='index')

# plain column assignment, so re-running the cell overwrites rather than duplicates
aggNS[['Flu_rate', 'Vom_rate', 'Diarr_rate', 'Gastro_rate']] = rates.to_numpy()

In [None]:
# the four consultation rates per 100,000, by NS and year
aggNS[['Flu_rate', 'Vom_rate', 'Diarr_rate', 'Gastro_rate']]

<br>
<br>

### Plot the Rates

In [None]:
# turn the (NS, year) index levels into ordinary columns so that they can be
# referenced by name when plotting
aggNSTable = aggNS.reset_index()
aggNSTable.head()

In [None]:
# slightly smaller fonts, so that four panels fit in one figure
sns.set(font_scale=0.9)
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(7, 5))

# margins and inter-panel gaps of the figure just created
fig.subplots_adjust(left=0.20, bottom=0.2, right=0.9, top=0.9, hspace=0.5, wspace=0.5)

# top row: titled legends, x-axis label blanked
g = sns.lineplot(data=aggNSTable, x='year', y='Flu_rate', hue='NS', ax=axes[0, 0])
g.legend(title='Flu', loc='center left')
g.set_xlabel('')

g = sns.lineplot(data=aggNSTable, x='year', y='Vom_rate', hue='NS', ax=axes[0, 1])
g.legend(title='Vom')
g.set_xlabel('')

# bottom row: seaborn's default legend and axis labels
sns.lineplot(data=aggNSTable, x='year', y='Diarr_rate', hue='NS', ax=axes[1, 0])
sns.lineplot(data=aggNSTable, x='year', y='Gastro_rate', hue='NS', ax=axes[1, 1]);


<br>
<br>

## Influenza Data


### Grouping and aggregating

The total populations and total influenza consultations per date

In [None]:
# Per-date totals.  The aggregations are named by string: passing the builtin `sum`
# is deprecated in recent pandas releases.
flu = inhours.groupby(by='date').agg(Total=('POP', 'sum'), Flu_OBS=('Flu_OBS', 'sum'))

<br>

### Compute rates

The influenza consultation rate per 100,000

In [None]:
# consultations per 100,000 population, date by date
flu.loc[:, 'Flu_rate'] = 100000 * (flu['Flu_OBS'] / flu['Total'])

<br>

Each date's year

In [None]:
# the index holds the dates; take the calendar year of each as a plain array
flu.loc[:, 'year'] = flu.index.year.to_numpy()

<br>

Finally

In [None]:
# per date: total population, influenza consultations, rate per 100,000, and year
flu

<br>
<br>

### Plot National Weekly Rates

In [None]:
# line chart of the rate against the date index
flu.plot(
    kind='line',
    y='Flu_rate',
    figsize=(4.1, 2.6),
    title='\nNational Weekly Influenza Rates\n',
    xlabel='',
    ylabel='influenza rate',
)

# vertical tick labels, so that the dates do not overlap
plt.xticks(rotation=90)

# a single series needs no legend: create it, then hide it
plt.legend(labels=['influenza rate']).set_visible(False);

<br>
<br>

## Season Start Detection


### Finding the minimum

<br>

The `when_min` function returns the index of the last occurrence of the minimum value of a series.

In [None]:
def when_min(x: pd.Series):
    """
    
    :param x: a time series whose index is a date index
    :return: the index label of the last occurrence of the minimum value of x,
             or None if x has no such value
    """
    
    # the smallest value of the series
    lowest = x.min()
    
    # blank out everything except the occurrences of the smallest value
    only_lowest = x.where(x.eq(lowest))
    
    # the label of the last remaining value
    return only_lowest.last_valid_index()

<br>

Ensure that the index is sorted in ascending order.

In [None]:
# sort by date (ascending); reassign instead of mutating in place
flu = flu.sort_index()

<br>

Hence, the date of each year's minimum value

In [None]:
# per year, the date (not the value) at which Flu_rate is last at its minimum
minimum_value_dates = flu.groupby('year').agg(
    Flu_rate=pd.NamedAgg(column='Flu_rate', aggfunc=when_min))
minimum_value_dates

<br>
<br>

### Simple Season Start Detection

In [None]:
def season_start(x: pd.Series):
    """
    
    :param x: a time series whose index is a date index
    :return: the earliest index label, after the minimum point, whose value is greater
             than twice the minimum point value
    """
    
    # where the series is last at its minimum
    trough_at = when_min(x=x)
    
    # the level that marks a season start: twice the minimum point value
    threshold = 2 * x[trough_at]
    
    # the part of the series that lies strictly after the minimum point
    after_trough = x.iloc[x.index > trough_at]
    
    # the index labels, after the minimum point, whose values exceed the threshold
    exceeding = after_trough.index[after_trough > threshold]
    
    # the earliest of those labels
    return exceeding.min()
    
    

In [None]:
# per year, the first date after the minimum at which Flu_rate exceeds twice the minimum
season_start_dates = flu.groupby('year').agg(
    Season_start=pd.NamedAgg(column='Flu_rate', aggfunc=season_start))
season_start_dates