In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_columns = None

### Data Import

In [2]:
# Dengue cases in 2023
dengue_2023 = pd.read_csv('../datasets/dengue_cases/2023 weekly-infectious-bulletin_cases.csv', header=1)
# Dengue cases in 2022
dengue_2022 = pd.read_csv('../datasets/dengue_cases/2022 weekly-infectious-bulletin_cases.csv', header=1)
# Dengue cases in 2021
dengue_2021 = pd.read_csv('../datasets/dengue_cases/2021 weekly-infectious-bulletin_cases.csv')
# Dengue cases in 2020
dengue_2020 = pd.read_csv('../datasets/dengue_cases/2020 weekly-infectious-bulletin_cases.csv', header=1)
# Dengue cases in 2019
dengue_2019 = pd.read_csv('../datasets/dengue_cases/2019 weekly-infectious-bulletin_cases.csv', header=1)
# Dengue cases in 2018
dengue_2018 = pd.read_csv('../datasets/dengue_cases/2018 weekly-infectious-bulletin_cases.csv', header=1)
# Dengue serology data in 2023
dengue_serotypes_2023 = pd.read_csv('../datasets/dengue_serology_data/2023_dengue_serotypes.csv')
# Dengue serology data in 2022
dengue_serotypes_2022 = pd.read_csv('../datasets/dengue_serology_data/2022_dengue_serotypes.csv')
# Dengue serology data in 2023
dengue_serotypes_2021 = pd.read_csv('../datasets/dengue_serology_data/2021_dengue_serotypes.csv')
# Dengue serology data in 2023
dengue_serotypes_2020 = pd.read_csv('../datasets/dengue_serology_data/2020_dengue_serotypes.csv')
# Dengue serology data in 2023
dengue_serotypes_2019 = pd.read_csv('../datasets/dengue_serology_data/2019_dengue_serotypes.csv')
# Dengue serology data in 2023
dengue_serotypes_2018 = pd.read_csv('../datasets/dengue_serology_data/2018_dengue_serotypes.csv')

### Functions for cleaning

In [3]:
# Function used to clean dengue data for infections before 2022
def clean_dengue_data(dengue):

    # Rename the columns we're using
    dengue.rename(columns={'Start' : 'date', 
                        'Dengue' : 'dengue_cases'}, 
                       inplace=True)
    
    # Drop the columns we're not using
    dengue = dengue[['date', 'dengue_cases']]
    
    # Convert the date to dateTime format
    dengue['date'] = pd.to_datetime(dengue['date'], dayfirst=True)
    
    return dengue

In [4]:
# Function used to clean dengue data for infections 2022 onwards as MoH changed the CSV format
def clean_dengue_data_2(dengue):

    # Rename the columns we're using
    dengue.rename(columns={'Unnamed: 1' : 'old_date', 
                        'Dengue' : 'dengue_cases'}, 
                       inplace=True)
    
    # Drop the columns we're not using
    dengue = dengue[['old_date', 'dengue_cases']]
    
    # Slice the weekly rangte from dd/mm/yyyy - dd/mm/yyyy format to just the 
    # first date of the range (dd/mm/yyyy)
    dengue['date'] = dengue['old_date'].str.split(pat='-')
    for week in range(len(dengue['date'])):
        dengue['date'][week] = dengue['date'][week][0]
    
    # Convert the date to dateTime format
    dengue['date'] = pd.to_datetime(dengue['date'], dayfirst=True)
    
    # Drop the old_date
    dengue = dengue[['date', 'dengue_cases']]
    
    # Drop the weeks that do not have data (April onwards)
    dengue.dropna(inplace=True)
    
    # Change dengue_cases to int format
    dengue['dengue_cases'] = dengue['dengue_cases'].astype(int)
    
    return dengue

### Clean Dengue Cases Data

In [5]:
# Clean the yearly dengue datasets
dengue_2023 = clean_dengue_data_2(dengue_2023)
dengue_2022 = clean_dengue_data_2(dengue_2022)
dengue_2021 = clean_dengue_data(dengue_2021)
dengue_2020 = clean_dengue_data(dengue_2020)
dengue_2019 = clean_dengue_data(dengue_2019)
dengue_2018 = clean_dengue_data(dengue_2018)

# Error below is due to chain indexing. Not sure how to solve it, but the output comes out fine. 
# Solve it next time if got time

# Can't get the for loop to work. i know im making some silly mistake here..
'''
list_of_dengue_years = [dengue_2018, dengue_2019, dengue_2020, dengue_2021]

for year in list_of_dengue_years:
    year = clean_dengue_data(year)
'''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dengue['date'] = dengue['old_date'].str.split(pat='-')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dengue['date'][week] = dengue['date'][week][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dengue['date'][week] = dengue['date'][week][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.

'\nlist_of_dengue_years = [dengue_2018, dengue_2019, dengue_2020, dengue_2021]\n\nfor year in list_of_dengue_years:\n    year = clean_dengue_data(year)\n'

### Merge dengue cases and serotype information

In [6]:
# Function for merging

def merge_dengue_data(dengue_cases, dengue_serotypes):
    # Create a month column to use as the common column for merging
    dengue_cases['month'] = dengue_cases['date'].dt.month
    
    # Merge using the month column
    dengue_cases = dengue_cases.merge(dengue_serotypes, on='month')
    
    # Drop the month column
    dengue_cases.drop(columns='month', inplace=True)
    
    return dengue_cases

In [7]:
# Merge the dengue cases and dengue serotypes dataframes
dengue_2023 = merge_dengue_data(dengue_2023, dengue_serotypes_2023)
dengue_2022 = merge_dengue_data(dengue_2022, dengue_serotypes_2022)
dengue_2021 = merge_dengue_data(dengue_2021, dengue_serotypes_2021)
dengue_2020 = merge_dengue_data(dengue_2020, dengue_serotypes_2020)
dengue_2019 = merge_dengue_data(dengue_2019, dengue_serotypes_2019)
dengue_2018 = merge_dengue_data(dengue_2018, dengue_serotypes_2018)


### Concatenate the yearly data into a single dataframe

In [8]:
# Use concat to combine all the dengue dataframes together
dengue = pd.concat([dengue_2023, dengue_2022, dengue_2021, dengue_2020, 
                   dengue_2019, dengue_2018], ignore_index=True)

In [9]:
# Check that the concatination is successful
dengue.shape

(273, 6)

In [10]:
# Resetting the index
dengue.set_index('date', inplace=True)

### Export the dengue data

In [12]:
# Export the dengue data into a csv file
dengue.to_csv('../datasets/dengue_data_cleaned.csv', index=True)