# Data Wrangling for encounters.csv
This notebook processes `encounters.csv` through the following steps:
- Inspecting and cleaning the data
- Handling missing values and inconsistencies
- Transforming the data (e.g., creating new features)
- Saving the processed data to a new file

In [None]:
import pandas as pd
import numpy as np
import yaml

In [None]:
# Standardizing Column Names
def func_rename_and_cast_columns(df: pd.DataFrame, column_mappings: dict) -> pd.DataFrame:
    '''
    Renames every column of the input dataframe and casts it to the type
    given in column_mappings.

    df: Input dataframe. It is modified in place and also returned.
    column_mappings: dictionary keyed by the current (unnormalised) column
        name. Each value is a dictionary with two keys:
          - 'normalised_colname': the new column name
          - 'type': 'str', 'datetimestamp' or 'numeric'; any other value
            renames the column without casting it

    Casting rules:
      - 'str': missing values are replaced with "Unknown", then the column
        is cast with astype(str). Empty strings are left untouched.
      - 'datetimestamp': parsed with pd.to_datetime(errors='coerce'), so
        unparseable values become NaT; timezone information is then dropped
        while the wall-clock time is kept.
      - 'numeric': parsed with pd.to_numeric(errors='coerce'), so
        unparseable values become NaN.

    Returns the same dataframe object, renamed and cast.

    Raises KeyError if a column of df has no entry in column_mappings, or if
    an entry lacks 'normalised_colname' or 'type'. These lookups all happen
    before the dataframe is modified.
    '''
    source_colnames = df.columns.to_list()

    # Validate up front: failing half-way through the loop would leave the
    # caller's dataframe partially renamed.
    unmapped_colnames = [colname for colname in source_colnames if colname not in column_mappings]
    if unmapped_colnames:
        raise KeyError(f"no column mapping defined for column(s): {unmapped_colnames}")

    # Resolve (old name, new name, type) for every column before touching df.
    cast_plan = [
        (colname, column_mappings[colname]['normalised_colname'], column_mappings[colname]['type'])
        for colname in source_colnames
    ]

    # Rename all columns in ONE call. Renaming one at a time creates duplicate
    # labels whenever a new name equals another column's current name
    # (e.g. a -> b while b -> c), which breaks the per-column casts below.
    df.rename(columns={old: new for old, new, _ in cast_plan}, inplace=True)

    for unnormalised_colname, normalised_colname, col_type in cast_plan:
        print(f"renaming column: {unnormalised_colname} to {normalised_colname} and casting type to: {col_type}")
        if col_type == 'str':
            # Fill missing values first, otherwise astype(str) would turn
            # them into the literal string 'nan'.
            if df[normalised_colname].isna().any():
                print(f"column: {normalised_colname} contains null values thus filling with unknown")
                df[normalised_colname] = df[normalised_colname].fillna("Unknown")
            df[normalised_colname] = df[normalised_colname].astype(str)
            print(f"{normalised_colname} casted to type(str)")
        elif col_type == 'datetimestamp':
            parsed = pd.to_datetime(df[normalised_colname], errors='coerce')
            # Drop the timezone but keep the wall-clock time as in the source.
            df[normalised_colname] = parsed.dt.tz_localize(None)
            print(f"{normalised_colname} casted to datetime")
        elif col_type == 'numeric':
            df[normalised_colname] = pd.to_numeric(df[normalised_colname], errors='coerce')
            print(f"{normalised_colname} casted to numeric")
        print('-'*5)

    return df

In [None]:
# File locations, relative to the notebook's working directory.
filepath_csv = '../../data/raw_data/encounters.csv'  # raw input CSV
filepath_processed_csv = '../../data/processed_data/processed_encounters.csv'  # destination for the processed CSV
filepath_yaml = '../../config/encounters.yaml'  # column-name / column-type mappings used for cleaning

# Load the raw encounters table.
df_encounters = pd.read_csv(filepath_csv)

# Load the YAML column mappings. The encoding is pinned to UTF-8 so the file
# is decoded the same way on every platform instead of with the locale default.
with open(filepath_yaml, "r", encoding="utf-8") as file:
    dict_column_mappings = yaml.safe_load(file)

# Display initial dataset information
print('Initial Dataset Info:')
df_encounters.info()
df_encounters.head(5)

# Data Cleaning and Data Quality Checks

1. Standardise column names
2. Apply relevant types
3. Fill null values with appropriate replacement values
4. Perform a logical consistency check (start_time < end_time)

In [None]:
# Normalise column names and dtypes using the 'encounters' section of the YAML config.
df_encounters = func_rename_and_cast_columns(
    df=df_encounters,
    column_mappings=dict_column_mappings['encounters'],
)
