# Data Wrangling for organizations.csv
This notebook processes the `organizations.csv` by performing the following steps:
- WE JUST NEED TO RENAME COLUMNS HERE

In [1]:
import pandas as pd
import numpy as np
import yaml
from pprint import pprint

from IPython.display import display, JSON

In [2]:
# Standardizing Column Names
def func_rename_and_cast_columns(df, column_mappings):
    '''
    Applies column_mappings to the columns of input dataframe

    df: Input dataframe
    column_mappings: dictionary containing mappings for column names        
    '''
    list_unnormalised_colnames = df.columns.to_list()
    for unnormalised_colname in list_unnormalised_colnames:
        normalised_colname = column_mappings[unnormalised_colname]['normalised_colname']
        col_type = column_mappings[unnormalised_colname]['type']
        print(f"renaming column: {unnormalised_colname} to {normalised_colname} and casting type to: {col_type}")
        df.rename(columns={unnormalised_colname: normalised_colname}, inplace=True)
        if col_type == 'str':
            # Fill NAs and empty strings to "unknowns"
            if df[normalised_colname].isnull().any() or df[normalised_colname].isna().any():
                print(f"column: {normalised_colname} contains null values thus filling with unknown")
                df[normalised_colname] = df[normalised_colname].fillna("Unknown")
            df[normalised_colname] = df[normalised_colname].astype(str)
            print(f"{normalised_colname} casted to type(str)")
        if col_type == 'datetimestamp':
            # df[normalised_colname] = pd.to_datetime(df[normalised_colname], errors='coerce')
            df[normalised_colname] = pd.to_datetime(df[normalised_colname], errors='coerce') 
            df[normalised_colname] = df[normalised_colname].dt.tz_localize(None) # Retaining source timeformat
            print(f"{normalised_colname} casted to datetime")
        if col_type == 'numeric':
            df[normalised_colname] = pd.to_numeric(df[normalised_colname], errors='coerce')
            print(f"{normalised_colname} casted to numeric")
        print('-'*5)
    
    return df 

In [3]:
# Load the dataset
filepath_csv = '../../data/raw_data/organizations.csv' # Read CSV 
output_path = '../../data/processed_data/processed_organizations.csv' # Write processed CSV
filepath_yaml = '../../config/organizations.yaml' # Read encounters.yaml, it is used to clean column names and apply relevant types to columns

# Load Dataframe
df = pd.read_csv(filepath_csv)
# Load YAML column mappings 
with open(filepath_yaml, "r") as file:
    dict_column_mappings = yaml.safe_load(file)
    
# Display initial dataset information
print('Initial Dataset Info:')
df.info()
df.head(5)

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           272 non-null    object 
 1   NAME         272 non-null    object 
 2   ADDRESS      272 non-null    object 
 3   CITY         272 non-null    object 
 4   STATE        272 non-null    object 
 5   ZIP          272 non-null    int64  
 6   LAT          272 non-null    float64
 7   LON          272 non-null    float64
 8   PHONE        272 non-null    object 
 9   REVENUE      272 non-null    float64
 10  UTILIZATION  272 non-null    int64  
dtypes: float64(3), int64(2), object(6)
memory usage: 23.5+ KB


Unnamed: 0,Id,NAME,ADDRESS,CITY,STATE,ZIP,LAT,LON,PHONE,REVENUE,UTILIZATION
0,74ab949d-17ac-3309-83a0-13b4405c66aa,Fitchburg Outpatient Clinic,881 Main Street,Fitchburg,MA,1420,42.586487,-71.80521,978-342-9781 Or 978-342-9781,0.0,66
1,faffaf6a-ee1a-3673-b0b0-421a9c249244,ACTIVATED BY WELLNESS LLC,66 WASHINGTON ST,STOUGHTON,MA,20722571,42.144158,-71.103783,6178719807,0.0,34
2,17a4bae5-8b64-34d7-8144-b428be027bd0,NURSE ON CALL,512 MAIN STREET SUITE 211,SHREWSBURY,MA,15456406,42.29511,-71.718085,5088451232,0.0,5
3,e09d4c49-c2ef-3b0f-9a46-3719d9219306,UMASS MEMORIAL HEALTHALLIANCE CLINTON HOSPITAL...,60 HOSPITAL RD,LEOMINSTER,MA,14533290,42.540319,-71.76313,9784662000,0.0,30
4,c241b977-4131-32e4-9957-e0a00b2a1e5f,DUFFY HEALTH CENTER,94 MAIN STREET,HYANNIS,MA,26013146,41.65628,-70.27448,5087719599,0.0,1


In [4]:
# pprint(dict_column_mappings)
display(JSON(dict_column_mappings))

<IPython.core.display.JSON object>

# Data Cleaning and Data Quality Checks

1. Just rename columns here

In [5]:
df = func_rename_and_cast_columns(df, dict_column_mappings['columns'])
df.head(5)

renaming column: Id to organization_id and casting type to: str
organization_id casted to type(str)
-----
renaming column: NAME to organization_name and casting type to: str
organization_name casted to type(str)
-----
renaming column: ADDRESS to address and casting type to: str
address casted to type(str)
-----
renaming column: CITY to city and casting type to: str
city casted to type(str)
-----
renaming column: STATE to state and casting type to: str
state casted to type(str)
-----
renaming column: ZIP to zip and casting type to: str
zip casted to type(str)
-----
renaming column: LAT to lat and casting type to: float
-----
renaming column: LON to lon and casting type to: float
-----
renaming column: PHONE to phone and casting type to: str
phone casted to type(str)
-----
renaming column: REVENUE to revenue and casting type to: float
-----
renaming column: UTILIZATION to utilization and casting type to: str
utilization casted to type(str)
-----


Unnamed: 0,organization_id,organization_name,address,city,state,zip,lat,lon,phone,revenue,utilization
0,74ab949d-17ac-3309-83a0-13b4405c66aa,Fitchburg Outpatient Clinic,881 Main Street,Fitchburg,MA,1420,42.586487,-71.80521,978-342-9781 Or 978-342-9781,0.0,66
1,faffaf6a-ee1a-3673-b0b0-421a9c249244,ACTIVATED BY WELLNESS LLC,66 WASHINGTON ST,STOUGHTON,MA,20722571,42.144158,-71.103783,6178719807,0.0,34
2,17a4bae5-8b64-34d7-8144-b428be027bd0,NURSE ON CALL,512 MAIN STREET SUITE 211,SHREWSBURY,MA,15456406,42.29511,-71.718085,5088451232,0.0,5
3,e09d4c49-c2ef-3b0f-9a46-3719d9219306,UMASS MEMORIAL HEALTHALLIANCE CLINTON HOSPITAL...,60 HOSPITAL RD,LEOMINSTER,MA,14533290,42.540319,-71.76313,9784662000,0.0,30
4,c241b977-4131-32e4-9957-e0a00b2a1e5f,DUFFY HEALTH CENTER,94 MAIN STREET,HYANNIS,MA,26013146,41.65628,-70.27448,5087719599,0.0,1


In [6]:
# Check for duplicate rows
print(f"Total length of dataframe BEFORE removing duplicates: {len(df)}")
df = df.drop_duplicates()
print(f"Total length of dataframe AFTER removing duplicates: {len(df)}")


Total length of dataframe BEFORE removing duplicates: 272
Total length of dataframe AFTER removing duplicates: 272


In [7]:
# Save the cleaned and transformed dataset
df.to_csv(output_path, index=False)
print(f'Processed data saved to {output_path}')

Processed data saved to ../../data/processed_data/processed_organizations.csv
