In [None]:
import pandas as pd
import janitor

from utils import data_path, list_data_files

# Loading Stage Documentation

### The loading stage focuses on ensuring data is correctly ingested with proper data types and meaningful column names.

**Best Practices Followed:**
1. **Defining Data Types:**
   - A `dtype_dict` is created to explicitly specify data types for each column in the dataset. This improves memory efficiency and ensures correct data interpretation.
   - Examples include reading the raw `GENDER` column as a categorical variable and the raw `BMI` column as a float for precise numerical analysis (they become `gender` and `bmi` after renaming).

2. **Using a Column Lookup Table:**
   - A dictionary (`column_lookup`) is used to rename columns to more descriptive and meaningful names. This makes the dataset easier to understand and work with.

3. **Efficient Loading:**
   - The dataset is loaded using `pd.read_csv` with the `dtype_dict`, minimizing post-load type conversions and errors.

4. **Validation:**
   - The data types and the first few rows are printed to verify successful loading and renaming.



In [None]:
import os

# Print the list of data files so the index used below can be checked by eye
data_files = list_data_files()
print(data_files)

# Define the path to the raw data file.
# os.path.join inserts the separator of the current OS instead of a
# hard-coded "\\", so the notebook also runs outside Windows.
# NOTE(review): index 1 assumes a stable ordering from list_data_files() - confirm.
raw_data_path = os.path.join(data_path(), data_files[1])

In [None]:
# Explicit dtype for every column, keyed by the column name as it appears in the CSV
dtype_dict = {
    "MEMBER_CODE": "int64",   # de-identified member ID, read as an integer
    "Age": "int64",           # age of the member
    "GENDER": "category",     # gender, read as a categorical
    "POLICY_NO": "int64",     # policy number, read as an integer
    "CMS_Score": "int64",     # Charlson comorbidity index score, read as an integer
    "ICD_CODE": "category",   # ICD-10 code, read as a categorical
    "ICD_desc": "string",     # ICD-10 description text
    "City": "string",         # city text; missing values are handled in a later step
    "CLAIM_TYPE": "category", # claim type, read as a categorical
    "BMI": "float64",         # BMI, read as a float
}

# Raw CSV column name -> descriptive snake_case name used from here on
column_lookup = {
    "MEMBER_CODE": "member_code",
    "Age": "age",
    "GENDER": "gender",
    "POLICY_NO": "policy_number",
    "CMS_Score": "cms_score",
    "ICD_CODE": "icd_code",
    "ICD_desc": "icd_description",
    "City": "city",
    "CLAIM_TYPE": "claim_type",
    "BMI": "bmi",
}

# Read the CSV with the dtypes above and apply the new column names in one chain
raw_data = pd.read_csv(raw_data_path, dtype=dtype_dict).rename(columns=column_lookup)

# Confirm the load: dtypes first, then a preview of the leading rows
print(raw_data.dtypes)
print(raw_data.head())

# Handling Missing Data and Duplicates Documentation

### The second stage focuses on ensuring data integrity by handling missing values and duplicates.

**Best Practices Followed:**
1. **Identifying Missing Data:**
   - A summary of missing values is generated to identify columns with missing entries.

2. **Handling Missing Data:**
   - Missing values in the `city` column are replaced with "Unknown" as an example strategy.
   - Other strategies can include imputation or dropping rows/columns based on context.

3. **Checking for Duplicates:**
   - Duplicate rows are identified, counted, and removed to ensure data uniqueness and prevent bias.

4. **Validation:**
   - After handling missing data and duplicates, the data is re-checked to confirm integrity.


In [None]:
# Count the missing entries in each column and report them
missing_summary = raw_data.isna().sum()
print("Missing Values Summary:\n", missing_summary)

# Replace missing city values with the placeholder 'Unknown'
raw_data["city"] = raw_data["city"].fillna("Unknown")

# Flag rows that repeat an earlier row; the mask is computed once and reused
# for both the count and the listing below
duplicate_mask = raw_data.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicate_count}")

# List the duplicated rows only when there is something to show
if duplicate_count > 0:
    print("\nDuplicate rows:\n", raw_data[duplicate_mask])

# Drop the repeated rows
raw_data = raw_data.drop_duplicates()

# Preview the data after filling missing cities and removing duplicates
print("\nData after handling missing values and duplicates:\n", raw_data.head())


# Cleaning City Names Documentation

The city-name cleaning stage ensures consistency and standardization in the dataset by processing city names. This involves handling variations in formatting, capitalization, and spelling. Consistent city names are crucial for accurate analysis, grouping, and reporting.

**Best Practices Followed:**
1. **Normalization Process:**
   - A function `normalize_city_name` is created to handle city name inconsistencies. This function:
     - Strips leading and trailing whitespace.
     - Converts names to title case (e.g., "new york" becomes "New York").
     - Properly formats hyphenated names (e.g., "los-angeles" becomes "Los-Angeles").

2. **Creating a Lookup Table:**
   - After normalizing city names, a lookup table is generated. The structure of the table is as follows:
     - `clean_city`: The standardized city name.
     - `list_of_variants`: A list of raw or uncleaned city names that map to the standardized name.
   - This table ensures traceability and provides a reference for re-mapping in future processes.

3. **Exporting the Lookup Table:**
   - The lookup table is saved as a CSV file (`city_lookup.csv`) for reuse in downstream processes and documentation purposes.


**Benefits of This Approach:**
- Improved consistency in city-related analysis.
- Enhanced traceability with the `list_of_variants` column.
- Reusability of the lookup table across multiple datasets or reports.

---

In [None]:
import re

# Function to normalize city names
def normalize_city_name(name):
    """
    Normalize a city name to ensure consistent formatting.

    Leading and trailing whitespace is removed, then every word is
    capitalized (first character upper-cased, the rest lower-cased).
    Words are delimited by runs of whitespace and by hyphens; the
    delimiters themselves are kept exactly as they appear in the input.

    Parameters:
        name (str): The original city name. A non-string value (for
            example None or a missing-value marker) is returned unchanged.

    Returns:
        str: The cleaned and normalized city name.

    Examples:
        >>> normalize_city_name("  new york ")
        'New York'
        >>> normalize_city_name("los-angeles")
        'Los-Angeles'
        >>> normalize_city_name("st. louis")
        'St. Louis'
    """
    # Pass missing / non-string values through instead of failing on .strip()
    if not isinstance(name, str):
        return name

    # The capturing group makes re.split return the separators too, so joining
    # the parts back together reproduces the original spacing and hyphens.
    parts = re.split(r'(\s+|-)', name.strip())

    # Capitalize every part. Separators contain no cased characters, so they
    # pass through unchanged, while words holding punctuation or digits
    # ("st.", "o'brien") are normalized as well; an isalpha() guard would
    # leave those untouched and produce extra, inconsistent variants.
    return ''.join(part.capitalize() for part in parts)


# Normalize city names in the raw_data DataFrame
raw_data['clean_city'] = raw_data['city'].apply(normalize_city_name)


# Create a lookup table: one row per cleaned name, with every raw spelling
# that maps to it.
# groupby(...).unique() produces an array object per group; each one is
# converted to a plain Python list so the column really holds lists and is
# written to the CSV as a list literal rather than an array repr.
city_lookup = (
    raw_data.groupby('clean_city')['city']
    .unique()
    .map(list)
    .reset_index()
    .rename(columns={'city': 'list_of_variants'})
)

# Save the lookup table to a CSV file
city_lookup.to_csv('city_lookup.csv', index=False)

# Verify the lookup table
print("\nCity Lookup Table:\n", city_lookup)


In [67]:
# Display the data directory returned by data_path() as the cell output
data_path()

'c:\\Users\\Raed\\Documents\\P_projects\\Lean_project\\data'

In [69]:
import os

# Drop the raw 'city' column before saving; 'clean_city' stays in the data
intermediate_data = raw_data.drop(columns=['city'])

# Specify where to save the intermediate data.
# os.path.join inserts the separator of the current OS instead of the
# hard-coded '\\', so the path is also valid outside Windows.
intermediate_dir = os.path.join(data_path(), 'intermediate')
intermediate_data_path = os.path.join(intermediate_dir, 'intermediate_data.parquet')

# Create the target folder if it is missing so the write below does not fail
os.makedirs(intermediate_dir, exist_ok=True)

# Save the intermediate data to a parquet file
intermediate_data.to_parquet(intermediate_data_path, index=False)