In [5]:
import pandas as pd
import re
from utils import data_path
# Build the path of the intermediate parquet file (note: uses Windows-style '\\' separators)
intermediate_data_path = data_path() + '\\intermediate'+'\\intermediate_data.parquet'

# Load the intermediate data from the parquet file
intermediate_data = pd.read_parquet(intermediate_data_path)

# Data Transformation and Feature creation

1. **Creating data features**
   - unique identifier (using policy number, member code, age, and gender)
   - numerical to categorical
      - age -> using 10 year intervals
      - bmi_cat -> logical groups:
         - underweight [<18.5], 
         - healthy [18.5 to <25], 
         - overweight [25 to <30], 
         - obesity [>= 30]
   - Creating an obesity_cat  
      - class 1 [30 to <35]
      - class 2 [35 to <40]
      - class 3 [>= 40]
   - Major city (top 5 cities by unique identifier count)

2. **Normalization Process:**
Cleaning city names documentation

The cleaning city names stage focuses on ensuring consistency and standardization in the dataset by processing city names. This involves handling variations in formatting, capitalization, and spelling. Consistent city names are crucial for accurate analysis, grouping, and reporting.

   - A function `normalize_city_name` is created to handle city name inconsistencies. This function:
     - Strips leading and trailing whitespace.
     - Converts names to title case (e.g., "new york" becomes "New York").
     - Properly formats hyphenated names (e.g., "los-angeles" becomes "Los-Angeles").
     - Create a lookup table and store it for reuse in downstream processes and documentation purposes.

3. **Converting Claims type into indicator**
   -  A function `convert_to_indicator_and_remove_duplicates` is created to convert the claim type into an indicator for 'I' values and to remove the duplicate rows.

4. **Extracting Feature Tables**
   - creating diabetes type, comorbidity, and diabetes feature tables by unique_id
   - create family size table
   - unique identifier table



In [None]:
# Creating categorical variables for `age` and `bmi`

## Feature Engineering: Age and BMI Groups

# Age bands: half-open 10-year intervals [0, 10), [10, 20), ..., [110, 120)
age_edges = list(range(0, 121, 10))
age_labels = [f"{lower}-{lower + 9}" for lower in age_edges[:-1]]
intermediate_data['age_cat'] = pd.cut(
    intermediate_data['age'], bins=age_edges, labels=age_labels, right=False
)

unbounded = float('inf')

# BMI bands: left-closed intervals split at 18.5, 25 and 30
bmi_edges = [-unbounded, 18.5, 25, 30, unbounded]
bmi_labels = ['Underweight', 'Healthy', 'Overweight', 'Obesity']
intermediate_data['bmi_cat'] = pd.cut(
    intermediate_data['bmi'], bins=bmi_edges, labels=bmi_labels, right=False
)

# Obesity classes: everything below 30 is 'Not Obese', then classes split at 35 and 40
obesity_edges = [-unbounded, 30, 35, 40, unbounded]
obesity_labels = ['Not Obese', 'Class 1 Obesity', 'Class 2 Obesity', 'Class 3 Obesity']
intermediate_data['obesity_cat'] = pd.cut(
    intermediate_data['bmi'], bins=obesity_edges, labels=obesity_labels, right=False
)

# Report how many rows fall into each of the new categories
for heading, category_column in (
    ("Age Categories Distribution:", 'age_cat'),
    ("BMI Categories Distribution:", 'bmi_cat'),
    ("Obesity Categories Distribution:", 'obesity_cat'),
):
    print(heading)
    print(intermediate_data[category_column].value_counts())


In [None]:

# assigning unique ID to individuals

def assign_unique_id(intermediate_data):
    """
    Assigns a unique ID to individuals by grouping based on policy_number, member_code, gender, and age.

    Rows that share the same values in all four key columns receive the same ID.
    The ID is the string form of the zero-based group number.

    Parameters:
    intermediate_data (DataFrame): The DataFrame containing the intermediate data.
        It is not modified; the ID is added to a copy.

    Returns:
    DataFrame: A copy of the input with an additional column 'unique_id'.
    """
    key_columns = ['policy_number', 'member_code', 'gender', 'age']

    # Work on a copy so the caller's frame is left untouched and the cell can be re-run safely.
    result = intermediate_data.copy()

    # dropna=False keeps rows with a missing key in their own group. With the default
    # (dropna=True) such rows get no group number, ngroup() becomes float and the string
    # conversion produces ids such as '12.0' / 'nan'.
    result['unique_id'] = (
        result.groupby(key_columns, observed=True, dropna=False)
        .ngroup()
        .astype(str)  # Convert the unique ID to a string
    )
    return result

# Assign a unique ID for individuals by grouping; rebind the name to the returned
# frame, which carries the new `unique_id` column used by the cells below
intermediate_data = assign_unique_id(intermediate_data)


In [None]:
# Function to normalize city names
def normalize_city_name(name):
    """
    Normalize city names to ensure consistent formatting.

    Parameters:
        name (str): The original city name. Non-string values (e.g. None / NaN)
            are returned unchanged instead of raising.

    Returns:
        str: The cleaned and normalized city name.
        - Strips leading and trailing whitespace.
        - Capitalizes the first letter of each word and lower-cases the rest,
          treating whitespace and hyphens as word boundaries (separators are kept as-is).
    """
    # Missing values have no .strip(); pass them through so .apply() does not fail.
    if not isinstance(name, str):
        return name

    # The capturing group keeps the separators in the split result so they can be rejoined verbatim.
    parts = re.split(r'(\s+|-)', name.strip())

    # Capitalize every part: separators are unaffected by capitalize(), and words that
    # contain digits or punctuation (e.g. "HA'IL") are normalized like any other word.
    return ''.join(part.capitalize() for part in parts)


# Normalize city names in the intermediate_data DataFrame (adds a 'clean_city' column)
intermediate_data['clean_city'] = intermediate_data['city'].apply(normalize_city_name)

# Create a lookup table: one row per normalized name, holding the array of
# original spellings that map to it
city_lookup = (
    intermediate_data.groupby('clean_city')['city']
    .unique()
    .reset_index()
    .rename(columns={'city': 'list_of_variants'})
)

# Save the lookup table to a CSV file (note: Windows-style relative path)
city_lookup.to_csv('..\\data\\lookup\\city_lookup.csv', index=False)

# Verify the lookup table
print("\nCity Lookup Table:\n", city_lookup)


In [None]:

def convert_to_indicator_and_remove_duplicates(data, column):
    """
    Convert a specified column to indicator columns and remove redundant duplicate rows.

    One 0/1 column named ``f"{column}_{value}"`` is created per distinct non-null value
    of ``column``. Rows that are identical in every other column are then collapsed
    into a single row whose indicators are the maximum over the collapsed rows, so an
    indicator is 1 if ANY of the duplicate rows had that value (independent of row order).

    Parameters:
    data (DataFrame): The input DataFrame containing the data. It is not modified.
    column (str): The column name to convert to indicator columns.

    Returns:
    DataFrame: The updated DataFrame with indicator columns and duplicates removed.
    """
    data = data.copy()

    # Create indicator columns for the specified column values. Nulls are skipped:
    # a null never equals anything, so its indicator would be meaningless.
    indicator_columns = []
    for value in data[column].dropna().unique():
        indicator_column_name = f"{column}_{value}"
        # isin() yields a plain boolean (False for missing entries), safe to cast to int
        data[indicator_column_name] = data[column].isin([value]).astype(int)
        indicator_columns.append(indicator_column_name)

    # Remove the original column
    data = data.drop(columns=[column])

    # Every column except the indicators created above identifies a "duplicate" row.
    # Tracking the created names avoids mistaking an unrelated pre-existing column
    # that merely shares the prefix for an indicator.
    common_columns = [col for col in data.columns if col not in indicator_columns]

    if common_columns and indicator_columns:
        # Propagate each indicator across its duplicate group BEFORE dropping rows;
        # otherwise drop_duplicates keeps whichever row comes first and the flags
        # carried by the discarded rows are lost.
        group_max = (
            data.groupby(common_columns, dropna=False, observed=True, sort=False)[indicator_columns]
            .transform('max')
        )
        # transform() preserves row order; assign positionally to be safe with non-unique indexes
        data[indicator_columns] = group_max.to_numpy()

    # Drop duplicate rows based on all columns except the new indicator columns
    data = data.drop_duplicates(subset=common_columns)

    return data


# Convert the 'claim_type' column to indicator columns and remove redundant duplicate rows.
# The 'claim_type_O' indicator is dropped afterwards, leaving the remaining indicator column(s).
# NOTE(review): drop() raises KeyError if no row has claim_type 'O' — confirm that value is always present.
intermediate_data_converted_data = convert_to_indicator_and_remove_duplicates(intermediate_data, 'claim_type').drop(columns=['claim_type_O'])

# Print the updated DataFrame
print("Updated DataFrame with claim_type indicators and duplicates removed:")
display(intermediate_data_converted_data)

In [None]:
def _pivot_code_presence(table, code_column):
    """Pivot (unique_id, code) rows into one row per unique_id with one count column per code."""
    feature = table.pivot_table(index='unique_id', columns=code_column, aggfunc='size', fill_value=0).reset_index()
    feature.columns.name = None  # Remove the columns name
    return feature


# creating the icd code lookup table
icd_lookup = intermediate_data[['icd_code', 'icd_description']].drop_duplicates()
icd_lookup.to_csv('..\\data\\lookup\\icd_lookup.csv', index=False)

# creating the diabetes type, diabetes complication, and comorbidity feature tables
# The "major" code is the part of the ICD code before the first '.'
intermediate_data['icd_code_major'] = intermediate_data['icd_code'].str.split('.').str[0]
disease_table = intermediate_data[['unique_id', 'icd_code_major', 'icd_code']].drop_duplicates()
major_disease_table = disease_table[['unique_id', 'icd_code_major']].drop_duplicates()

# NOTE(review): the 'E' filters below match an 'E' anywhere in the code, not only as the
# first character — presumably "codes starting with E" is intended; confirm against the data.

# diabetes type
diabetes_type_table = major_disease_table[major_disease_table['icd_code_major'].str.contains('E')]
diabetes_type_feature = _pivot_code_presence(diabetes_type_table, 'icd_code_major')

# diabetes complications: codes containing 'E' that also have a sub-code (a literal '.')
complication_table = disease_table[['unique_id', 'icd_code']].drop_duplicates()
# regex=False matches the dot literally; the previous pattern '\.' relied on an invalid
# string escape sequence, which newer Python versions warn about.
has_e_code = complication_table['icd_code'].str.contains('E')
has_sub_code = complication_table['icd_code'].str.contains('.', regex=False)
diabetes_complication_table = complication_table[has_e_code & has_sub_code]
diabetes_complication_feature = _pivot_code_presence(diabetes_complication_table, 'icd_code')
# The input rows are de-duplicated, so the row sum is the number of distinct codes per unique_id
diabetes_complication_feature['total_complications'] = diabetes_complication_feature.drop(columns=['unique_id']).sum(axis=1)

# comorbidities: every major code that does not contain 'E'
comorbidity_type_table = major_disease_table[~major_disease_table['icd_code_major'].str.contains('E')]
comorbidity_feature = _pivot_code_presence(comorbidity_type_table, 'icd_code_major')
comorbidity_feature['total_comorbidities'] = comorbidity_feature.drop(columns=['unique_id']).sum(axis=1)

# save the feature tables
diabetes_type_feature.to_parquet('..\\data\\feature_store\\diabetes_type_feature.parquet', index=False)
diabetes_complication_feature.to_parquet('..\\data\\feature_store\\diabetes_complication_feature.parquet', index=False)
comorbidity_feature.to_parquet('..\\data\\feature_store\\comorbidity_feature.parquet', index=False)


In [None]:
# create family size feature table
def create_family_size_feature(data):
    """
    Build a 'family_size' table: the number of distinct 'unique_id' values
    found for each ('policy_number', 'member_code') combination.

    Parameters:
    data (DataFrame): Input frame with 'policy_number', 'member_code' and 'unique_id' columns.

    Returns:
    DataFrame: One row per ('policy_number', 'member_code') pair with its 'family_size'.
    """
    group_keys = ['policy_number', 'member_code']
    distinct_id_counts = data.groupby(group_keys)['unique_id'].nunique()
    family_size_data = distinct_id_counts.rename('family_size').reset_index()
    return family_size_data

# usage of the create_family_size_feature function
family_size_table = create_family_size_feature(intermediate_data_converted_data)
display(family_size_table)
# Histogram of the family sizes
family_size_table['family_size'].hist()
# NOTE: this is not the cell's last expression, so its result is computed but not displayed
family_size_table['family_size'].value_counts()

# saving the family size table
family_size_table.to_parquet('..\\data\\feature_store\\family_size_table.parquet', index=False)

In [135]:
# create an identifier table: the distinct (unique_id, policy_number, member_code) combinations
identifier_columns = ['unique_id', 'policy_number', 'member_code']
identifier_table = intermediate_data_converted_data.loc[:, identifier_columns].drop_duplicates()
identifier_table.to_parquet(
    '..\\data\\feature_store\\identifier_table.parquet',
    index=False,
)


In [147]:
# Display the identifier table for a quick visual check
identifier_table

Unnamed: 0,unique_id,policy_number,member_code
0,3183,26730932,1961848012
8,3184,26730932,8238702512
12,3185,26730932,8638816522
13,3186,26730932,9534292522
15,3187,26730932,9762026522
...,...,...,...
173723,3309,26730932,279082000000000
173729,3310,26730932,284042000000000
173733,3311,26730932,288071000000000
173736,3312,26730932,288102000000000


In [143]:
# save the cleaned data to a parquet file
# Swap the raw 'city' column for its normalized counterpart in one chained step
primary_data = (
    intermediate_data_converted_data
    .drop(columns=['city'])
    .rename(columns={'clean_city': 'city'})
)

primary_data.to_parquet('..\\data\\primary\\primary_data.parquet', index=False)

# Other code not used

# Creating Features and storing them in the feature store

## Using encoding (to remove order)
Use the function `get_dummies` for all cities; for major cities, first replace all cities ranked >4 with 'Other'
- City encoding 
- Top Major cities (4  + 'other') encoding
- 

In [156]:
# Group by 'city' and count the number of unique 'unique_id' values
unique_id_counts_by_city = primary_data.groupby('city')['unique_id'].nunique().reset_index()

# Rename the columns for clarity
unique_id_counts_by_city.columns = ['city', 'unique_id_count']

# Print the result
print(unique_id_counts_by_city.sort_values('unique_id_count', ascending=False))

# Get the top 5 cities by unique_id count.
# 'Unknown' is removed BEFORE ranking so that exactly five named cities are selected
# regardless of where 'Unknown' ranks (taking the top 6 and filtering afterwards
# yields six cities whenever 'Unknown' is not among them).
known_city_counts = unique_id_counts_by_city[unique_id_counts_by_city['city'] != 'Unknown']
top_5_cities = known_city_counts.nlargest(5, 'unique_id_count')['city']

# Create the 'major_city' variable: keep the city when it is one of the top 5,
# otherwise label it 'Other' (missing cities also become 'Other')
primary_data['major_city'] = primary_data['city'].where(
    primary_data['city'].isin(top_5_cities), 'Other'
)

cities_table = primary_data[['city', 'major_city']].drop_duplicates()

# Print the updated DataFrame
cities_table

               city  unique_id_count
28           Jeddah             6697
46           Riyadh             5288
11         Alkhobar              889
59          Unknown              840
33           Madina              780
..              ...              ...
1              Afif                2
6        Al Khormah                1
40          Oyaynah                1
47  Riyadh Al-Kabra                1
58           Taroot                1

[64 rows x 2 columns]


Unnamed: 0,city,major_city
0,Riyadh,Riyadh
12,Unknown,Other
16,Jeddah,Jeddah
56,Alkhobar,Alkhobar
236,Makkah,Makkah
...,...,...
95960,Methnab,Other
108639,Ras Tannura,Other
112609,Quaieyyah,Other
152192,Oyaynah,Other


In [148]:
# Inspect the selected major cities.
# NOTE(review): the saved output of this cell has a lower execution count than the cell
# that builds `top_5_cities` and shows upper-case names, so it looks stale — re-run to confirm.
top_5_cities

28      JEDDAH
46      RIYADH
8     ALKHOBAR
33      MADINA
36      MAKKAH
Name: city, dtype: string