In [1]:
import pandas as pd
import re
from utils import data_path
# Path of the intermediate data parquet file, built from the directory returned by data_path()
# NOTE(review): the hard-coded backslash separators make this path Windows-specific — confirm that is intended
intermediate_data_path = data_path() + '\\intermediate'+'\\intermediate_data.parquet'

# Load the intermediate data from the parquet file
intermediate_data = pd.read_parquet(intermediate_data_path)

**Table of contents**<a id='toc0_'></a>    
- 1. [Data Cleaning ](#toc1_)    
  - 1.1. [Convert the claim_type to indicators and remove the duplicate values](#toc1_1_)    
  - 1.2. [Assigning unique identifiers](#toc1_2_)    
  - 1.3. [Standardizing city names](#toc1_3_)    
- 2. [Feature engineering](#toc2_)    
  - 2.1. [Feature engineering (new variables)](#toc2_1_)    
    - 2.1.1. [Age, BMI and Obesity categories](#toc2_1_1_)    
    - 2.1.2. [Unique identifiers](#toc2_1_2_)    
  - 2.2. [Feature engineering (new tables)](#toc2_2_)    
    - 2.2.1. [Diabetes type feature table](#toc2_2_1_)    
    - 2.2.2. [Diabetes complications feature table](#toc2_2_2_)    
    - 2.2.3. [Comorbidity feature table](#toc2_2_3_)    
    - 2.2.4. [Family size feature table](#toc2_2_4_)    
    - 2.2.5. [Unique identifier feature table](#toc2_2_5_)    
- 3. [Create Lookup Tables](#toc3_)    
  - 3.1. [Standardized city name lookup table](#toc3_1_)    
  - 3.2. [ICD 10 code lookup table](#toc3_2_)    
- 4. [Saving the primary data](#toc4_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# 1. <a id='toc1_'></a>[Data Cleaning](#toc1_)  [&#8593;](#toc0_)

## 1.1. <a id='toc1_1_'></a>[Convert the claim_type to indicators and remove the duplicate values](#toc0_)

In [None]:

def convert_to_indicator_and_remove_duplicates(data, column):
    """
    Convert a column to 0/1 indicator columns and collapse rows that differ only in that column.

    One indicator column named ``f"{column}_{value}"`` is created per distinct value of
    ``column``. Rows that are identical on every other column are then merged into a
    single row (the first one is kept), and that row's indicators are set to 1 for
    every value that occurred anywhere in the merged group, so no indicator
    information is lost when the duplicates are dropped.

    Parameters:
    data (DataFrame): The input DataFrame containing the data. It is not modified.
    column (str): The column name to convert to indicator columns.

    Returns:
    DataFrame: A copy of the data without ``column``, with the indicator columns added
    and duplicate rows removed.
    """
    data = data.copy()

    # Create indicator columns for the specified column values.
    # A missing value never equals itself, so its indicator column is all zeros.
    indicator_columns = []
    for value in data[column].unique():
        indicator_column_name = f"{column}_{value}"
        data[indicator_column_name] = (data[column] == value).astype(int)
        indicator_columns.append(indicator_column_name)
    # Two values can format to the same name; keep each indicator name once
    indicator_columns = list(dict.fromkeys(indicator_columns))

    # Remove the original column
    data = data.drop(columns=[column])

    # Track the indicator columns by name rather than by prefix, so that an unrelated
    # column that merely shares the prefix still takes part in duplicate detection.
    indicator_set = set(indicator_columns)
    common_columns = [col for col in data.columns if col not in indicator_set]

    # Propagate every indicator to all rows of its duplicate group before dropping,
    # otherwise only the indicators of the first row in each group would survive.
    if common_columns and indicator_columns:
        group_max = (
            data.groupby(common_columns, dropna=False, observed=True, sort=False)[indicator_columns]
            .transform('max')
        )
        # transform() preserves row order, so assign positionally (safe with duplicate index labels)
        data[indicator_columns] = group_max.to_numpy()

    # Drop duplicate rows based on all columns except the new indicator columns
    data = data.drop_duplicates(subset=common_columns)

    return data


# Replace 'claim_type' with 0/1 indicator columns, collapse the duplicated rows,
# then discard the 'claim_type_O' indicator
intermediate_data = (
    convert_to_indicator_and_remove_duplicates(intermediate_data, 'claim_type')
    .drop(columns=['claim_type_O'])
)

# Show the resulting DataFrame
print("Updated DataFrame with claim_type indicators and duplicates removed:")
display(intermediate_data)

## 1.2. <a id='toc1_2_'></a>[Assigning unique identifiers](#toc0_)

In [3]:

# assigning unique ID to individuals

def assign_unique_id(intermediate_data):
    """
    Assigns a unique ID to individuals by grouping based on policy_number, member_code, gender, and age.

    Every distinct combination of the four key columns receives its own group number,
    stored as a string in a new 'unique_id' column. Missing key values are treated as
    a value of their own (``dropna=False``): without this, all rows with any missing
    key would share a single ID, and the resulting missing group number could turn the
    other IDs into float-formatted strings.

    Parameters:
    intermediate_data (DataFrame): The DataFrame containing the intermediate data.
        The 'unique_id' column is added to this DataFrame in place.

    Returns:
    DataFrame: The same DataFrame with an additional column 'unique_id'.
    """
    group_keys = ['policy_number', 'member_code', 'gender', 'age']
    intermediate_data['unique_id'] = (
        intermediate_data
        .groupby(group_keys, observed=True, dropna=False)
        .ngroup()
        .astype(str)  # Convert the unique ID to a string
    )
    return intermediate_data

# Add the 'unique_id' column: one ID per policy_number / member_code / gender / age combination
intermediate_data = assign_unique_id(intermediate_data)

In [None]:
# Inspect the number of distinct values in each column
intermediate_data.nunique()

## 1.3. <a id='toc1_3_'></a>[Standardizing city names](#toc0_)

In [6]:
# Function to standarize city names
def standarize_city_name(name):
    """
    Standardize a city name to ensure consistent formatting.

    Parameters:
        name (str): The original city name. Non-string values (for example a
            missing value) are returned unchanged instead of raising.

    Returns:
        str: The cleaned and standardized city name.
        - Strips leading and trailing whitespace.
        - Capitalizes the first letter of each word (lower-casing the rest), where
          words are separated by whitespace or hyphens; the separators are kept as-is.
    """
    # Missing or non-text values cannot be stripped/capitalized; pass them through
    if not isinstance(name, str):
        return name

    name = name.strip()  # Strip leading and trailing whitespace
    # Split on whitespace runs and hyphens; the capture group keeps the separators in the list
    parts = re.split(r'(\s+|-)', name)
    # capitalize() leaves the separators untouched, so it is applied to every part.
    # This also covers words containing punctuation or digits (e.g. "st.", "john's").
    return ''.join(part.capitalize() for part in parts)


# Standardize the 'city' values into a new 'clean_city' column (the original 'city' column is kept)
intermediate_data['clean_city'] = intermediate_data['city'].apply(standarize_city_name)



# 2. <a id='toc2_'></a>[Feature engineering](#toc0_)

## 2.1. <a id='toc2_1_'></a>[Feature engineering (new variables)](#toc0_)

### 2.1.1. <a id='toc2_1_1_'></a>[Age, BMI and Obesity categories](#toc0_)

In [None]:
# Feature Engineering: Age, BMI and Obesity Categories
# Bin edges and labels; every interval is closed on the left (right=False below)
age_bins = range(0, 121, 10)
age_labels = [f"{start}-{start + 9}" for start in age_bins[:-1]]
bmi_bins = [-float('inf'), 18.5, 25, 30, float('inf')]
bmi_labels = ['Underweight', 'Healthy', 'Overweight', 'Obesity']
obesity_bins = [-float('inf'), 30, 35, 40, float('inf')]
obesity_labels = ['Not Obese', 'Class 1 Obesity', 'Class 2 Obesity', 'Class 3 Obesity']

# (new column, source column, bin edges, labels) for each categorical variable
category_specs = [
    ('age_cat', 'age', age_bins, age_labels),
    ('bmi_cat', 'bmi', bmi_bins, bmi_labels),
    ('obesity_cat', 'bmi', obesity_bins, obesity_labels),
]

# Create the categorical columns
for new_col, source_col, edges, names in category_specs:
    intermediate_data[new_col] = pd.cut(
        intermediate_data[source_col], bins=edges, labels=names, right=False
    )

# Show how many rows fall into each category
for col in ['age_cat', 'bmi_cat', 'obesity_cat']:
    display(f"{col} Distribution:")
    display(intermediate_data[col].value_counts())
    display()


### 2.1.2. <a id='toc2_1_2_'></a>[Unique identifiers](#toc0_)

In [8]:
# Number of distinct 'unique_id' values seen in each 'clean_city', largest first
ids_per_city = intermediate_data.groupby('clean_city')['unique_id'].nunique()
unique_id_counts_by_city = (
    ids_per_city
    .reset_index(name='unique_id_count')
    .sort_values('unique_id_count', ascending=False)
)

# The five cities with the most unique IDs, leaving out the 'Unknown' placeholder
is_known_city = unique_id_counts_by_city['clean_city'] != 'Unknown'
top_5_cities = unique_id_counts_by_city.loc[is_known_city, 'clean_city'].head(5)

# 'major_city' keeps the city name for the top five and maps every other value to 'Other'
top_5_city_names = top_5_cities.values
intermediate_data['major_city'] = intermediate_data['clean_city'].apply(
    lambda city: city if city in top_5_city_names else 'Other'
)

## 2.2. <a id='toc2_2_'></a>[Feature engineering (new tables)](#toc0_)

In [9]:
# Define a function to generate feature tables
def generate_feature_table(data, condition, group_by_columns, agg_func='size'):
    """
    Build a wide feature table with one row per 'unique_id'.

    Parameters:
    data (DataFrame): Source rows; must contain a 'unique_id' column.
    condition: Boolean mask used to select the rows of `data` that enter the table.
    group_by_columns (str or list): Column(s) whose values become the table's columns.
    agg_func: Aggregation passed to `pivot_table`; the default 'size' counts rows.

    Returns:
    DataFrame: The pivoted table with 'unique_id' as a regular column and
    combinations that do not occur filled with 0.
    """
    selected_rows = data[condition]
    pivoted = selected_rows.pivot_table(
        index='unique_id',
        columns=group_by_columns,
        aggfunc=agg_func,
        fill_value=0,
    )
    return pivoted.reset_index()


### 2.2.1. <a id='toc2_2_1_'></a>[Diabetes type feature table](#toc0_)




In [10]:
# Create disease-related feature tables
# The major ICD category is the part of the code before the dot (e.g. 'E11.9' -> 'E11')
intermediate_data['icd_code_major'] = intermediate_data['icd_code'].str.split('.').str[0]
disease_table = intermediate_data[['unique_id', 'icd_code_major', 'icd_code']].drop_duplicates()
disease_icd_major = disease_table[['unique_id','icd_code_major']].drop_duplicates()

# Diabetes type feature table: one row per unique_id, one column per major code starting with 'E'.
# startswith() anchors the match at the first character (contains('E') would match an 'E'
# anywhere in the code) and na=False keeps the mask strictly boolean when a code is missing.
# NOTE(review): this selects every major code beginning with 'E', not only the diabetes
# range used downstream — confirm the data holds no other 'E' codes.
is_e_major_code = disease_icd_major['icd_code_major'].str.startswith('E', na=False)
diabetes_type_feature = generate_feature_table(disease_icd_major, is_e_major_code, 'icd_code_major')

# Save the diabetes_type_feature table to a parquet file
diabetes_type_feature.to_parquet(data_path() + '\\feature_store'+'\\diabetes_type_feature.parquet', index=False)

In [None]:
# Display the diabetes type feature table
diabetes_type_feature

### 2.2.2. <a id='toc2_2_2_'></a>[Diabetes complications feature table](#toc0_)


In [38]:
# Diabetes complications feature table: one row per unique_id, one column per full
# 'E' code that carries a sub-code (i.e. contains a dot, such as 'E11.9').
# - startswith() anchors the 'E' at the first character of the code.
# - The dot is matched literally with regex=False; the previous pattern '\.' in a
#   non-raw string is an invalid escape sequence that Python warns about.
# - na=False keeps both masks strictly boolean when a code is missing.
is_e_code = disease_table['icd_code'].str.startswith('E', na=False)
has_sub_code = disease_table['icd_code'].str.contains('.', regex=False, na=False)
diabetes_complication_feature = generate_feature_table(
    disease_table,
    is_e_code & has_sub_code,
    'icd_code'
)

# Total number of distinct complication codes recorded per unique_id
diabetes_complication_feature['total_complications'] = diabetes_complication_feature.drop(columns=['unique_id']).sum(axis=1).astype(int)

# Save the diabetes complications feature table
diabetes_complication_feature.to_parquet(data_path() + '\\feature_store'+'\\diabetes_complication_feature.parquet', index=False)

### 2.2.3. <a id='toc2_2_3_'></a>[Comorbidity feature table](#toc0_)

In [41]:
# Comorbidities feature table: one row per unique_id, one column per major ICD code
# that does NOT start with 'E'.
# Missing codes are excluded explicitly: negating a mask that contains missing values
# with `~` raises, and a missing code is not a comorbidity.
major_codes = disease_icd_major['icd_code_major']
is_comorbidity = major_codes.notna() & ~major_codes.str.startswith('E', na=False)
comorbidity_feature = generate_feature_table(disease_icd_major, is_comorbidity, 'icd_code_major')

# Total number of distinct comorbidity codes recorded per unique_id
comorbidity_feature['total_comorbidities'] = comorbidity_feature.drop(columns=['unique_id']).sum(axis=1).astype(int)

# Save the comorbidity feature table to a parquet file
comorbidity_feature.to_parquet(data_path() + '\\feature_store'+'\\comorbidity_feature.parquet', index=False)

### 2.2.4. <a id='toc2_2_4_'></a>[Family size feature table](#toc0_)


In [None]:
# create family size feature table
def create_family_size_feature(data):
    """
    Create a 'family_size' feature by the count of unique identifiers per 'policy_number' and 'member_code'.

    Only combinations of 'policy_number' and 'member_code' that actually occur in the
    data are returned (``observed=True``); with categorical key columns the default
    grouping would otherwise also emit every unobserved combination with a size of 0.

    NOTE(review): the count is taken per (policy_number, member_code) pair — confirm
    that this, rather than per policy_number alone, is the intended family definition.

    Parameters:
    data (DataFrame): The input DataFrame containing 'policy_number', 'member_code'
        and 'unique_id' columns.

    Returns:
    DataFrame: One row per observed (policy_number, member_code) pair with the number
    of distinct 'unique_id' values in a 'family_size' column.
    """
    family_size_data = (
        data.groupby(['policy_number', 'member_code'], observed=True)['unique_id']
        .nunique()
        .reset_index(name='family_size')
    )
    return family_size_data

# Build the family size table from the intermediate data
family_size_table = create_family_size_feature(intermediate_data)

display(family_size_table)

# Save the family size table to the feature store in both Parquet and CSV formats
# NOTE(review): other feature tables in this notebook are written under data_path() — confirm '..\\data' is the same directory
family_size_table.to_parquet('..\\data\\feature_store\\family_size_table.parquet', index=False)
family_size_table.to_csv('..\\data\\feature_store\\family_size_table.csv', index=False)

### 2.2.5. <a id='toc2_2_5_'></a>[Unique identifier feature table](#toc0_)


In [44]:
# Identifier table: the distinct combinations of the ID and demographic columns
identifier_columns = ['unique_id', 'policy_number', 'member_code', 'age_cat', 'age', 'gender']
identifier_table = intermediate_data[identifier_columns].drop_duplicates()

# Highest BMI and highest BMI category recorded for each unique_id
rows_by_id = intermediate_data.groupby('unique_id')
max_bmi = rows_by_id['bmi'].max().reset_index(name='max_bmi')
max_bmi_cat = rows_by_id['bmi_cat'].max().reset_index(name='max_bmi_cat')

# For each unique_id, the major_city value that appears in the most rows
city_row_counts = (
    intermediate_data.groupby(['unique_id', 'major_city'])
    .size()
    .reset_index(name='count')
)
most_frequent_rows = city_row_counts.groupby('unique_id')['count'].idxmax()
max_city = (
    city_row_counts.loc[most_frequent_rows]
    .drop(columns=['count'])
    .rename(columns={'major_city': 'max_major_city'})
)

# Left-join max BMI, max BMI category and most frequent city onto the identifier table
for per_id_feature in (max_bmi, max_bmi_cat, max_city):
    identifier_table = identifier_table.merge(per_id_feature, on='unique_id', how='left')

# Save to Parquet and CSV
identifier_table.to_parquet('..\\data\\feature_store\\identifier_table.parquet', index=False)
identifier_table.to_csv('..\\data\\feature_store\\identifier_table.csv', index=False)


### 2.2.6. <a id='toc2_2_6_'></a>[City complications feature table](#toc0_)


In [45]:
##
## Step 2: Create the city complications data set
##

# Merge the feature tables onto the identifier table (left joins keep every identifier row)
city_comp_df = (
    identifier_table
    .merge(diabetes_type_feature, on='unique_id', how='left')
    .merge(diabetes_complication_feature[['unique_id', 'total_complications']], on='unique_id', how='left')
    .merge(comorbidity_feature[['unique_id', 'total_comorbidities']], on='unique_id', how='left')
)

# Major ICD code columns contributed by the diabetes type table.
# They are selected by name instead of with label slices ('E09':'E14'), which raise
# a KeyError as soon as E09 or E14 does not occur in the data.
icd_major_columns = [col for col in diabetes_type_feature.columns if col != 'unique_id']

# E14 is assigned further down, so make sure the column exists with a proper 0 default
if 'E14' not in city_comp_df.columns:
    city_comp_df['E14'] = 0
    icd_major_columns.append('E14')

# Diabetes codes are the major codes from E09 through E14 (inclusive)
dm_columns = [col for col in icd_major_columns if 'E09' <= col <= 'E14']

# Fill missing values with 0 (rows without a match in a feature table)
columns_to_fill = icd_major_columns + ['total_complications', 'total_comorbidities']
city_comp_df[columns_to_fill] = city_comp_df[columns_to_fill].fillna(0)

# Whether any diabetes code was recorded, and how many distinct ones
city_comp_df['has_icd_dm'] = city_comp_df[dm_columns].gt(0).any(axis=1).astype(int)
city_comp_df['total_dm_icd'] = city_comp_df[dm_columns].sum(axis=1).astype(int)

# Cleaning the data set
city_comp_df['total_complications'] = city_comp_df['total_complications'].astype(int)
city_comp_df['total_comorbidities'] = city_comp_df['total_comorbidities'].astype(int)

# Since this is a data set of patients with diabetes, a row without any diabetes code
# is assigned the type unspecified diabetes mellitus (E14).
# 'has_icd_dm' and 'total_dm_icd' were computed above and keep reflecting the recorded codes.
city_comp_df.loc[city_comp_df['total_dm_icd'] < 1, 'E14'] = 1

# save the data set to the feature store
city_comp_df.to_parquet('..\\data\\feature_store\\city_comp_df.parquet', index = False)


In [None]:
# Summarize the columns, dtypes and non-null counts of the city complications data set
city_comp_df.info()

# 3. <a id='toc3_'></a>[Create Lookup Tables](#toc0_)

## 3.1. <a id='toc3_1_'></a>[Standardized city name lookup table](#toc0_)
## 3.2. <a id='toc3_2_'></a>[ICD 10 code lookup table](#toc0_)

In [None]:
# Lookup tables for the standardized city names and the ICD codes

# City lookup: for every standardized city name, the original spellings it was derived from
variants_by_clean_city = intermediate_data.groupby('clean_city')['city'].unique()
city_lookup = variants_by_clean_city.reset_index().rename(columns={'city': 'list_of_variants'})

# Write the city lookup table to a CSV file
# NOTE(review): 'list_of_variants' holds arrays, so the CSV stores their text representation
city_lookup.to_csv('..\\data\\lookup\\city_lookup.csv', index=False)

# Show the city lookup table
print("\nCity Lookup Table:\n", city_lookup)

# ICD lookup: the distinct (code, description) pairs
icd_lookup_columns = ['icd_code', 'icd_description']
icd_lookup = intermediate_data[icd_lookup_columns].drop_duplicates()

# Write the ICD lookup table to a CSV file
icd_lookup.to_csv('..\\data\\lookup\\icd_lookup.csv', index=False)

# Show the ICD lookup table
print("\nICD Code Lookup Table:\n", icd_lookup)

# 4. <a id='toc4_'></a>[Saving the primary data](#toc0_)

In [47]:
# Primary data: replace the original 'city' column with the standardized 'clean_city' values
primary_data = (
    intermediate_data
    .drop(columns=['city'])
    .rename(columns={'clean_city': 'city'})
)

# Save the primary data in both Parquet and CSV formats
primary_data.to_parquet('..\\data\\primary\\primary_data.parquet', index=False)
primary_data.to_csv('..\\data\\primary\\primary_data.csv', index=False)