In [2]:
import numpy as np
def dummy_npwarn_decorator_factory():
    """Build a decorator that hands back whatever it is given, unchanged."""
    def passthrough(obj):
        return obj
    return passthrough


# Install the stand-in only when this numpy build does not already expose
# `_no_nep50_warning`; an existing attribute is left as it is.
if not hasattr(np, '_no_nep50_warning'):
    np._no_nep50_warning = dummy_npwarn_decorator_factory

In [1]:
import os

import pandas as pd
from utils import data_path
# from ydata_profiling import ProfileReport

In [2]:

# Build the path to the intermediate data file.
# os.path.join picks the right separator for the current OS; the previous
# hand-written '\\' separators only resolved correctly on Windows.
intermediate_data_path = os.path.join(data_path(), 'intermediate', 'intermediate_data.parquet')

# Load the intermediate data from the parquet file
intermediate_data = pd.read_parquet(intermediate_data_path)

# Using ydata-profiling to conduct Data profiling and EDA

In [None]:
# profile = ProfileReport(intermediate_data, minimal=True)

# # saving the report to a file
# profile.to_file('..\\reports\\intermediate_data_profiling.html')



# Creating data features within the data set (age, bmi, ICD category)

## Create age groups 
- age_cat -> 10 year intervals

## Create BMI groups
- bmi_cat -> logical groups:
    underweight [<18.5], 
    healthy [18.5 to <25], 
    overweight [25 to <30], 
    obesity [>= 30]
- obesity_cat -> obesity classes (BMI below 30 is labelled 'Not Obese'):
    class 1 [30 to <35],
    class 2 [35 to <40],
    class 3 [>= 40]

## Extract ICD 10 Category


In [None]:
# Derive categorical features from the numeric `age` and `bmi` columns

## Feature Engineering: Age and BMI Groups

# Age groups: left-closed 10-year intervals, i.e. [0, 10), [10, 20), ... [110, 120)
age_edges = list(range(0, 121, 10))
age_labels = [f"{lower}-{lower + 9}" for lower in age_edges[:-1]]
intermediate_data['age_cat'] = pd.cut(
    intermediate_data['age'], bins=age_edges, labels=age_labels, right=False
)

# BMI-based groupings: column name -> (bin edges, labels).
# Every interval is left-closed (right=False), so e.g. a BMI of exactly 30
# falls into 'Obesity' / 'Class 1 Obesity'.
bmi_groupings = {
    'bmi_cat': (
        [-float('inf'), 18.5, 25, 30, float('inf')],
        ['Underweight', 'Healthy', 'Overweight', 'Obesity'],
    ),
    'obesity_cat': (
        [-float('inf'), 30, 35, 40, float('inf')],
        ['Not Obese','Class 1 Obesity', 'Class 2 Obesity', 'Class 3 Obesity'],
    ),
}
for target_column, (edges, names) in bmi_groupings.items():
    intermediate_data[target_column] = pd.cut(
        intermediate_data['bmi'], bins=edges, labels=names, right=False
    )

# Report how many rows landed in each of the new categories
distribution_reports = (
    ("Age Categories Distribution:", 'age_cat'),
    ("BMI Categories Distribution:", 'bmi_cat'),
    ("Obesity Categories Distribution:", 'obesity_cat'),
)
for heading, category_column in distribution_reports:
    print(heading)
    print(intermediate_data[category_column].value_counts())


In [None]:
# Bare expression: Jupyter renders the DataFrame as this cell's output for visual inspection
intermediate_data

# Creating Features and storing them in the feature store

## Using encoding (to remove order)
Use the function `get_dummies` for all cities; for major cities, first replace every city ranked below the top 5 (by number of unique IDs) with 'Other'.
- City encoding
- Top major cities (5 + 'Other') encoding

In [85]:
# Group by 'city' and count the number of unique 'unique_id' values
unique_id_counts_by_city = intermediate_data.groupby('city')['unique_id'].nunique().reset_index()

# Rename the columns for clarity
unique_id_counts_by_city.columns = ['city', 'unique_unique_id_count']

# Print the result
print(unique_id_counts_by_city.sort_values('unique_unique_id_count', ascending=False))

# Get the top 5 cities by unique_id count.
# The 'Unknown' placeholder is removed BEFORE ranking: taking the 6 largest and
# dropping 'Unknown' afterwards only yields 5 cities when 'Unknown' happens to be
# among those 6, and silently yields 6 otherwise.
known_city_counts = unique_id_counts_by_city[unique_id_counts_by_city['city'] != 'Unknown']
top_5_cities = known_city_counts.nlargest(5, 'unique_unique_id_count')['city']

# Create the 'major_city' variable: keep the city name for the top 5, 'Other' for the rest.
# A set gives constant-time membership checks instead of scanning an array per row.
major_city_names = set(top_5_cities)
intermediate_data['major_city'] = intermediate_data['city'].apply(
    lambda x: x if x in major_city_names else 'Other'
)

# One row per distinct (city, major_city) pair
cities_table = intermediate_data[['city', 'major_city']].drop_duplicates()

# Print the updated DataFrame
print(cities_table)

               city  unique_unique_id_count
28           Jeddah                    6697
46           Riyadh                    5288
11         Alkhobar                     889
59          Unknown                     840
33           Madina                     780
..              ...                     ...
1              Afif                       2
6        Al Khormah                       1
40          Oyaynah                       1
47  Riyadh Al-Kabra                       1
58           Taroot                       1

[64 rows x 2 columns]
                   city major_city
0                Riyadh     Riyadh
12              Unknown      Other
16               Jeddah     Jeddah
56             Alkhobar   Alkhobar
236              Makkah     Makkah
...                 ...        ...
95960           Methnab      Other
108639      Ras Tannura      Other
112609        Quaieyyah      Other
152192          Oyaynah      Other
172095  Riyadh Al-Kabra      Other

[64 rows x 2 columns]


In [90]:
# Bare expression: Jupyter renders the city / major_city table as this cell's output
cities_table

Unnamed: 0,city,major_city
0,Riyadh,Riyadh
12,Unknown,Other
16,Jeddah,Jeddah
56,Alkhobar,Alkhobar
236,Makkah,Makkah
...,...,...
95960,Methnab,Other
108639,Ras Tannura,Other
112609,Quaieyyah,Other
152192,Oyaynah,Other


In [92]:
# Creating city encodings and storing them in the feature store

# Input: `cities_table`, which holds the distinct 'city' / 'major_city' pairs.

# Feature-store directory, built with os.path.join so the separator matches the
# current OS (hand-written '\\' separators only resolve correctly on Windows).
feature_store_dir = os.path.join('..', 'data', 'feature_store')

# NOTE(review): both tables below are written with index=False and contain only the
# indicator columns — confirm that consumers do not need the original city value as a key.

# Dummy encode the 'city' column
city_dummies = pd.get_dummies(cities_table[['city']], columns=['city'], prefix='city')

# Save the city dummies to a Parquet file
city_dummies.to_parquet(os.path.join(feature_store_dir, 'city_dummies.parquet'), index=False)

# Dummy encode the 'major_city' column (one row per distinct major city)
major_city_dummies = pd.get_dummies(cities_table[['major_city']].drop_duplicates(), prefix='major_city')

# Save the major city dummies to a Parquet file
major_city_dummies.to_parquet(os.path.join(feature_store_dir, 'major_city_dummies.parquet'), index=False)

# Display the major-city encoding as the cell output
major_city_dummies

Unnamed: 0,major_city_Alkhobar,major_city_Jeddah,major_city_Madina,major_city_Makkah,major_city_Other,major_city_Riyadh
0,False,False,False,False,False,True
12,False,False,False,False,True,False
16,False,True,False,False,False,False
56,True,False,False,False,False,False
236,False,False,False,True,False,False
1456,False,False,True,False,False,False


In [71]:
# Lookup table: one row per distinct (icd_code, icd_description) pair in the data
icd_lookup_columns = ['icd_code', 'icd_description']
icd_lookup = intermediate_data.loc[:, icd_lookup_columns].drop_duplicates()
icd_lookup

In [None]:
import pandas as pd

# Small hand-written sample of ICD-10 codes wrapped in a one-column DataFrame
data = dict(icd_code=['E11.11', 'A01.0', 'B20.1', 'C34.9', 'D50.0', 'E1'])
example = pd.DataFrame(data)

# Function to extract parts of the ICD-10 code
def extract_icd_parts(icd_code):
    """Split an ICD-10 code into its letter, first-level number and second-level number.

    The code is read as ``<letter><first-level digits>.<second-level digits>``,
    e.g. ``'E11.11'`` -> ``('E', '11', '11')``.

    Parameters
    ----------
    icd_code : str or missing value
        The ICD-10 code. Surrounding whitespace is ignored; non-string values
        are converted with ``str()``.

    Returns
    -------
    pd.Series
        Three values: letter, first-level number, second-level number. Any part
        that is absent from the code is reported as ``'Unknown'``; a missing or
        empty code yields ``'Unknown'`` for all three.
    """
    unknown = 'Unknown'
    if pd.isnull(icd_code):
        return pd.Series([unknown, unknown, unknown])

    code = str(icd_code).strip()
    if not code:
        # Indexing an empty string would raise IndexError
        return pd.Series([unknown, unknown, unknown])

    parts = code.split('.')
    category = parts[0]  # everything before the first dot, e.g. 'E11'

    letter_part = category[0] if category else unknown
    # Take ALL characters after the letter, not just one: 'E11' -> '11'.
    # Reading only code[1] truncated 'E11' to '1' and returned '.' for 'A.1'.
    first_level_number = category[1:] or unknown
    second_level_number = parts[1] if len(parts) > 1 and parts[1] else unknown
    return pd.Series([letter_part, first_level_number, second_level_number])

# Parse every code and store the three resulting parts as new columns
icd_part_columns = ['icd_letter', 'icd_first_level', 'icd_second_level']
example[icd_part_columns] = example['icd_code'].apply(extract_icd_parts)

# Show the frame with the parsed parts attached
print("Updated DataFrame with ICD-10 parts:")
print(example)


# Feature Creation

We will be creating the following features and storing the outcomes in a feature store

In [94]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Requires `intermediate_data` with 'city' and 'icd_code' columns

# Create a matrix where rows represent cities and columns represent ICD-10 codes;
# each cell holds the number of rows for that (city, icd_code) combination.
# NOTE(review): the saved output of this cell shows a warning pointing at this call —
# presumably pandas' FutureWarning about the implicit `observed` default for categorical
# keys. Passing observed=False explicitly keeps the current result; confirm whether
# unobserved categories should be kept.
icd_code_matrix = intermediate_data.pivot_table(
    index='city', columns='icd_code', aggfunc='size', fill_value=0, observed=False
)

# Compute the cosine similarity between cities based on the ICD-10 code distributions
similarity_matrix = cosine_similarity(icd_code_matrix)

# Convert the similarity matrix to a DataFrame labelled by city on both axes
similarity_df = pd.DataFrame(similarity_matrix, index=icd_code_matrix.index, columns=icd_code_matrix.index)

# Print the similarity DataFrame
print("City Similarity Scores based on ICD-10 codes:")
print(similarity_df)

City Similarity Scores based on ICD-10 codes:
city                 Abha      Afif     Aflaj   Al Baha  Al Dwadmi  Al Kharj  \
city                                                                           
Abha             1.000000  0.613813  0.708835  0.949481   0.912564  0.911439   
Afif             0.613813  1.000000  0.358671  0.578594   0.638861  0.566917   
Aflaj            0.708835  0.358671  1.000000  0.809324   0.591896  0.839498   
Al Baha          0.949481  0.578594  0.809324  1.000000   0.893997  0.948068   
Al Dwadmi        0.912564  0.638861  0.591896  0.893997   1.000000  0.817952   
...                   ...       ...       ...       ...        ...       ...   
Unknown          0.957621  0.633060  0.733549  0.977722   0.933682  0.923014   
Wadi Al Dawasir  0.844013  0.625404  0.475547  0.835817   0.851738  0.758396   
Wajh             0.909424  0.583232  0.639436  0.933556   0.893990  0.843564   
Yanbu            0.964018  0.658017  0.737268  0.956375   0.914443  0.9321

  icd_code_matrix = intermediate_data.pivot_table(index='city', columns='icd_code', aggfunc='size', fill_value=0)
