In [None]:
# %pip install ucimlrepo

from ucimlrepo import fetch_ucirepo 


In [5]:
import pandas as pd 
import numpy as np 

In [None]:
# Download the dataset registered under id=296 through ucimlrepo.
# The long name is kept because other cells may refer to it.
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)
_dataset = diabetes_130_us_hospitals_for_years_1999_2008

# Feature and target tables (pandas dataframes); both are consumed by later cells.
X, y = _dataset.data.features, _dataset.data.targets

# Dataset-level metadata first, then the per-variable information.
print(_dataset.metadata)
print(_dataset.variables)


In [6]:
# Build the working frame in a single chain:
#   1. copy the features so X itself is never modified,
#   2. append the target column (named 'readmitted' in this dataset),
#   3. turn the '?' missing-value marker into a real missing value (pd.NA).
df = (
    X.copy()
    .assign(readmitted=y['readmitted'])
    .replace('?', pd.NA)
)

# ENRICHMENT

In [10]:
# Re-seed NumPy's global RNG so that re-running this cell reproduces the same
# synthetic columns. The four draws below must stay in this order, otherwise
# the generated values change.
np.random.seed(42)
n_rows = len(df)

# --- A. Insurance: "Kupot Cholim" market share (Northern District specific) ---
# Source (per the author): Knesset Research & Information Center, North Dist. data.
# Clalit dominates the Northern District.
kupa_market_share = {
    'Clalit': 0.70,
    'Maccabi': 0.12,
    'Leumit': 0.10,
    'Meuhedet': 0.08,
}
kupot_names = list(kupa_market_share)
kupot_probs = list(kupa_market_share.values())
df['Kupa'] = np.random.choice(kupot_names, size=n_rows, p=kupot_probs)

# --- B. Insurance tiers (Shaban) ---
# "Basic" is the mandatory state coverage; "Gold" and "Platinum" are paid upgrades.
# Modelling assumption: in lower socioeconomic (periphery) areas "Basic" might be
# more common, but "Gold" is heavily marketed -> 40% Basic, 40% Gold, 20% Platinum.
shaban_mix = {'Basic': 0.40, 'Gold': 0.40, 'Platinum': 0.20}
Shabans = list(shaban_mix)
Shaban_probs = list(shaban_mix.values())
df['Shaban'] = np.random.choice(Shabans, size=n_rows, p=Shaban_probs)

# --- C. Distance: "Poria catchment area" (km) ---
# Target mean of ~20 km to cover Tiberias (close) plus Golan/Valley (far).
# Gamma(shape=2, scale=10) has mean 2 * 10 = 20 km; its right tail goes past
# 60 km, which is meant to capture patients from Majdal Shams/Katzrin.
# Values are rounded to one decimal place.
distance_draws = np.random.gamma(2, 10, n_rows)
df['Distance_KM'] = np.round(distance_draws, 1)

# --- D. Staffing: "shift roster" ---
# Each row gets a nurse name drawn uniformly from the roster to simulate shift loads.
nurse_roster = ['Nurit', 'Galit', 'Michal', 'Yossi', 'Amir', 'Rana', 'Salim']
df['Assigned_Nurse'] = np.random.choice(nurse_roster, size=n_rows)


# --- VERIFICATION ---
# Sanity-check the synthetic columns: preview a few rows, then compare the
# sampled category shares with the weights used to generate them.

print("--- Poria Operational Context Created ---")
preview_columns = ['Kupa', 'Shaban', 'Distance_KM']
print(df[preview_columns].head())

for heading, column in (
    ("\nKupa Distribution:", 'Kupa'),
    ("\nTier Distribution:", 'Shaban'),
):
    print(heading)
    print(df[column].value_counts(normalize=True))

# Look at the upper percentiles (the "long tail") of distance to see whether
# the far-away Golan Heights patients are represented.
print("\nDistance Percentiles (checking the 'Periphery' spread):")
distance_percentiles = [0.25, 0.5, 0.75, 0.95]
print(df['Distance_KM'].describe(percentiles=distance_percentiles))

--- Poria Operational Context Created ---
       Kupa    Shaban  Distance_KM
0    Clalit     Basic          6.7
1  Meuhedet  Platinum         63.7
2   Maccabi      Gold         20.1
3    Clalit     Basic          5.9
4    Clalit     Basic         20.2

Kupa Distribution:
Kupa
Clalit      0.701167
Maccabi     0.119971
Leumit      0.100151
Meuhedet    0.078710
Name: proportion, dtype: float64

Tier Distribution:
Shaban
Gold        0.399338
Basic       0.399122
Platinum    0.201541
Name: proportion, dtype: float64

Distance Percentiles (checking the 'Periphery' spread):
count    101766.000000
mean         20.046648
std          14.190003
min           0.000000
25%           9.600000
50%          16.800000
75%          27.000000
95%          47.500000
max         132.800000
Name: Distance_KM, dtype: float64
