In [111]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [112]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket "ignore" may also hide pandas chained-assignment
# warnings and model-fitting warnings emitted by later cells — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [113]:
# Location of the raw Access-to-Care extract.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# on another machine without editing this value.
RAW_CSV_PATH = "/Users/riddhibajaj/Documents/VS Code/Projects/Datathon 2026/Access_to_Care_Dataset - Access_to_Care_Dataset.csv"

# `data` keeps the object returned by read_csv; `df` is the working frame
# that later cells filter and model.
data = pd.read_csv(RAW_CSV_PATH)
df = pd.DataFrame(data)

# Print column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26208 entries, 0 to 26207
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   TOPIC              26208 non-null  object 
 1   SUBTOPIC           1404 non-null   object 
 2   SUBTOPIC_ID        1404 non-null   float64
 3   TAXONOMY           26208 non-null  object 
 4   TAXONOMY_ID        26208 non-null  int64  
 5   CLASSIFICATION     26208 non-null  object 
 6   CLASSIFICATION_ID  26208 non-null  int64  
 7   GROUP              26208 non-null  object 
 8   GROUP_ID           26208 non-null  int64  
 9   GROUP_ORDER        26208 non-null  int64  
 10  SUBGROUP           26208 non-null  object 
 11  SUBGROUP_ID        26208 non-null  int64  
 12  SUBGROUP_ORDER     26208 non-null  int64  
 13  NESTING_LABEL      2016 non-null   object 
 14  NESTING_LABEL_ID   2016 non-null   float64
 15  ESTIMATE_TYPE      26208 non-null  object 
 16  ESTIMATE_TYPE_ID   262

In [97]:
# Columns retained in the cleaned extract.
keep_columns = [
    'TOPIC', 'SUBTOPIC', 'TAXONOMY', 'CLASSIFICATION', 'GROUP', 'GROUP_ID',
    'SUBGROUP', 'SUBGROUP_ID', 'NESTING_LABEL', 'TIME_PERIOD',
    'ESTIMATE', 'ESTIMATE_LCI', 'ESTIMATE_UCI',
]

# Keep only the rows whose FLAG is missing, projected onto the retained columns.
unflagged_rows = df['FLAG'].isna()
cleaned_data = df.loc[unflagged_rows, keep_columns]

In [98]:
# Write the cleaned extract to a CSV in the current working directory,
# leaving the DataFrame index out of the file.
cleaned_data.to_csv("cleaned_dataset.csv", index=False)


In [116]:
# Restrict the working frame to rows that carry no FLAG value.
# This rebinds `df`, so every later cell sees only the unflagged rows.
df = df.loc[df['FLAG'].isna()]

In [None]:
# Series selected for the exploratory cells below.
# `subtopic` is compared against the SUBGROUP column by the filtering cell.
topic = 'Angina/angina pectoris'
subtopic = '65 years and older'

# Per (TOPIC, GROUP, SUBGROUP): how many TIME_PERIOD rows exist, the first and
# last period, and the mean / standard deviation of ESTIMATE.
# NOTE(review): 'count' tallies rows, not distinct years — confirm each
# combination has at most one row per TIME_PERIOD.
data_completeness = (
    df.groupby(['TOPIC', 'GROUP', 'SUBGROUP'])
      .agg(
          Years_Count=('TIME_PERIOD', 'count'),
          First_Year=('TIME_PERIOD', 'min'),
          Last_Year=('TIME_PERIOD', 'max'),
          Mean_Estimate=('ESTIMATE', 'mean'),
          Std_Estimate=('ESTIMATE', 'std'),
      )
      .reset_index()
)

# Filter for combinations with at least 4 years of data (needed for prediction)
has_enough_years = data_completeness['Years_Count'].ge(4)
sufficient_data = data_completeness.loc[has_enough_years].copy()

print(f"Total topic-subgroup combinations: {len(data_completeness)}")
print(f"Combinations with 4+ years of data: {len(sufficient_data)}")

print("\nSample of combinations with sufficient data:")
sufficient_data.loc[sufficient_data['TOPIC'].eq(topic)].head(20)

Total topic-subgroup combinations: 4063
Combinations with 4+ years of data: 3928

Sample of combinations with sufficient data:


Unnamed: 0,TOPIC,GROUP,SUBGROUP,Years_Count,First_Year,Last_Year,Mean_Estimate,Std_Estimate
76,Any cancer type,Age groups with 65 years and older,18-34 years,6,2019,2024,1.066667,0.233809
77,Any cancer type,Age groups with 65 years and older,35-49 years,6,2019,2024,4.15,0.398748
78,Any cancer type,Age groups with 65 years and older,50-64 years,6,2019,2024,11.133333,0.44572
79,Any cancer type,Age groups with 65 years and older,65 years and older,6,2019,2024,25.883333,0.54191
80,Any cancer type,Age groups with 75 years and older,18-44 years,6,2019,2024,1.966667,0.136626
81,Any cancer type,Age groups with 75 years and older,45-64 years,6,2019,2024,9.716667,0.33116
82,Any cancer type,Age groups with 75 years and older,65-74 years,6,2019,2024,21.733333,0.722957
83,Any cancer type,Age groups with 75 years and older,75 years and older,6,2019,2024,31.733333,0.891441
84,Any cancer type,Disability status,With disability,6,2019,2024,19.233333,0.621825
85,Any cancer type,Disability status,Without disability,6,2019,2024,8.783333,0.278687


In [118]:
# Isolate the single series to model: rows matching the chosen topic whose
# SUBGROUP equals `subtopic`, restricted to the columns used downstream.
series_columns = ['TOPIC', 'SUBGROUP', 'TIME_PERIOD', 'ESTIMATE', 'ESTIMATE_LCI', 'ESTIMATE_UCI']
filtered_df = (
    df.loc[(df['TOPIC'] == topic) & (df['SUBGROUP'] == subtopic), series_columns]
      # Order by period so each .diff() below is a change between consecutive
      # periods rather than between whatever rows happen to be adjacent.
      .sort_values('TIME_PERIOD')
      # Independent frame: the derived columns are not written onto a slice of `df`.
      .copy()
)

# Change in the estimate relative to the previous row (NaN for the first row).
filtered_df['ESTIMATE_YOY'] = filtered_df['ESTIMATE'].diff()
# Width of the reported confidence interval and its change from the previous row.
filtered_df['CI_WIDTH'] = (filtered_df['ESTIMATE_UCI'] - filtered_df['ESTIMATE_LCI'])
filtered_df['CI_WIDTH_CHANGE'] = filtered_df['CI_WIDTH'].diff()


In [119]:
# Display the filtered series together with its derived change columns.
filtered_df

Unnamed: 0,TOPIC,SUBGROUP,TIME_PERIOD,ESTIMATE,ESTIMATE_LCI,ESTIMATE_UCI,ESTIMATE_YOY,CI_WIDTH,CI_WIDTH_CHANGE
474,Any cancer type,18-34 years,2019,1.3,1.0,1.7,,0.7,
475,Any cancer type,18-34 years,2020,0.8,0.6,1.1,-0.5,0.5,-0.2
476,Any cancer type,18-34 years,2021,1.3,1.0,1.7,0.5,0.7,0.2
477,Any cancer type,18-34 years,2022,1.0,0.7,1.3,-0.3,0.6,-0.1
478,Any cancer type,18-34 years,2023,0.8,0.6,1.1,-0.2,0.5,-0.1
479,Any cancer type,18-34 years,2024,1.2,0.9,1.5,0.4,0.6,0.1


In [124]:
# Modelling frame: period, the two targets and the two derived change features.
change_cols = ['ESTIMATE_YOY', 'CI_WIDTH_CHANGE']
model_data = filtered_df[['TIME_PERIOD', 'ESTIMATE', 'CI_WIDTH'] + change_cols].copy()
# The first row has no predecessor, so its diffs are NaN; treat them as zero change.
model_data[change_cols] = model_data[change_cols].fillna(0)

# Features
x = model_data[['TIME_PERIOD'] + change_cols]

# Targets: the point estimate and the width of its confidence interval
y_est, y_ci = model_data['ESTIMATE'], model_data['CI_WIDTH']

# Train one linear model per target on the same feature matrix
model_est = LinearRegression()
model_est.fit(x, y_est)
model_ci = LinearRegression()
model_ci.fit(x, y_ci)

# Predict next year: one feature row for the period after the last observed
# one, with both change features set to their historical means.
next_period = model_data['TIME_PERIOD'].max() + 1
X_test = pd.DataFrame({
    'TIME_PERIOD': [next_period],
    'ESTIMATE_YOY': [x['ESTIMATE_YOY'].mean()],
    'CI_WIDTH_CHANGE': [x['CI_WIDTH_CHANGE'].mean()],
})

pred_est = model_est.predict(X_test)[0]
pred_ci = model_ci.predict(X_test)[0]

# Centre the predicted interval width on the predicted estimate.
half_width = pred_ci/2
pred_lci = pred_est - half_width
pred_uci = pred_est + half_width

print(f"Linear Regression Predicted ESTIMATE: {pred_est:.2f}")
print(f"Predicted LCI: {pred_lci:.2f}")
print(f"Predicted UCI: {pred_uci:.2f}")

Linear Regression Predicted ESTIMATE: 0.89
Predicted LCI: 0.62
Predicted UCI: 1.16


In [125]:
from statsmodels.tsa.arima.model import ARIMA

# Prepare the series: ESTIMATE indexed by TIME_PERIOD
ts = model_data.set_index('TIME_PERIOD')['ESTIMATE']

# Fit ARIMA model (simple example: ARIMA(p=1,d=0,q=0))
model_arima = ARIMA(ts, order=(1,0,0))
model_arima_fit = model_arima.fit()

# Forecast next year once and read both the point forecast and its interval
# from the same result object. Calling .forecast() and .get_forecast()
# separately computed the same one-step forecast twice.
arima_forecast = model_arima_fit.get_forecast(steps=1)
next_year_forecast = arima_forecast.predicted_mean
pred_est = next_year_forecast.values[0]

# CI: lower/upper bounds from the forecast result (conf_int() default level)
conf_int = arima_forecast.conf_int()
pred_lci = conf_int.iloc[0, 0]
pred_uci = conf_int.iloc[0, 1]

print(f"ARIMA Predicted ESTIMATE: {pred_est:.2f}")
print(f"LCI: {pred_lci:.2f}, UCI: {pred_uci:.2f}")

ARIMA Predicted ESTIMATE: 0.92
LCI: 0.62, UCI: 1.22


In [126]:
from prophet import Prophet

# Prepare dataframe: Prophet expects a date column `ds` and a value column `y`
prophet_df = model_data[['TIME_PERIOD', 'ESTIMATE']].rename(columns={'TIME_PERIOD':'ds', 'ESTIMATE':'y'})
prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')

# Fit model
model_prophet = Prophet(yearly_seasonality=False, daily_seasonality=False)
model_prophet.fit(prophet_df)

# Predict next year.
# The target year is taken from the series being modelled (`model_data`), not
# from the whole dataset `df`: if this series ends earlier than the latest
# period anywhere in `df`, using df's maximum would forecast the wrong year
# and disagree with the linear-regression cell.
next_period = model_data['TIME_PERIOD'].max() + 1
future = pd.DataFrame({'ds': pd.to_datetime([next_period], format='%Y')})
forecast = model_prophet.predict(future)

pred_est = forecast['yhat'].values[0]
pred_lci = forecast['yhat_lower'].values[0]
pred_uci = forecast['yhat_upper'].values[0]

print(f"Prophet Predicted ESTIMATE: {pred_est:.2f}")
print(f"LCI: {pred_lci:.2f}, UCI: {pred_uci:.2f}")


13:52:45 - cmdstanpy - INFO - Chain [1] start processing
13:52:45 - cmdstanpy - INFO - Chain [1] done processing


Prophet Predicted ESTIMATE: 0.99
LCI: 0.71, UCI: 1.23


In [127]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet

def predict_next_year(df, topic, subgroup, group=None):
    """
    Predict next year's ESTIMATE and CI for a given topic and subgroup.

    Three independent one-step-ahead forecasts are produced from the same
    filtered series: a linear regression on the period plus two change
    features, an ARIMA(1,0,0) fit, and a Prophet fit.

    df: dataframe with columns TOPIC, SUBGROUP, TIME_PERIOD, ESTIMATE,
        ESTIMATE_LCI, ESTIMATE_UCI (and GROUP when `group` is given)
    topic: str, e.g., "Any cancer type" or "Delayed getting medical care due to cost among adults"
    subgroup: str, e.g., "18-34 years"
    group: optional str matched against the GROUP column. Use it when the
        same SUBGROUP label occurs under more than one GROUP for a topic;
        when None (default) only TOPIC and SUBGROUP are used, as before.

    Returns: dict with 'next_year' and, under 'LinearRegression', 'ARIMA' and
        'Prophet', a dict with keys 'ESTIMATE', 'LCI' and 'UCI'.

    Raises: ValueError if no rows of `df` match the requested series.
    """

    # Filter for topic and subgroup (and group, when requested)
    selection = (df['TOPIC'] == topic) & (df['SUBGROUP'] == subgroup)
    if group is not None:
        selection = selection & (df['GROUP'] == group)
    filtered_df = df[selection].copy()

    # Fail early with a readable message instead of an opaque error from
    # whichever model happens to be fitted first.
    if filtered_df.empty:
        raise ValueError(
            f"No rows found for topic={topic!r}, subgroup={subgroup!r}, group={group!r}"
        )

    # Order by period so the diffs below compare consecutive periods.
    filtered_df = filtered_df.sort_values('TIME_PERIOD')

    # Derived features; the first row has no predecessor, so its diff is set to 0.
    filtered_df['ESTIMATE_YOY'] = filtered_df['ESTIMATE'].diff().fillna(0)
    filtered_df['CI_WIDTH'] = filtered_df['ESTIMATE_UCI'] - filtered_df['ESTIMATE_LCI']
    filtered_df['CI_WIDTH_CHANGE'] = filtered_df['CI_WIDTH'].diff().fillna(0)

    next_year = filtered_df['TIME_PERIOD'].max() + 1

    # ---------------- Linear Regression ----------------
    X_lr = filtered_df[['TIME_PERIOD', 'ESTIMATE_YOY', 'CI_WIDTH_CHANGE']]
    y_lr_est = filtered_df['ESTIMATE']
    y_lr_ci = filtered_df['CI_WIDTH']

    lr_model_est = LinearRegression().fit(X_lr, y_lr_est)
    lr_model_ci = LinearRegression().fit(X_lr, y_lr_ci)

    # Feature row for the forecast period: the change features are set to
    # their historical means.
    X_test = pd.DataFrame({
        'TIME_PERIOD': [next_year],
        'ESTIMATE_YOY': [filtered_df['ESTIMATE_YOY'].mean()],
        'CI_WIDTH_CHANGE': [filtered_df['CI_WIDTH_CHANGE'].mean()]
    })

    lr_pred_est = lr_model_est.predict(X_test)[0]
    # A linear extrapolation of the interval width can go negative, which
    # would put LCI above UCI; clamp the width at zero.
    lr_pred_ci = max(lr_model_ci.predict(X_test)[0], 0.0)
    lr_pred_lci = lr_pred_est - lr_pred_ci/2
    lr_pred_uci = lr_pred_est + lr_pred_ci/2

    # ---------------- ARIMA ----------------
    ts = filtered_df.set_index('TIME_PERIOD')['ESTIMATE']
    arima_model = ARIMA(ts, order=(1,0,0)).fit()
    # One forecast result supplies both the point forecast and its interval.
    arima_forecast = arima_model.get_forecast(steps=1)
    arima_pred_est = arima_forecast.predicted_mean.values[0]
    arima_conf = arima_forecast.conf_int()
    arima_pred_lci = arima_conf.iloc[0,0]
    arima_pred_uci = arima_conf.iloc[0,1]

    # ---------------- Prophet ----------------
    # Prophet expects a date column `ds` and a value column `y`.
    prophet_df = filtered_df[['TIME_PERIOD','ESTIMATE']].rename(columns={'TIME_PERIOD':'ds','ESTIMATE':'y'})
    prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
    prophet_model = Prophet(yearly_seasonality=False, daily_seasonality=False)
    prophet_model.fit(prophet_df)

    future = pd.DataFrame({'ds': pd.to_datetime([next_year], format='%Y')})
    forecast = prophet_model.predict(future)
    prophet_pred_est = forecast['yhat'].values[0]
    prophet_pred_lci = forecast['yhat_lower'].values[0]
    prophet_pred_uci = forecast['yhat_upper'].values[0]

    return {
        'next_year': next_year,
        'LinearRegression': {'ESTIMATE': lr_pred_est, 'LCI': lr_pred_lci, 'UCI': lr_pred_uci},
        'ARIMA': {'ESTIMATE': arima_pred_est, 'LCI': arima_pred_lci, 'UCI': arima_pred_uci},
        'Prophet': {'ESTIMATE': prophet_pred_est, 'LCI': prophet_pred_lci, 'UCI': prophet_pred_uci}
    }


# Topics to forecast; every one is evaluated for the same subgroup.
topics_to_predict = [
    "Any cancer type",
    "Delayed getting medical care due to cost among adults",
    "Did not get needed medical care due to cost",
    "Did not get needed mental health care due to cost",
    "Did not take medication as prescribed to save money",
    "Has a usual place of care among adults",
]

subgroup = "18-34 years"

# For each topic, run all three models and print one summary line per model.
for topic in topics_to_predict:
    result = predict_next_year(df, topic, subgroup)
    print(f"\nTopic: {topic}, Subgroup: {subgroup}, Next Year: {result['next_year']}")
    for model_name in ('LinearRegression', 'ARIMA', 'Prophet'):
        model_result = result[model_name]
        est, lci, uci = model_result['ESTIMATE'], model_result['LCI'], model_result['UCI']
        print(f"{model_name}: ESTIMATE={est:.2f}, LCI={lci:.2f}, UCI={uci:.2f}")


14:12:39 - cmdstanpy - INFO - Chain [1] start processing
14:12:39 - cmdstanpy - INFO - Chain [1] done processing
14:12:39 - cmdstanpy - INFO - Chain [1] start processing
14:12:39 - cmdstanpy - INFO - Chain [1] done processing
14:12:39 - cmdstanpy - INFO - Chain [1] start processing
14:12:39 - cmdstanpy - INFO - Chain [1] done processing
14:12:39 - cmdstanpy - INFO - Chain [1] start processing
14:12:39 - cmdstanpy - INFO - Chain [1] done processing
14:12:39 - cmdstanpy - INFO - Chain [1] start processing
14:12:39 - cmdstanpy - INFO - Chain [1] done processing
14:12:39 - cmdstanpy - INFO - Chain [1] start processing
14:12:39 - cmdstanpy - INFO - Chain [1] done processing



Topic: Any cancer type, Subgroup: 18-34 years, Next Year: 2025
LinearRegression: ESTIMATE=0.89, LCI=0.62, UCI=1.16
ARIMA: ESTIMATE=0.92, LCI=0.62, UCI=1.22
Prophet: ESTIMATE=0.99, LCI=0.71, UCI=1.26

Topic: Delayed getting medical care due to cost among adults, Subgroup: 18-34 years, Next Year: 2025
LinearRegression: ESTIMATE=7.39, LCI=6.50, UCI=8.27
ARIMA: ESTIMATE=9.64, LCI=7.84, UCI=11.44
Prophet: ESTIMATE=9.27, LCI=8.07, UCI=10.40

Topic: Did not get needed medical care due to cost, Subgroup: 18-34 years, Next Year: 2025
LinearRegression: ESTIMATE=5.94, LCI=5.20, UCI=6.67
ARIMA: ESTIMATE=7.97, LCI=6.26, UCI=9.68
Prophet: ESTIMATE=7.42, LCI=6.35, UCI=8.50

Topic: Did not get needed mental health care due to cost, Subgroup: 18-34 years, Next Year: 2025
LinearRegression: ESTIMATE=10.79, LCI=9.89, UCI=11.68
ARIMA: ESTIMATE=9.79, LCI=8.38, UCI=11.20
Prophet: ESTIMATE=10.85, LCI=10.62, UCI=11.08

Topic: Did not take medication as prescribed to save money, Subgroup: 18-34 years, Next Yea