# Overview
The goal of this challenge is to predict future claims for health insurance enrollees. This will help the healthcare insurer adequately set the premiums.
The data consists of patient profiles, including age, sex, chronic conditions, and the severity of each chronic condition. A history of outpatient claims for the past few years is also included.
The goal is to predict the total claims for the next year.

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import optuna
from scipy.stats import zscore
from sklearn.model_selection import KFold
# 1. Load the data
# Input locations, resolved relative to the current working directory.
train_json_path = 'patient_data_train.json'
test_json_path = 'patient_data_test.json'
train_csv_path = 'train.csv'
submission_csv_path = 'sample_submission.csv'

# Load JSON files (patient records consumed by process_json_data below).
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default locale encoding.
with open(train_json_path, 'r', encoding='utf-8') as f:
    train_json_data = json.load(f)

with open(test_json_path, 'r', encoding='utf-8') as f:
    test_json_data = json.load(f)

# Load CSV files: train.csv is merged on 'PatientID' and supplies the
# 'TotalClaims' target; the sample submission is reused as the output template.
train_csv = pd.read_csv(train_csv_path)
submission_csv = pd.read_csv(submission_csv_path)

# 2. Preprocess the data and add the severity column
def process_json_data(json_data):
    """
    Flattens raw patient records into one feature row per patient.

    Args:
        json_data: Iterable of patient dicts. Each record must provide
            'PatientID', 'Sex' and 'Age'. 'Conditions' (condition code ->
            severity) and 'Out patient costs' (year string -> cost) are
            optional; when missing or null they are treated as empty.

    Returns:
        pd.DataFrame with the columns PatientID, Sex, Age, Conditions_AT,
        Conditions_DB, Conditions_HT, Conditions_HD, TotalSeverity and
        Outpatient_2019 .. Outpatient_2023. The same columns are present
        even when json_data is empty, so downstream merges keep working.

    Raises:
        KeyError: If a record lacks 'PatientID', 'Sex' or 'Age'.
    """
    condition_codes = ('AT', 'DB', 'HT', 'HD')
    cost_years = range(2019, 2024)
    columns = (
        ['PatientID', 'Sex', 'Age']
        + [f'Conditions_{code}' for code in condition_codes]
        + ['TotalSeverity']
        + [f'Outpatient_{year}' for year in cost_years]
    )

    rows = []
    for patient in json_data:
        # A missing or null mapping defaults to "nothing recorded", matching
        # the zero defaults used for individual conditions and years below.
        conditions = patient.get('Conditions') or {}
        yearly_costs = patient.get('Out patient costs') or {}

        row = {
            'PatientID': patient['PatientID'],
            # Binary encoding: 'M' -> 1, any other value -> 0.
            'Sex': 1 if patient['Sex'] == 'M' else 0,
            'Age': patient['Age'],
        }
        for code in condition_codes:
            row[f'Conditions_{code}'] = conditions.get(code, 0)
        # Sums every listed condition, including codes outside the four
        # that get their own column.
        row['TotalSeverity'] = sum(conditions.values())
        for year in cost_years:
            # Cost keys are looked up as strings; absent years count as 0.
            row[f'Outpatient_{year}'] = yearly_costs.get(str(year), 0)
        rows.append(row)

    # Passing the column list fixes the schema (and order) for empty input.
    return pd.DataFrame(rows, columns=columns)

# Flatten the raw patient records of both datasets into feature tables
train_df = process_json_data(train_json_data)
test_df = process_json_data(test_json_data)

# 3. Attach the target from the train CSV (joined on PatientID) and discard
# the identifier, which carries no signal for training
train_merged = train_df.merge(train_csv, on='PatientID').drop(columns=['PatientID'])

# Keep only rows whose TotalClaims lies within three standard deviations
# NOTE(review): this filter runs before the split below, so the validation
# rows are trimmed of extreme claims as well.
train_merged['zscore'] = zscore(train_merged['TotalClaims'])
within_three_sigma = train_merged['zscore'].abs() < 3
train_merged = train_merged[within_three_sigma]

# 4. Separate target from features, then hold out 20% for validation
y = train_merged['TotalClaims']
X = train_merged.drop(columns=['TotalClaims', 'zscore'])
X_train_important, X_val_important, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Feature Engineering
def add_advanced_features(df):
    """
    Returns a copy of df extended with non-linear and interaction features.

    The input frame is left untouched; callers must use the returned frame.
    Each feature is only added when the column(s) it is derived from exist.

    Args:
        df: DataFrame that may contain 'Outpatient_2023', 'Age' and
            'TotalSeverity'.

    Returns:
        A new DataFrame with the original columns plus, where derivable:
        Outpatient_2023_log, Age_Squared, Age_Cubed, Severity_Squared,
        Severity_Cubed and Age_Severity_Interaction.
    """
    # Work on a copy so slices of another frame (e.g. train/validation
    # splits) are not modified in place.
    enriched = df.copy()
    has_age = 'Age' in enriched.columns
    has_severity = 'TotalSeverity' in enriched.columns

    if 'Outpatient_2023' in enriched.columns:
        # log(1 + x) keeps zero costs at zero
        enriched['Outpatient_2023_log'] = np.log1p(enriched['Outpatient_2023'])

    if has_age:
        enriched['Age_Squared'] = enriched['Age'] ** 2
        enriched['Age_Cubed'] = enriched['Age'] ** 3  # Cubic feature

    if has_severity:
        enriched['Severity_Squared'] = enriched['TotalSeverity'] ** 2
        enriched['Severity_Cubed'] = enriched['TotalSeverity'] ** 3  # Cubic feature

    # Interaction feature, guarded like the others so a frame lacking either
    # source column does not raise KeyError
    if has_age and has_severity:
        enriched['Age_Severity_Interaction'] = enriched['Age'] * enriched['TotalSeverity']

    return enriched

# Derive the engineered columns for the training, validation, and test frames
X_train_important, X_val_important, test_df = (
    add_advanced_features(frame)
    for frame in (X_train_important, X_val_important, test_df)
)

# Standardize: statistics are learned from the training split only and then
# reused for the validation and test data
scaler = StandardScaler().fit(X_train_important)
X_train_scaled = scaler.transform(X_train_important)
X_val_scaled = scaler.transform(X_val_important)
# The test frame still carries PatientID, so restrict it to the training columns
test_df_scaled = scaler.transform(test_df.loc[:, X_train_important.columns])

# 6. Hyperparameter Tuning with Bayesian Optimization using Optuna
def objective(trial):
    """
    Optuna objective: fits one XGBoost regressor and scores it on validation.

    Samples a hyperparameter set from the trial, trains on the module-level
    X_train_scaled / y_train with early stopping monitored on
    X_val_scaled / y_val, and reports the validation mean absolute error.

    Args:
        trial: optuna trial used to sample the hyperparameters.

    Returns:
        Mean absolute error of the fitted model on the validation split
        (lower is better).
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples the same distribution.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)  # Regularization parameter
    }

    # early_stopping_rounds is configured on the estimator: passing it to
    # fit() is deprecated since XGBoost 1.6 and rejected by recent releases.
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        early_stopping_rounds=50,
        **param,
    )

    model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)],
              verbose=False)

    y_pred = model.predict(X_val_scaled)
    return mean_absolute_error(y_val, y_pred)

# Run the optimization (minimizes the validation MAE returned by objective)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=150)

best_params = study.best_params
print(f"Best Parameters (XGBoost): {best_params}")

# 7. Train Final Model with Best Hyperparameters
# early_stopping_rounds is set on the estimator: passing it to fit() is
# deprecated since XGBoost 1.6 and rejected by recent releases.
best_xgb = xgb.XGBRegressor(
    **best_params,
    objective='reg:squarederror',
    random_state=42,
    early_stopping_rounds=50,
)

best_xgb.fit(
    X_train_scaled,
    y_train,
    eval_set=[(X_val_scaled, y_val)],
    verbose=True
)

# 8. Model Evaluation
y_val_pred_xgb = best_xgb.predict(X_val_scaled)
val_mae_xgb = mean_absolute_error(y_val, y_val_pred_xgb)
print(f"Validation MAE (XGBoost) after tuning: {val_mae_xgb:.2f}")


# Cross-Validation (Use KFold for continuous target variable)
# cross_val_score fits clones without an eval_set, so it gets its own
# estimator with the same hyperparameters but no early stopping.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
cv_xgb = xgb.XGBRegressor(
    **best_params,
    objective='reg:squarederror',
    random_state=42
)
cv_scores_xgb = cross_val_score(cv_xgb, X_train_scaled, y_train, cv=cv, scoring='neg_mean_absolute_error')
# The scorer returns negated MAE, so flip the sign back
mean_cv_score_xgb = -np.mean(cv_scores_xgb)
print(f"Mean Cross-Validation MAE (XGBoost): {mean_cv_score_xgb:.2f}")


# Calculate Validation Percentage Error for XGBoost
# Rows with a zero actual claim are excluded: a relative error is undefined
# there and would otherwise turn the whole mean into inf.
y_val_actual = np.asarray(y_val, dtype=float)
nonzero_actuals = y_val_actual != 0
val_percentage_error_xgb = np.mean(
    np.abs(
        (y_val_actual[nonzero_actuals] - y_val_pred_xgb[nonzero_actuals])
        / y_val_actual[nonzero_actuals]
    )
) * 100
print(f"Validation Percentage Error (XGBoost): {val_percentage_error_xgb:.2f}%")

# 9. Predict on the test set and prepare submission
test_predictions_xgb = best_xgb.predict(test_df_scaled)

# NOTE(review): predictions are assigned positionally, which assumes the
# sample submission lists patients in the same order as the test JSON.
# Confirm, or align on PatientID.
submission_df_xgb = submission_csv.copy()
submission_df_xgb['TotalClaims'] = test_predictions_xgb

submission_filename = 'final_submission_xgb_optimized_13.csv'
submission_df_xgb.to_csv(submission_filename, index=False)
print(f"Submission file '{submission_filename}' created successfully!")

# Expected Output -
# Validation MAE (XGBoost) after tuning: 421.35
# Mean Cross-Validation MAE (XGBoost): 441.43
# Validation Percentage Error (XGBoost): 1.21%
# Submission file 'final_submission_xgb_optimized_13.csv' created successfully!