In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into a stream of full file paths, then print each one.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/sample_submission.csv
/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/train.csv
/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/test.csv


In [2]:
#Importing libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostRegressor  
from sklearn.metrics import mean_squared_error, make_scorer

# Configuration and Constants

# Lower bound applied to Transport_Cost before the log transform, so that
# zero/negative costs do not reach np.log.
CLIP_FLOOR = 1.0

# Categories whose share of the training rows is below this fraction (5%)
# are merged into a single 'Other' level.
FREQ_THRESHOLD = 0.05

# Number of cross-validation folds (not referenced by the visible notebook code).
N_SPLITS_CV = 10

# Seed handed to the regressor so repeated runs give the same model.
RANDOM_STATE = 42

# Data Loading
# Single place to change when the notebook is run outside the Kaggle image.
DATA_DIR = "/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge"

try:
    train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
    test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, and in a script it ends the process with a success status. Either
    # way the traceback is lost and the failure is easy to miss.
    raise FileNotFoundError(
        f"Error: Data files not found. Expected train.csv and test.csv in {DATA_DIR!r}."
    ) from err

# Keep the test ids as a plain column for the submission file before
# Hospital_Id is moved into the index.
test_ids = test_df['Hospital_Id']

# Reassignment rather than inplace=True: same resulting frames, but the
# statement reads as "new frame from old" and chains cleanly.
train_df = train_df.set_index('Hospital_Id')
test_df = test_df.set_index('Hospital_Id')

# Feature Engineering
# Split the target from the predictors, working on copies of the loaded frames.
X_train = train_df.drop(columns=["Transport_Cost"]).copy()
y_train = train_df["Transport_Cost"].copy()
X_test = test_df.copy()

# Best-performing transformation: Clip and Log
# Costs below CLIP_FLOOR are raised to it first, then the natural log is taken.
clipped_cost = y_train.clip(lower=CLIP_FLOOR)
y_train_transformed = np.log(clipped_cost)

# Stack train on top of test so every feature is derived identically for both;
# the training rows are the first len(X_train) rows of the result.
combined_df = pd.concat([X_train, X_test])

# KEEPING THE DATE FEATURES
# Values that do not match the month/day/2-digit-year format are coerced to NaT.
for date_col in ('Order_Placed_Date', 'Delivery_Date'):
    combined_df[date_col] = pd.to_datetime(
        combined_df[date_col], format='%m/%d/%y', errors='coerce'
    )

placed = combined_df['Order_Placed_Date']
delivered = combined_df['Delivery_Date']

# NOTE(review): rows where either date is missing get a lag of 0 here, which is
# indistinguishable from a same-day delivery downstream.
lag_days = (delivered - placed).dt.days
combined_df['Delivery_Lag_Days'] = lag_days.fillna(0).astype(int)
combined_df['Order_Day_of_Week'] = placed.dt.dayofweek
combined_df['Order_Month'] = placed.dt.month

# NOTE(review): 'Equipment_Volume' is height * width only (no depth term is used).
footprint = combined_df['Equipment_Height'] * combined_df['Equipment_Width']
combined_df['Equipment_Volume'] = footprint
# The small epsilon guards against division by zero when the product is 0.
combined_df['Equipment_Density'] = combined_df['Equipment_Weight'] / (footprint + 1e-6)

# Remove the raw date columns and the two columns excluded from modelling;
# errors='ignore' tolerates any of them being absent.
combined_df = combined_df.drop(
    columns=['Order_Placed_Date', 'Delivery_Date', 'Supplier_Name', 'Hospital_Location'],
    errors='ignore',
)

# Binary and Categorical Mapping
binary_cols = ['CrossBorder_Shipping', 'Installation_Service', 'Rural_Hospital', 'Urgent_Shipping', 'Fragile_Equipment']
binary_map = {'Yes': 1, 'No': 0}

# Only touch the flag columns that actually exist in the frame.
binary_cols_present = [name for name in binary_cols if name in combined_df.columns]

for col in binary_cols_present:
    # 'Yes'/'No' become 1/0; anything else (including missing values) maps to
    # NaN and is then filled with 0, i.e. treated as 'No'.
    encoded_flag = combined_df[col].map(binary_map)
    combined_df[col] = encoded_flag.fillna(0)

# Group Low-Frequency Categorical Features
categorical_cols_to_group = ['Equipment_Type', 'Transport_Method', 'Hospital_Info']
n_train_rows = len(X_train)

for col in categorical_cols_to_group:
    if col not in combined_df.columns:
        continue

    # Category shares are measured on the training rows only (the first
    # n_train_rows of the combined frame), so the test set does not influence
    # which levels count as rare.
    train_share = combined_df[col].iloc[:n_train_rows].value_counts(normalize=True)
    rare_levels = train_share.loc[train_share < FREQ_THRESHOLD].index

    # Replace rare levels with 'Other' in both train and test rows; every
    # other value (missing values included) is left as it was.
    is_rare = combined_df[col].isin(rare_levels)
    combined_df[col] = np.where(is_rare, 'Other', combined_df[col])

# Final Data Preparation
# Undo the earlier stacking: the first len(X_train) rows are the training
# features, everything after them belongs to the test set.
split_at = len(X_train)
X_train_clean, X_test_clean = combined_df.iloc[:split_at], combined_df.iloc[split_at:]


# Define Preprocessing Pipeline
# Numeric features are every numeric-dtype column except the 0/1 flag columns,
# which are left out so they are not imputed or scaled.
numeric_cols = [
    name
    for name in X_train_clean.select_dtypes(include=np.number).columns
    if name not in binary_cols
]
# Categorical features are the object-dtype columns.
categorical_cols = list(X_train_clean.select_dtypes(include=['object']).columns)

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: missing values become their own 'Missing' level, then
# one-hot encode; levels unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Columns not listed in either branch (here: the binary flag columns) are
# forwarded unchanged by remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='passthrough',
)
          
# Fit AdaBoost Model
# Hyperparameters collected in one place so they are easy to scan and tune.
ada_params = dict(
    n_estimators=300,           # number of boosting stages
    learning_rate=0.1,          # shrinks the contribution of each estimator
    loss='square',              # loss used when updating the sample weights
    random_state=RANDOM_STATE,  # fixed seed for reproducible fits
)
ada_model = AdaBoostRegressor(**ada_params)

# Chain preprocessing and the regressor so both are fitted together and the
# same transformations are replayed at prediction time.
adaboost_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', ada_model),
    ]
)

# Train on the full training set against the log-transformed target.
adaboost_pipeline.fit(X_train_clean, y_train_transformed)
print("Model training complete.")

# Predict on test data using the fitted pipeline.
# The model was trained on log(cost), so these predictions are in log space.
test_pred_transformed = adaboost_pipeline.predict(X_test_clean)

# Reverse the log transform, then apply the same floor that was applied to the
# training target. Reusing CLIP_FLOOR (instead of a second hard-coded 1.0)
# keeps the forward and inverse transforms in agreement if the floor is tuned.
test_pred = np.exp(test_pred_transformed).clip(min=CLIP_FLOOR)

# Create submission file: one row per test id, in the original test.csv order.
submission = pd.DataFrame({
    'Hospital_Id': test_ids,
    'Transport_Cost': test_pred
})
submission.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")

Model training complete.
Submission file created: submission.csv
