In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/sample_submission.csv
/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/train.csv
/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/test.csv


In [2]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, make_scorer

#rmse function computes rmse between y_true and y_pred
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Load the datasets
df_train = pd.read_csv('/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/train.csv')
df_test = pd.read_csv('/kaggle/input/Medical-Equipments-Cost-Prediction-Challenge/test.csv')

#Clip all negative costs to zero
y_train_capped = df_train['Transport_Cost'].clip(lower=0)
y_train = np.log1p(y_train_capped) 
# Apply log(1+y) transformation to improve normality and reduce skewed data

df_train = df_train.drop('Transport_Cost', axis=1)#Drop target variable Transport_Cost from train dataframe
hospital_ids = df_test['Hospital_Id']

df_combined = pd.concat([df_train.drop(['Hospital_Id', 'Supplier_Name'], axis=1),
                         df_test.drop(['Hospital_Id', 'Supplier_Name'], axis=1)],
                        ignore_index=True)


date_cols = ['Order_Placed_Date', 'Delivery_Date']
for col in date_cols:
    df_combined[col] = pd.to_datetime(df_combined[col], format='%m/%d/%y', errors='coerce')
    
#Adding new features to dataset and dropping unnecesary features
df_combined['Delivery_Time_Days'] = (df_combined['Delivery_Date'] - df_combined['Order_Placed_Date']).dt.days
df_combined.loc[df_combined['Delivery_Time_Days'] < 0, 'Delivery_Time_Days'] = np.nan


df_combined['Order_Year'] = df_combined['Order_Placed_Date'].dt.year
df_combined['Order_Month'] = df_combined['Order_Placed_Date'].dt.month
df_combined['Order_DayOfWeek'] = df_combined['Order_Placed_Date'].dt.dayofweek
df_combined = df_combined.drop(date_cols, axis=1)


df_combined = df_combined.drop('Hospital_Location', axis=1)


num_cols = ['Equipment_Height', 'Equipment_Width', 'Equipment_Weight', 'Equipment_Value']
for col in num_cols:
    df_combined[col] = pd.to_numeric(df_combined[col], errors='coerce')

df_combined['Equipment_Volume'] = df_combined['Equipment_Height'] * df_combined['Equipment_Width']
df_combined['Density'] = df_combined['Equipment_Weight'] / (df_combined['Equipment_Volume']+ 1e-6)
df_combined['Value_per_Weight'] = df_combined['Equipment_Value'] / (df_combined['Equipment_Weight'] + 1e-6)
#Density=mass/Volume 1e-6 added to handle zero Equipment_Volume present in dataset 
#Similarly 1e-6 added to handle zero Equipment_Weight in the dataset

# Separate back into train and test sets after adding new features ,removing unnecesary features , clipping costs to zero and applying transformations
X_train = df_combined.iloc[:len(y_train)]
X_test = df_combined.iloc[len(y_train):]


#Finding numerical and categorical features
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include='object').columns.tolist()

# Define  pipeline  for numerical features : Median Imputation
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])#Scaling important for PCA+Ridge

# Define pipeline for categorical features: Impute 'missing', then One-Hot Encoding is done
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='passthrough'
)

#  Create PCA + Ridge Regression Pipeline 

pca_ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(random_state=42)),
    ('ridge', Ridge(random_state=42))
])

#  Hyperparameter grid used for tuning hyperparameters
param_grid = {'pca__n_components': [5, 10, 20, 30],'ridge__alpha': [0.1, 1.0, 10.0, 100.0]} # PCA components: search for the optimal number of components
# Ridge regularization strength (alpha): search for the best control parameter
neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

#  Grid Search and Optimization 
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pca_ridge_pipeline,
    param_grid=param_grid,
    scoring=neg_mse_scorer,
    cv=cv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train) #Fit the PCA+Ridge model with training data and check which hyperparameters are best

# Output best parameters and estimated RMSE
best_rmse = np.sqrt(-grid_search.best_score_)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Estimated CV(Cross-Validation) RMSE (on transformed target): {best_rmse:.4f}")

#  Predict transformed transport cost
y_pred_transformed = grid_search.best_estimator_.predict(X_test)

# Reverse the log transformation and ensure non-negative costs
y_pred_final = np.expm1(y_pred_transformed).clip(min=0)

# Create submission file
submission = pd.DataFrame({
    'Hospital_Id': hospital_ids,
    'Transport_Cost': y_pred_final
})

submission_filename = 'submission.csv'
submission.to_csv(submission_filename, index=False)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'pca__n_components': 30, 'ridge__alpha': 10.0}
Estimated CV(Cross-Validation) RMSE (on transformed target): 2.1214


In [3]:
print(submission.head())

            Hospital_Id  Transport_Cost
0          fffe33003400      220.234877
1  fffe3700330036003600      239.301549
2  fffe3300390038003400     2638.565183
3      fffe310030003900      121.117436
4  fffe3700330031003200      574.101602
