# In [None]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load the Dataset
# Replace the path with the location of your dataset
df = pd.read_csv('hospital_charges.csv')

# Step 3: Explore the Data
# First rows, column summary, and descriptive statistics of the raw table.
print(df.head())
# DataFrame.info() writes its report directly and returns None, so it is
# called bare; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

# Step 4: Data Preprocessing
# Handle missing values (if any)
df = df.dropna()  # Simple method, or you can fill with mean/median values

# Encode categorical variables
categorical_features = ['sex', 'region', 'smoker']  # Adjust based on your dataset
numerical_features = ['age', 'bmi', 'children']      # Adjust based on your dataset

# Define a column transformer to preprocess both numerical and categorical data:
# numerical columns are standardized, categorical columns are one-hot encoded.
# handle_unknown='ignore' makes the encoder output all zeros for a category it
# did not see while fitting, instead of raising at transform/predict time
# (e.g. a rare category that lands only in the test split or in new data).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# Step 5: Define Features and Target
y = df['charges']               # Target (hospital charges)
X = df.drop(columns='charges')  # Features: every column except the target

# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Define Machine Learning Models
# Candidate regressors keyed by the display name used in reports and plots.
# The two ensemble models are created with random_state=42.
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['XGBoost'] = XGBRegressor(objective='reg:squarederror', random_state=42)

# Step 7: Train and Evaluate Models
# Every candidate is chained behind the preprocessor, fitted on the training
# split, and scored on the test split. Metrics are collected per model name.
results = {}
for model_name, estimator in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', estimator),
    ])

    # Fit on the training rows, then predict the held-out test rows
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # Mean squared error and coefficient of determination on the test split
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

    # Record and report the scores for this model
    results[model_name] = dict(MSE=mse, R2=r2)
    print(f"{model_name} - MSE: {mse:.2f}, R2: {r2:.2f}")

# Step 8: Visualize Model Performance
# One row per model, one column per metric
result_df = pd.DataFrame(results).transpose()

# Bar chart of the R2 score of each model
_, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=result_df.index, y='R2', data=result_df, ax=ax)
ax.set_title('Model Performance Comparison (R2 Score)')
ax.set_ylabel('R2 Score')
plt.show()

# Step 9: Predict Hospital Charges for New Data (Example)
# A single example record using the same feature column names that the
# preprocessor selects (age, bmi, children, sex, smoker, region).
new_data = pd.DataFrame({
    'age': [40],
    'sex': ['male'],
    'bmi': [30.5],
    'children': [2],
    'smoker': ['yes'],
    'region': ['southeast']
})

# Use the best-performing model for prediction. The winner is looked up from
# the R2 scores recorded in `results` instead of being hard-coded, so the
# choice always follows the evaluation that was actually run.
best_name = max(results, key=lambda model_name: results[model_name]['R2'])
print(f"Best model by R2: {best_name}")
best_model = models[best_name]
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', best_model)])
pipeline.fit(X_train, y_train)  # Re-train the best model

# predict() returns one value per input row; there is a single row here
predicted_charge = pipeline.predict(new_data)
print(f"Predicted Hospital Charge: ${predicted_charge[0]:.2f}")
