In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

print("All imports successful!")

All imports successful!


In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so every run rebuilds the same synthetic table.
np.random.seed(42)
n_records = 5000


def _ints(low, high):
    """Draw `n_records` integers from [low, high) with the global NumPy RNG."""
    return np.random.randint(low, high, n_records)


def _pick(options):
    """Draw `n_records` entries from `options` with the global NumPy RNG."""
    return np.random.choice(options, n_records)


# The entries are evaluated top to bottom and each one consumes the seeded RNG
# stream, so reordering them would change the generated values.
data = {
    'Age': _ints(18, 80),
    'Gender': _pick(['Male', 'Female']),
    'Annual Income': _ints(20000, 200000),
    'Marital Status': _pick(['Single', 'Married', 'Divorced']),
    'Number of Dependents': _ints(0, 6),
    'Education Level': _pick(["High School", "Bachelor's", "Master's", "PhD"]),
    'Occupation': _pick(['Employed', 'Self-Employed', 'Unemployed']),
    'Health Score': _ints(30, 100),
    'Location': _pick(['Urban', 'Suburban', 'Rural']),
    'Policy Type': _pick(['Basic', 'Comprehensive', 'Premium']),
    'Previous Claims': _ints(0, 6),
    'Vehicle Age': _ints(0, 21),
    'Credit Score': _ints(300, 850),
    'Insurance Duration': _ints(1, 21),
    'Smoking Status': _pick(['Yes', 'No']),
    'Exercise Frequency': _pick(['Daily', 'Weekly', 'Monthly', 'Rarely']),
    'Property Type': _pick(['House', 'Apartment', 'Condo']),
}

df = pd.DataFrame(data)

# Synthetic target: a base amount plus additive per-customer terms, scaled by
# multipliers looked up from three categorical columns.
base_premium = 2000
age_factor = df['Age'].sub(25).mul(50)
income_factor = df['Annual Income'].div(100000).mul(500)
health_factor = df['Health Score'].rsub(100).mul(30)
credit_factor = df['Credit Score'].rsub(750).mul(2)
claims_factor = df['Previous Claims'].mul(400)

policy_mult = df['Policy Type'].map({'Basic': 0.8, 'Comprehensive': 1.2, 'Premium': 1.5})
location_mult = df['Location'].map({'Urban': 1.1, 'Suburban': 1.0, 'Rural': 0.9})
smoking_mult = df['Smoking Status'].map({'Yes': 1.3, 'No': 1.0})

# Terms are combined strictly left to right so floating-point results do not
# depend on regrouping.
additive_part = (
    base_premium
    + age_factor
    + income_factor
    + health_factor
    + credit_factor
    + claims_factor
)
scaled_premium = additive_part.mul(policy_mult).mul(location_mult).mul(smoking_mult)

# Every premium is floored at 500.
df['Premium Amount'] = scaled_premium.clip(lower=500)

df.to_csv('insurance_data.csv', index=False)

row_count, column_count = df.shape
print(" Generated sample data: insurance_data.csv ({:,} rows)".format(row_count))
# This count covers every column of the frame, including 'Premium Amount'.
print(" Features: {}".format(column_count))
print("\nPreview:")
print(df.head())

 Generated sample data: insurance_data.csv (5,000 rows)
 Features: 18

Preview:
   Age  Gender  Annual Income Marital Status  Number of Dependents  \
0   56    Male          59263        Married                     1   
1   69  Female         128491         Single                     0   
2   46  Female          75087       Divorced                     3   
3   32  Female         193353        Married                     1   
4   60  Female          71482        Married                     4   

  Education Level     Occupation  Health Score  Location    Policy Type  \
0             PhD       Employed            93  Suburban          Basic   
1      Bachelor's       Employed            43     Rural          Basic   
2             PhD     Unemployed            52  Suburban  Comprehensive   
3             PhD       Employed            77     Rural  Comprehensive   
4        Master's  Self-Employed            57  Suburban        Premium   

   Previous Claims  Vehicle Age  Credit Score  I

In [3]:
# STEP 2: DATA PREPROCESSING
#
# Works on a copy of `df`: fills missing values, separates the
# 'Premium Amount' target from the features, records which feature columns are
# numeric vs. categorical, and creates an 80/20 train/test split.

print("\n" + "=" * 60)
print("STEP 2: DATA PREPROCESSING")
print("=" * 60)

df_processed = df.copy()

# Step 2.1: Handle Missing Values
print("\n2.1 Handling Missing Values:")

# NOTE(review): the fill values are computed on the whole frame, i.e. before
# the train/test split below, so test rows contribute to them. If the real
# data has missing values, consider imputing inside the model pipeline instead.
# NOTE(review): `numerical_cols` is selected from `df_processed`, which still
# contains the 'Premium Amount' target, so a missing target would be filled
# with the median as well - confirm that is intended.

# For numerical columns, fill with median.
# The filled column is assigned back rather than calling
# `fillna(..., inplace=True)` on `df_processed[col]`: that form is chained
# assignment, which pandas deprecates and which leaves `df_processed`
# unchanged when Copy-on-Write is enabled.
numerical_cols = df_processed.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    if df_processed[col].isnull().any():
        df_processed[col] = df_processed[col].fillna(df_processed[col].median())
        print(f"  Filled {col} with median")

# For categorical columns, fill with mode (first mode when there are several).
categorical_cols = df_processed.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_processed[col].isnull().any():
        df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])
        print(f"  Filled {col} with mode")

print(f"Missing values after preprocessing: {df_processed.isnull().sum().sum()}")

# Step 2.2: Identify features and target
print("\n2.2 Feature-Target Split:")

X = df_processed.drop('Premium Amount', axis=1)
y = df_processed['Premium Amount']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Separate numerical and categorical features (by dtype of the feature frame).
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

# Step 2.3: Train-Test Split
print("\n2.3 Train-Test Split (80-20):")

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


STEP 2: DATA PREPROCESSING

2.1 Handling Missing Values:
Missing values after preprocessing: 0

2.2 Feature-Target Split:
Features shape: (5000, 17)
Target shape: (5000,)
Numerical features: ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
Categorical features: ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Smoking Status', 'Exercise Frequency', 'Property Type']

2.3 Train-Test Split (80-20):
Training set size: 4000
Testing set size: 1000


In [4]:
# STEP 3: MODEL DEVELOPMENT & EVALUATION
#
# Fits four regressors behind the same preprocessing step, scores each one on
# the train and test splits, tabulates the scores, and keeps the pipeline with
# the lowest test RMSE as the "best" model.

print("\n" + "=" * 60, "STEP 3: MODEL DEVELOPMENT & EVALUATION", "=" * 60, sep="\n")

# Step 3.1: Create preprocessing pipeline
print("\n3.1 Creating Preprocessing Pipeline:")

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Step 3.2: Create and train models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=10),
    'Random Forest': RandomForestRegressor(random_state=42, n_estimators=100),
    'XGBoost': XGBRegressor(random_state=42, n_estimators=100, max_depth=5),
}

results = {}

print("\n3.2 Training Models:")
print("-" * 60)

for name, model in models.items():
    print(f"\nTraining {name}...")

    # NOTE(review): every pipeline holds the same `preprocessor` object, so it
    # is refitted (on the same training data) on each iteration.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)

    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)

    # One record per model: the fitted pipeline, its scores, and the test
    # predictions.
    scores = {
        'model': pipeline,
        'rmse_train': np.sqrt(mean_squared_error(y_train, y_pred_train)),
        'rmse_test': np.sqrt(mean_squared_error(y_test, y_pred_test)),
        'mae_test': mean_absolute_error(y_test, y_pred_test),
        'r2_test': r2_score(y_test, y_pred_test),
        'y_pred': y_pred_test,
    }
    results[name] = scores

    print(f"  Train RMSE: {scores['rmse_train']:.4f}")
    print(f"  Test RMSE: {scores['rmse_test']:.4f}")
    print(f"  Test MAE: {scores['mae_test']:.4f}")
    print(f"  Test R² Score: {scores['r2_test']:.4f}")

# Step 3.3: Compare models
print("\n3.3 Model Comparison:")
print("-" * 60)

comparison_df = pd.DataFrame(
    [
        {
            'Model': model_name,
            'Train RMSE': entry['rmse_train'],
            'Test RMSE': entry['rmse_test'],
            'Test MAE': entry['mae_test'],
            'Test R²': entry['r2_test'],
        }
        for model_name, entry in results.items()
    ]
)

print(comparison_df.to_string(index=False))

# Select best model (based on lowest test RMSE); ties go to the earlier entry.
# NOTE(review): choosing the winner on the test split makes its reported test
# scores optimistic; a validation split or cross-validation would avoid that.
best_model_name, best_entry = min(
    results.items(), key=lambda item: item[1]['rmse_test']
)
best_model_pipeline = best_entry['model']

print(f"\n Best Model: {best_model_name}")



STEP 3: MODEL DEVELOPMENT & EVALUATION

3.1 Creating Preprocessing Pipeline:

3.2 Training Models:
------------------------------------------------------------

Training Linear Regression...
  Train RMSE: 615.4920
  Test RMSE: 627.9211
  Test MAE: 454.0963
  Test R² Score: 0.9569

Training Decision Tree...
  Train RMSE: 458.0564
  Test RMSE: 1038.5839
  Test MAE: 802.5906
  Test R² Score: 0.8820

Training Random Forest...
  Train RMSE: 236.1487
  Test RMSE: 635.5405
  Test MAE: 494.6790
  Test R² Score: 0.9558

Training XGBoost...
  Train RMSE: 137.0141
  Test RMSE: 355.6750
  Test MAE: 273.2630
  Test R² Score: 0.9862

3.3 Model Comparison:
------------------------------------------------------------
            Model  Train RMSE   Test RMSE   Test MAE  Test R²
Linear Regression  615.492038  627.921053 454.096313 0.956859
    Decision Tree  458.056353 1038.583903 802.590586 0.881977
    Random Forest  236.148715  635.540547 494.678958 0.955805
          XGBoost  137.014117  355.67504

In [6]:
# STEP 4: MLFLOW INTEGRATION
#
# Records one MLflow run per trained model (parameters, metrics, fitted
# pipeline) under a single experiment, then one extra run for the best model.
import mlflow
import mlflow.sklearn

print("\n" + "=" * 60, "STEP 4: MLFLOW INTEGRATION", "=" * 60, sep="\n")

mlflow.set_experiment("SmartPremium_Insurance")


def _log_run(run_name, params, metrics, fitted_pipeline, artifact_path, done_message):
    """Open one MLflow run and log params, metrics and the model, in that order.

    `done_message` is printed while the run is still open, after the model has
    been logged.
    """
    with mlflow.start_run(run_name=run_name):
        for key, value in params.items():
            mlflow.log_param(key, value)
        for key, value in metrics.items():
            mlflow.log_metric(key, value)
        mlflow.sklearn.log_model(fitted_pipeline, artifact_path)
        print(done_message)


for name, result in results.items():
    print(f"\nLogging {name} to MLflow...")
    _log_run(
        run_name=name,
        # test_size / random_state repeat the literals used for the split.
        params={"model_type": name, "test_size": 0.2, "random_state": 42},
        metrics={
            "rmse_train": result['rmse_train'],
            "rmse_test": result['rmse_test'],
            "mae_test": result['mae_test'],
            "r2_score": result['r2_test'],
        },
        fitted_pipeline=result['model'],
        artifact_path="model",
        done_message=f" {name} logged successfully",
    )

# Log best model separately
print(f"\nLogging best model ({best_model_name}) as 'best_model'...")
best_scores = results[best_model_name]
_log_run(
    run_name="Best_Model",
    params={"model_type": best_model_name},
    metrics={
        "rmse_test": best_scores['rmse_test'],
        "r2_score": best_scores['r2_test'],
    },
    fitted_pipeline=best_model_pipeline,
    artifact_path="best_model",
    done_message="Best model logged successfully",
)




STEP 4: MLFLOW INTEGRATION


2025/12/17 12:23:08 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/17 12:23:08 INFO mlflow.store.db.utils: Updating database tables
2025/12/17 12:23:08 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/17 12:23:08 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/17 12:23:09 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/17 12:23:09 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/17 12:23:09 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/17 12:23:09 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/17 12:23:09 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/17 12:23:09 INFO alembic.runtime.migration: Running 


Logging Linear Regression to MLflow...




 Linear Regression logged successfully

Logging Decision Tree to MLflow...




 Decision Tree logged successfully

Logging Random Forest to MLflow...




 Random Forest logged successfully

Logging XGBoost to MLflow...




 XGBoost logged successfully

Logging best model (XGBoost) as 'best_model'...




Best model logged successfully


In [7]:
# STEP 5: SAVE TRAINED MODEL FOR DEPLOYMENT
#
# Serialises the selected pipeline (preprocessing + estimator) to disk with
# joblib so it can be loaded outside this notebook.
import joblib

print("\n" + "=" * 60, "STEP 5: SAVING MODEL FOR DEPLOYMENT", "=" * 60, sep="\n")

model_filename = 'best_model.pkl'
joblib.dump(best_model_pipeline, model_filename)
print("Best model saved as '{}'".format(model_filename))


STEP 5: SAVING MODEL FOR DEPLOYMENT
Best model saved as 'best_model.pkl'


In [8]:
# STEP 6: PREDICTION FUNCTION FOR DEPLOYMENT

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "STEP 6: PREDICTION FUNCTION", "=" * 60, sep="\n")

def predict_premium(model, input_data_dict):
    """
    Predict insurance premium for a single customer.

    Args:
        model: Trained ML pipeline exposing ``predict``; it is called with a
            one-row DataFrame built from ``input_data_dict``.
        input_data_dict: Dictionary with customer details (feature name ->
            value). Must not be empty.

    Returns:
        float: Predicted premium amount as a built-in Python float.

    Raises:
        ValueError: If ``input_data_dict`` is None or empty.
    """
    # Fail early with a clear message instead of letting the model raise an
    # opaque error about missing columns.
    if input_data_dict is None or len(input_data_dict) == 0:
        raise ValueError("input_data_dict must contain at least one feature")

    # A single-element list yields a one-row frame whose columns are the keys.
    input_df = pd.DataFrame([input_data_dict])

    # Convert the first prediction to a built-in float: estimators may return
    # NumPy scalars (e.g. float32), which are not JSON-serializable.
    return float(model.predict(input_df)[0])

# Example prediction
# One hand-written customer using the same column names as the training data.
example_customer = {
    'Age': 30, 'Gender': 'Male',
    'Annual Income': 50000, 'Marital Status': 'Single',
    'Number of Dependents': 0, 'Education Level': "Bachelor's",
    'Occupation': 'Employed', 'Health Score': 75,
    'Location': 'Urban', 'Policy Type': 'Comprehensive',
    'Previous Claims': 1, 'Vehicle Age': 5,
    'Credit Score': 720, 'Insurance Duration': 2,
    'Smoking Status': 'No', 'Exercise Frequency': 'Weekly',
    'Property Type': 'Apartment',
}

predicted_premium = predict_premium(best_model_pipeline, example_customer)
print("\nExample Prediction:")
print("Customer Details:", example_customer)
print("Predicted Premium: ${:.2f}".format(predicted_premium))

# Closing banner and pointer to the saved artifact.
print("\n" + "=" * 60, "MODEL TRAINING & EVALUATION COMPLETE", "=" * 60, sep="\n")
print("\nNext Step: Deploy the model using Streamlit")
print("Use the 'best_model.pkl' file for deployment")


STEP 6: PREDICTION FUNCTION

Example Prediction:
Customer Details: {'Age': 30, 'Gender': 'Male', 'Annual Income': 50000, 'Marital Status': 'Single', 'Number of Dependents': 0, 'Education Level': "Bachelor's", 'Occupation': 'Employed', 'Health Score': 75, 'Location': 'Urban', 'Policy Type': 'Comprehensive', 'Previous Claims': 1, 'Vehicle Age': 5, 'Credit Score': 720, 'Insurance Duration': 2, 'Smoking Status': 'No', 'Exercise Frequency': 'Weekly', 'Property Type': 'Apartment'}
Predicted Premium: $5460.88

MODEL TRAINING & EVALUATION COMPLETE

Next Step: Deploy the model using Streamlit
Use the 'best_model.pkl' file for deployment
