In [None]:
import pandas as pd
# Load the test dataset
test_file_path = "test.csv"  
test_df = pd.read_csv(test_file_path)

print("Test Dataset Info:")
print(test_df.info())

Test Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    800000 non-null  int64  
 1   Age                   787511 non-null  float64
 2   Gender                800000 non-null  object 
 3   Annual Income         770140 non-null  float64
 4   Marital Status        787664 non-null  object 
 5   Number of Dependents  726870 non-null  float64
 6   Education Level       800000 non-null  object 
 7   Occupation            560875 non-null  object 
 8   Health Score          750551 non-null  float64
 9   Location              800000 non-null  object 
 10  Policy Type           800000 non-null  object 
 11  Previous Claims       557198 non-null  float64
 12  Vehicle Age           799997 non-null  float64
 13  Credit Score          708549 non-null  float64
 14  Insurance Duration    799998 non-

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

test_df['Policy Start Year'] = pd.to_datetime(test_df['Policy Start Date'], errors='coerce').dt.year
test_df.drop(['Policy Start Date'], axis=1, inplace=True)


test_df.fillna({
    'Number of Dependents': test_df['Number of Dependents'].median(),
    'Credit Score': test_df['Credit Score'].median(),
    'Health Score': test_df['Health Score'].median(),
    'Previous Claims': test_df['Previous Claims'].median(),
    'Vehicle Age': test_df['Vehicle Age'].median()
}, inplace=True)


test_df.drop(['Customer Feedback'], axis=1, inplace=True)

# One-hot encoding for categorical variables
test_df = pd.get_dummies(test_df, columns=[
    'Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 
    'Policy Type', 'Smoking Status', 'Exercise Frequency', 'Property Type'
], dtype=int)

# Handle any remaining NaN values after encoding
test_df.fillna(0, inplace=True)  

# Feature Scaling (Standardization) for numerical variables
test_df['Annual Income'] = (test_df['Annual Income'] - test_df['Annual Income'].mean()) / test_df['Annual Income'].std()
test_df['Premium Amount'] = (test_df['Premium Amount'] - test_df['Premium Amount'].mean()) / test_df['Premium Amount'].std()

  test_df['Policy Start Year'] = pd.to_datetime(test_df['Policy Start Date'], errors='coerce').dt.year


In [None]:
# Define Features (X) for the test set
X_test = test_df.drop(['Premium Amount'], axis=1)  

In [5]:
import joblib

# Load the trained model
loaded_model = joblib.load("xgboost_model.pkl")

# Make predictions on the test set
y_test_pred = loaded_model.predict(X_test)

In [None]:
y_test_true = test_df['Premium Amount'] 

# Calculate evaluation metrics
mae = mean_absolute_error(y_test_true, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
r2 = r2_score(y_test_true, y_test_pred)

print(f"Evaluation Metrics on Test Data:\nMAE: {mae:.2f}, RMSE: {rmse:.2f}, R² Score: {r2:.2f}")

Evaluation Metrics on Test Data:
MAE: 0.99, RMSE: 1.02, R² Score: -695647471904035479730807373824.00


#bash--->  mlflow ui

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

# Set the tracking URI
mlflow.set_tracking_uri("http://localhost:5000")  

# Load the dataset
df = pd.read_csv("test.csv")

# Preprocessing function
def preprocess_data(df):
    # Convert 'Policy Start Date' to datetime and extract year
    df['Policy Start Year'] = pd.to_datetime(df['Policy Start Date'], errors='coerce').dt.year
    df.drop(['Policy Start Date'], axis=1, inplace=True)

    # Handle missing values
    df.fillna({
        'Number of Dependents': df['Number of Dependents'].median(),
        'Credit Score': df['Credit Score'].median(),
        'Health Score': df['Health Score'].median(),
        'Previous Claims': df['Previous Claims'].median(),
        'Vehicle Age': df['Vehicle Age'].median()
    }, inplace=True)

    # Dropping unnecessary columns
    df.drop(['Customer Feedback'], axis=1, inplace=True)

    return df

# Preprocessing the data
df = preprocess_data(df)

# Defining features and target variable
X = df.drop(['Premium Amount'], axis=1)
y = df['Premium Amount']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Define the preprocessing for numerical and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Creating a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Creating a pipeline with XGBoost
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=100, random_state=42))
])

# Starting an MLflow run
with mlflow.start_run():
    # Log parameters
    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("n_estimators", 100)

    # Training the model
    pipeline.fit(X_train, y_train)

    # Making predictions
    y_pred = pipeline.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(pipeline, "model")

   
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R² Score: {r2:.2f}")

  df['Policy Start Year'] = pd.to_datetime(df['Policy Start Date'], errors='coerce').dt.year


MAE: 0.00, RMSE: 0.00, R² Score: -37354656625566768.00
🏃 View run secretive-tern-80 at: http://localhost:5000/#/experiments/0/runs/15cd5787219d413d87c2fcbaaaf6c9e5
🧪 View experiment at: http://localhost:5000/#/experiments/0
