In [1]:
import pandas as pd
from sqlalchemy import create_engine


In [2]:
# Note:: The make sure you use the information from your specific PostgreSQL installation
host = r'127.0.0.1' # denotes that the db in a local installation
db = r'MSDS610' # db we just created
user = r'postgres' # using the postgres user for this demo
pw = r'8751' # this is the password established during installation
port = r'5432' # default port estabalished during install
schema = r'cleaned' # schema we just created

In [3]:
# Reading the data from the database
engine = create_engine(f'postgresql://{user}:{pw}@{host}:{port}/{db}')



In [4]:
# Reading the data from insurance_data table
query = 'SELECT * FROM {}.insurance_data_fact'.format('cleaned')
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,0.6,1,0.11,1.0,1,1,1,1,1,1,1,0.58
1,0.15,2,0.23,0.4,1,2,1,2,2,2,1,0.58
2,0.43,1,0.84,0.4,1,3,2,2,2,1,1,0.58
3,0.15,1,0.06,0.0,2,2,2,3,3,2,2,0.29
4,0.66,1,0.63,0.6,1,2,1,2,3,2,2,0.54


## Analytical Question

### **Primary Question:**
- **Can we accurately predict insurance charges based on demographic, lifestyle, and health-related factors?**

### **Reasoning:**
- Insurance charges are influenced by multiple factors such as age, BMI, smoking habits, medical history, and coverage level.
- Understanding which features contribute most to the prediction can help improve pricing strategies and customer segmentation.

### **Key Factors Considered:**
- **Demographics:** Age, gender, region, occupation.
- **Health Metrics:** BMI, medical history, family medical history.
- **Lifestyle Factors:** Smoking status, exercise frequency.
- **Financial Aspects:** Coverage level, number of dependents.

## Feature Engineering Methods

### 1. BMI Category
- **Description**: Converts BMI into categorical groups.
- **Method**: Used `pd.cut` to classify BMI values into categories:
  - Underweight (BMI < 18.5)
  - Normal (18.5 ≤ BMI < 24.9)
  - Overweight (25 ≤ BMI < 29.9)
  - Obese (BMI ≥ 30)

### 2. Health Risk Score
- **Description**: A composite score to assess health risks.
- **Method**: Calculated as:
  - `Health Risk Score = medical_history + family_medical_history + (BMI * 2)`
  - Higher scores indicate higher health risks.

### 3. Smoker-Exercise Interaction
- **Description**: Captures the effect of smoking and exercise frequency together.
- **Method**: Computed as:
  - `Smoker-Exercise Interaction = smoker * exercise_frequency`
  - Helps assess how smoking and exercise jointly impact charges.

### 4. Age Group
- **Description**: Categorizes age into three meaningful groups.
- **Method**: Used `pd.cut` to classify age into:
  - Young (Age < 30)
  - Middle-Aged (30 ≤ Age < 50)
  - Senior (Age ≥ 50)

### 5. Children Dependents Category
- **Description**: Buckets the number of children into different dependency categories.
- **Method**: Used `pd.cut` to create groups:
  - No children (Children = 0)
  - 1-2 children (0 < Children ≤ 2)
  - 3+ children (Children > 3)


These engineered features are designed to enhance model performance by capturing interactions and meaningful patterns in the data.


## Data Preprocessing and Scaling Considerations

### **Data Transformation Process**
- As part of the Week 3 assignment, I initially scaled the dataset to ensure uniformity across features.
- However, I later needed to extract specific features from the original raw schema without scaling.
- This required re-importing the data and applying transformations directly to the unscaled version.

### **Key Learning:**
- **Scaling data before storage can be beneficial** to maintain consistency and improve model performance.
- However, **storing only scaled data may limit feature extraction** since some transformations require access to raw values.

In [5]:
# Reading the data from insurance_data table
query = 'SELECT * FROM {}.insurance_data'.format('raw')
df = pd.read_sql(query, engine)
df.head()

# Replacing the missing values with 'Unknown/Not Provided'
df['medical_history'] = df['medical_history'].fillna('Unknown/Not Provided')
df['family_medical_history'] = df['family_medical_history'].fillna('Unknown/Not Provided')


categorical_cols = df.select_dtypes(include=['object']).columns


# Dictionary to store mappings for each categorical column
dim_tables = {}

for col in categorical_cols:
    unique_values = df[col].dropna().unique()  # Get unique categories
    dim_df = pd.DataFrame({f"{col}_id": range(1, len(unique_values) + 1), col: unique_values})
    dim_tables[col] = dim_df
    
    # Replace categorical values in the main table with their ID
    mapping_dict = dict(zip(dim_df[col], dim_df[f"{col}_id"]))
    df[col] = df[col].map(mapping_dict)

In [6]:
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46,1,21.45,5,1,1,1,1,1,1,1,20460.307669
1,25,2,25.38,2,1,2,1,2,2,2,1,20390.899218
2,38,1,44.88,2,1,3,2,2,2,1,1,20204.476302
3,25,1,19.89,0,2,2,2,3,3,2,2,11789.029843
4,49,1,38.21,3,1,2,1,2,3,2,2,19268.309838


In [7]:
# 1. BMI Category
df['bmi_category'] = pd.cut(df['bmi'], 
                           bins=[-float('inf'), 18.5, 24.9, 29.9, float('inf')],
                           labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

# 2. Health Risk Score
df['health_risk_score'] = df['medical_history'] + df['family_medical_history'] + (df['bmi'] * 2)

# 3. Smoker-Exercise Interaction
df['smoker_exercise_interaction'] = df['smoker'] * df['exercise_frequency']

# 4. Age Group
df['age_group'] = pd.cut(df['age'], 
                        bins=[-float('inf'), 30, 45, float('inf')],
                        labels=['Young', 'Middle-Aged', 'Senior'])

# 5. Children Dependents Category
df['children_category'] = pd.cut(df['children'], 
                               bins=[-float('inf'), 0, 2, float('inf')],
                               labels=['No children', '1-2 children', '3+ children'])

# Display the new features
print("New features added:")
print(df[['bmi_category', 'health_risk_score', 'smoker_exercise_interaction', 
          'age_group', 'children_category']].head())

New features added:
  bmi_category  health_risk_score  smoker_exercise_interaction    age_group  \
0       Normal              44.90                            1       Senior   
1   Overweight              53.76                            2        Young   
2        Obese              93.76                            2  Middle-Aged   
3       Normal              44.78                            6        Young   
4        Obese              79.42                            3       Senior   

  children_category  
0       3+ children  
1      1-2 children  
2      1-2 children  
3       No children  
4       3+ children  


In [8]:
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges,bmi_category,health_risk_score,smoker_exercise_interaction,age_group,children_category
0,46,1,21.45,5,1,1,1,1,1,1,1,20460.307669,Normal,44.9,1,Senior,3+ children
1,25,2,25.38,2,1,2,1,2,2,2,1,20390.899218,Overweight,53.76,2,Young,1-2 children
2,38,1,44.88,2,1,3,2,2,2,1,1,20204.476302,Obese,93.76,2,Middle-Aged,1-2 children
3,25,1,19.89,0,2,2,2,3,3,2,2,11789.029843,Normal,44.78,6,Young,No children
4,49,1,38.21,3,1,2,1,2,3,2,2,19268.309838,Obese,79.42,3,Senior,3+ children


In [9]:
# Label Encoding

# 1. BMI Category
df['bmi_category'] = df['bmi_category'].astype('category')
df['bmi_category'] = df['bmi_category'].cat.codes

# 2. Age Group
df['age_group'] = df['age_group'].astype('category')
df['age_group'] = df['age_group'].cat.codes

# 3. Children Category
df['children_category'] = df['children_category'].astype('category')
df['children_category'] = df['children_category'].cat.codes


## Week 6 Assignment Starts Here

In [10]:
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges,bmi_category,health_risk_score,smoker_exercise_interaction,age_group,children_category
0,46,1,21.45,5,1,1,1,1,1,1,1,20460.307669,1,44.9,1,2,2
1,25,2,25.38,2,1,2,1,2,2,2,1,20390.899218,2,53.76,2,0,1
2,38,1,44.88,2,1,3,2,2,2,1,1,20204.476302,3,93.76,2,1,1
3,25,1,19.89,0,2,2,2,3,3,2,2,11789.029843,1,44.78,6,0,0
4,49,1,38.21,3,1,2,1,2,3,2,2,19268.309838,3,79.42,3,2,2


In [11]:
binary_columns = ['gender', 'smoker']
categorical_cols = ['occupation', 'region', 'coverage_level', 'bmi_category', 'age_group', 'children_category']

In [12]:
for col in binary_columns:
    df[col] = df[col].map({1:0, 2:1})

In [13]:
# Assuming categorical_cols is defined as follows:
categorical_cols = ['occupation', 'region', 'coverage_level', 'bmi_category', 'age_group', 'children_category']

# One-hot encode the categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Convert True/False to 0/1 using map
df_encoded = df_encoded.apply(lambda col: col.map({True: 1, False: 0}) if col.dtype == 'bool' else col)

# Drop the original categorical columns from the original DataFrame
# df_encoded = df_encoded.drop(columns=categorical_cols)

# Display the first few rows of the encoded DataFrame
df_encoded.head()

Unnamed: 0,age,gender,bmi,children,smoker,medical_history,family_medical_history,exercise_frequency,charges,health_risk_score,...,region_4,coverage_level_2,coverage_level_3,bmi_category_1,bmi_category_2,bmi_category_3,age_group_1,age_group_2,children_category_1,children_category_2
0,46,0,21.45,5,0,1,1,1,20460.307669,44.9,...,0,0,0,1,0,0,0,1,0,1
1,25,1,25.38,2,0,1,2,2,20390.899218,53.76,...,0,0,0,0,1,0,0,0,1,0
2,38,0,44.88,2,0,2,2,2,20204.476302,93.76,...,0,0,0,0,0,1,1,0,1,0
3,25,0,19.89,0,1,2,3,3,11789.029843,44.78,...,0,1,0,1,0,0,0,0,0,0
4,49,0,38.21,3,0,1,2,3,19268.309838,79.42,...,0,1,0,0,0,1,0,1,0,1


In [14]:
df_encoded

Unnamed: 0,age,gender,bmi,children,smoker,medical_history,family_medical_history,exercise_frequency,charges,health_risk_score,...,region_4,coverage_level_2,coverage_level_3,bmi_category_1,bmi_category_2,bmi_category_3,age_group_1,age_group_2,children_category_1,children_category_2
0,46,0,21.45,5,0,1,1,1,20460.307669,44.90,...,0,0,0,1,0,0,0,1,0,1
1,25,1,25.38,2,0,1,2,2,20390.899218,53.76,...,0,0,0,0,1,0,0,0,1,0
2,38,0,44.88,2,0,2,2,2,20204.476302,93.76,...,0,0,0,0,0,1,1,0,1,0
3,25,0,19.89,0,1,2,3,3,11789.029843,44.78,...,0,1,0,1,0,0,0,0,0,0
4,49,0,38.21,3,0,1,2,3,19268.309838,79.42,...,0,1,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,59,0,46.67,2,1,3,1,4,11584.134900,97.34,...,1,0,1,0,0,1,0,1,1,0
999996,33,0,36.83,2,1,2,2,4,9834.871456,77.66,...,1,0,1,0,0,1,1,0,1,0
999997,39,0,39.84,0,0,4,2,3,22076.632856,85.68,...,1,1,0,0,0,1,1,0,0,0
999998,37,1,45.06,4,0,3,3,2,20297.618728,96.12,...,1,0,0,0,0,1,1,0,0,1


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming df is your encoded DataFrame
# Separate features and target
X = df_encoded.drop('charges', axis=1)
y = df_encoded['charges']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training, testing, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Save the validation set to a CSV file
validation_set = pd.DataFrame(X_val, columns=X.columns)
validation_set['charges'] = y_val.reset_index(drop=True)
validation_set.to_csv('insurance_data_validation_set.csv', index=False)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Initialize the Linear Regression model
lr_model = LinearRegression()

# Fit the model on the training data
lr_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lr_model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print the performance metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error (MAE): {mae}")

Mean Squared Error (MSE): 2806212.977554143
R-squared: 0.8555238830816657
Mean Absolute Error (MAE): 1507.1298424909164


In [17]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test)

# Calculate performance metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

In [18]:
import joblib
# Print the performance metrics
print(f"Random Forest Regressor - Mean Squared Error (MSE): {mse_rf}")
print(f"Random Forest Regressor - R-squared: {r2_rf}")
print(f"Random Forest Regressor - Mean Absolute Error (MAE): {mae_rf}")

# Save the Random Forest model to a file
joblib.dump(rf_model, 'random_forest_model.joblib')

Random Forest Regressor - Mean Squared Error (MSE): 133051.82136880324
Random Forest Regressor - R-squared: 0.9931499103403653
Random Forest Regressor - Mean Absolute Error (MAE): 299.5975830090038


['random_forest_model.joblib']