<a href="https://colab.research.google.com/github/savinthie/Final_Year_Project_IDP_2024-2025/blob/main/Model_2_CNN_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Flatten, Dense, ReLU, Concatenate, MaxPooling1D,Dropout
from tensorflow.keras.optimizers import Adam
import joblib
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping


In [2]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/FYP 2024 25/USDataset.csv', header=1)
df = df.fillna(0)

Mounted at /content/drive


In [3]:
# Data Preprocessing

df.columns = [col.lower().replace(' ', '').replace('.', '') for col in df.columns]
cols_to_keep = ['stateabv', 'county', 'family', 'housing', 'food', 'transportation',
                'healthcare', 'othernecessities', 'childcare', 'taxes', 'total',
                'median_family_income', 'num_counties_in_st']
df1 = df[cols_to_keep].copy()

df1['median_family_income'] = df1['median_family_income'].replace(',', '', regex=True).astype(float)
df1['median_family_income'] = df1['median_family_income'].map(lambda x: x/12)

In [4]:
# Feature Engineering
df1['n_parents'] = df1['family'].str.slice(0, 1).astype(int)
df1['n_children'] = df1['family'].str.slice(2, 3).astype(int)
df1['n_members'] = df1['n_parents'] + df1['n_children']
df1['financial_stability'] = df1['median_family_income'] / df1['total']
df1["per_member_cost"] = df1["total"] / df1["n_members"]
df1["child_expense_cost"] = df1["per_member_cost"]*df1["n_children"]
df1["parent_expense_cost"] = df1["per_member_cost"]*df1["n_parents"]
df1["other_expense_cost"] = df1["total"] - (df1["child_expense_cost"]+df1["parent_expense_cost"])

df1["zero_childcare_cost"] = df1['n_children'].map(lambda x: 0 if x < 1 else 1)

In [5]:
# Splitting the data
X = df1[['total', 'median_family_income', 'num_counties_in_st', 'n_children', 'n_parents', 'n_members']+['per_member_cost','child_expense_cost','parent_expense_cost','other_expense_cost','zero_childcare_cost']].values
y_expenses = df1[['housing', 'food', 'transportation', 'healthcare', 'othernecessities', 'childcare', 'taxes']].values

target_col_list = ['housing', 'food', 'transportation', 'healthcare', 'othernecessities', 'childcare', 'taxes']

# Scaling
scaler_X = MinMaxScaler()

# Load the X scaler
# scaler_X = joblib.load('scaler_X.pkl')

X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()

# Load the y scaler
# scaler_y = joblib.load('scaler_y.pkl')
y_exp_scaled = scaler_y.fit_transform(y_expenses)


# Save the X scaler
joblib.dump(scaler_X, 'scaler_X.pkl')

# Save the y scaler
joblib.dump(scaler_y, 'scaler_y.pkl')

['scaler_y.pkl']

In [6]:
def replace_zeros_with_ones(a, b):
    # Iterate through both lists simultaneously
    for i in range(len(a)):
        # If both value is 0, replace both with 1
        if a[i] == 0 and b[i] == 0:
            a[i] = 1
            b[i] = 1
    return a, b

In [7]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def calculateSmape(y_true, y_pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE).

    Parameters:
    y_true (array-like): Actual values.
    y_pred (array-like): Predicted values.

    Returns:
    float: SMAPE value.
    """
    y_true, y_pred = replace_zeros_with_ones(y_true, y_pred)
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    smape_value = np.mean(numerator / denominator) * 100
    return smape_value

In [8]:
# Cross-Validation Setup
kf = KFold(n_splits=5, shuffle=True, random_state=100)

# Metrics
def print_metrics(y_true, y_pred, task_name):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{task_name} - MSE: {mse:.4f}, R2: {r2:.4f}")

def regr_report(x, y):
    mae = round(mean_absolute_error(x, y), 4)
    r2 = round(r2_score(x, y), 4)
    mse = round(mean_squared_error(x, y), 4)
    rmse = round(np.sqrt(mean_squared_error(x, y)), 4)
    smape_score = calculateSmape(x, y)
    return f'MAE: {mae}, R-Squared: {r2}, RMSE: {rmse}, MSE: {mse} ,smape: {smape_score}'

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import xgboost as xgb

num_children_col = 3  # 4th position, 0-indexed
childcare_exp_col = target_col_list.index('childcare')  # Replace 'childcare_exp' with the actual target name

In [10]:
from tensorflow.keras.layers import Input, Conv1D, Dropout, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# L2 regularization strength (you can tune this value)
l2_strength = 0.01

# Hybrid Model Definition
def create_hybrid_model(input_shape, output_shape):
    input_layer = Input(shape=input_shape)
    # CNN part
    x = Conv1D(filters=8, kernel_size=3, padding='same', activation='relu')(input_layer)
    x = Flatten()(x)
    # MLP part
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    # Multiple regression outputs
    output_layers = [Dense(1, activation='linear', name=f'target_{col}')(x) for col in target_col_list]

    model = Model(inputs=input_layer, outputs=output_layers)
    return model

In [11]:
# Cross-Validation with Evaluation
def cross_val_with_evaluation(X_scaled, y_exp_scaled, kf, num_epochs=200):
    fold = 1
    for train_index, val_index in kf.split(X_scaled):
        print(f"Fold {fold}/{kf.get_n_splits()}")
        X_train, X_test_val = X_scaled[train_index], X_scaled[val_index]
        y_train, y_test_val = y_exp_scaled[train_index], y_exp_scaled[val_index]

        X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=100) # 50/50 split of remaining 20%

        # Reshaping data for CNN
        X_train = X_train[..., np.newaxis]
        X_val = X_val[..., np.newaxis]
        X_test = X_test[..., np.newaxis]

        model = create_hybrid_model((X_train.shape[1], 1), y_train.shape[1])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

        model.summary()

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        # Training
        model.fit(X_train, [y_train[:, i] for i in range(y_train.shape[1])],
                  validation_data=(X_val, [y_val[:, i] for i in range(y_val.shape[1])]),
                  epochs=num_epochs, batch_size=64, verbose=1, callbacks= [early_stopping])

        # Predictions
        y_train_pred = np.column_stack(model.predict(X_train))
        y_val_pred = np.column_stack(model.predict(X_val))
        y_test_pred = np.column_stack(model.predict(X_test)) #Prediction on test set

        # Inverse scaling
        y_train_pred_original = scaler_y.inverse_transform(y_train_pred)
        y_train_original = scaler_y.inverse_transform(y_train)
        y_val_pred_original = scaler_y.inverse_transform(y_val_pred)
        y_val_original = scaler_y.inverse_transform(y_val)
        y_test_pred_original = scaler_y.inverse_transform(y_test_pred) #Inverse transform for test set
        y_test_original = scaler_y.inverse_transform(y_test) #Inverse transform for test set

        # Enforce the rule after predictions for childcare expense
        # The issue is likely caused by X_train having an extra dimension due to[..., np.newaxis].
        # We need to select the original features for the condition
        y_train_pred_original[:, childcare_exp_col] = np.where(
            X_train[:, num_children_col, 0] == 0, 0, y_train_pred_original[:, childcare_exp_col]) #Using X_train[:, num_children_col, 0] instead of X_train[:, num_children_col]

        y_val_pred_original[:, childcare_exp_col] = np.where(
            X_val[:, num_children_col, 0] == 0, 0, y_val_pred_original[:, childcare_exp_col]) #Using X_val[:, num_children_col, 0] instead of X_val[:, num_children_col]

        y_test_pred_original[:, childcare_exp_col] = np.where(
            X_test[:, num_children_col, 0] == 0, 0, y_test_pred_original[:, childcare_exp_col]) #Using X_test[:, num_children_col, 0] instead of X_test[:, num_children_col]

        # Metrics for each target
        for i, target in enumerate(target_col_list):
            print(f'Model Results for {target.capitalize()}:')
            print('Train Data:', regr_report(y_train_original[:, i], y_train_pred_original[:, i]))
            print('Validation Data:', regr_report(y_val_original[:, i], y_val_pred_original[:, i]))
            print('Test Data:', regr_report(y_test_original[:, i], y_test_pred_original[:, i])) #Print metrics for test set
            print()

        fold += 1
    return model

model_combine = cross_val_with_evaluation(X_scaled, y_exp_scaled,kf)



Fold 1/2


Epoch 1/200
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 0.0585 - target_childcare_loss: 0.0085 - target_food_loss: 0.0030 - target_healthcare_loss: 0.0155 - target_housing_loss: 0.0048 - target_othernecessities_loss: 0.0159 - target_taxes_loss: 0.0038 - target_transportation_loss: 0.0069 - val_loss: 0.0146 - val_target_childcare_loss: 0.0022 - val_target_food_loss: 6.7011e-04 - val_target_healthcare_loss: 0.0053 - val_target_housing_loss: 0.0014 - val_target_othernecessities_loss: 9.7052e-04 - val_target_taxes_loss: 5.6787e-04 - val_target_transportation_loss: 0.0035
Epoch 2/200
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 0.0135 - target_childcare_loss: 0.0021 - target_food_loss: 6.3433e-04 - target_healthcare_loss: 0.0048 - target_housing_loss: 0.0013 - target_othernecessities_loss: 8.8248e-04 - target_taxes_loss: 5.8039e-04 - target_transportation_loss: 0.0032 - val_loss: 0.0140 - val_target_childcare_lo

Epoch 1/200
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 0.0729 - target_childcare_loss: 0.0108 - target_food_loss: 0.0057 - target_healthcare_loss: 0.0154 - target_housing_loss: 0.0049 - target_othernecessities_loss: 0.0114 - target_taxes_loss: 0.0042 - target_transportation_loss: 0.0205 - val_loss: 0.0142 - val_target_childcare_loss: 0.0021 - val_target_food_loss: 5.8006e-04 - val_target_healthcare_loss: 0.0050 - val_target_housing_loss: 0.0015 - val_target_othernecessities_loss: 9.3677e-04 - val_target_taxes_loss: 7.7950e-04 - val_target_transportation_loss: 0.0032
Epoch 2/200
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0143 - target_childcare_loss: 0.0021 - target_food_loss: 5.7723e-04 - target_healthcare_loss: 0.0051 - target_housing_loss: 0.0014 - target_othernecessities_loss: 9.0864e-04 - target_taxes_loss: 6.0912e-04 - target_transportation_loss: 0.0035 - val_loss: 0.0132 - val_target_childcare_loss:

In [12]:
import pandas as pd
import numpy as np
import joblib

# Load the scalers
scaler_X = joblib.load('scaler_X.pkl')
scaler_y = joblib.load('scaler_y.pkl')

def get_prediction(model, input_data):
    # Step 1: Preprocess input data (same as training)
    input_data["per_member_cost"] = input_data["total"] / input_data["n_members"]
    input_data["child_expense_cost"] = input_data["per_member_cost"] * input_data["n_children"]
    input_data["parent_expense_cost"] = input_data["per_member_cost"] * input_data["n_parents"]
    input_data["other_expense_cost"] = input_data["total"] - (
        input_data["child_expense_cost"] + input_data["parent_expense_cost"]
    )

    # Convert input data to numpy array
    input_data_val = input_data[['total', 'median_family_income', 'num_counties_in_st', 'n_children', 'n_parents',
                                 'n_members', 'per_member_cost', 'child_expense_cost', 'parent_expense_cost', 'other_expense_cost']].values

    # Step 2: Scale the input data using the pre-fitted scaler_X
    input_data_val_scaled = scaler_X.transform(input_data_val)

    # Ensure model expects input in the right shape (if using CNN or RNN, it might need reshaping)
    input_data_val_scaled = input_data_val_scaled[..., np.newaxis]  # If needed

    # Step 3: Make predictions using the trained model
    raw_predictions = model.predict(input_data_val_scaled)

    # Step 3.5: Reshape raw_predictions before inverse scaling
    raw_predictions = np.squeeze(raw_predictions, axis=-1)  # Remove the extra dimension
    raw_predictions = np.column_stack(raw_predictions) # Reshape to 2D

    # Step 4: Inverse transform the predictions to return them to the original scale
    input_data_pred_original = scaler_y.inverse_transform(raw_predictions)

    return input_data_pred_original

# Example of input data to test predictions
input_data = pd.DataFrame(
    [[3803, 87000, 67, 0, 1, 1]],
    columns=['total', 'median_family_income', 'num_counties_in_st', 'n_children', 'n_parents', 'n_members']
)

input_data_pred_original = get_prediction(model, input_data)

# Print predictions
target_col_list = ['housing', 'food', 'transportation', 'healthcare', 'othernecessities', 'childcare', 'taxes']
for i, prediction in enumerate(input_data_pred_original[0]):
    print(f"{target_col_list[i]}: {prediction:.2f}")


NameError: name 'model' is not defined