<a href="https://colab.research.google.com/github/savinthie/Final_Year_Project_IDP_2024-2025/blob/main/Model_2_CNN_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Import necessary libraries
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Flatten, Dense, ReLU, Concatenate, MaxPooling1D,Dropout
from tensorflow.keras.optimizers import Adam
import joblib
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping


In [14]:
# Mount Google Drive so the dataset stored there can be read from Colab.
from google.colab import drive
drive.mount('/content/drive')

# header=1: column names are taken from the file's second row.
# Missing cells are replaced with 0 straight after loading.
DATASET_PATH = '/content/drive/MyDrive/FYP 2024 25/USDataset.csv'
df = pd.read_csv(DATASET_PATH, header=1).fillna(0)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Data Preprocessing

# Normalise the headers: lower-case, with spaces and dots removed.
df.columns = [name.lower().replace(' ', '').replace('.', '') for name in df.columns]

# Identifier columns, the expense categories, and the income / county-count fields.
cols_to_keep = [
    'stateabv', 'county', 'family',
    'housing', 'food', 'transportation', 'healthcare',
    'othernecessities', 'childcare', 'taxes', 'total',
    'median_family_income', 'num_counties_in_st',
]
df1 = df[cols_to_keep].copy()

# Strip thousands separators, cast to float, then divide by 12
# (presumably converting an annual figure to a monthly one — TODO confirm).
df1['median_family_income'] = (
    df1['median_family_income'].replace(',', '', regex=True).astype(float) / 12
)

In [16]:
# Feature Engineering

# `family` is read positionally: character 0 is the parent count and
# character 2 the child count (assumes values shaped like "2p1c" — TODO confirm).
df1['n_parents'] = df1['family'].str[0:1].astype(int)
df1['n_children'] = df1['family'].str[2:3].astype(int)
df1['n_members'] = df1['n_parents'].add(df1['n_children'])

# Income relative to total expenses.
df1['financial_stability'] = df1['median_family_income'].div(df1['total'])

# Total cost shared equally across members, then attributed to children / parents.
df1["per_member_cost"] = df1["total"].div(df1["n_members"])
df1["child_expense_cost"] = df1["per_member_cost"].mul(df1["n_children"])
df1["parent_expense_cost"] = df1["per_member_cost"].mul(df1["n_parents"])

# NOTE(review): because n_members = n_parents + n_children, the child and parent
# shares add back up to `total`, so this remainder is ~0 apart from rounding.
df1["other_expense_cost"] = df1["total"].sub(
    df1["child_expense_cost"].add(df1["parent_expense_cost"])
)

# Despite the name, this flag is 1 when the family has at least one child
# and 0 when it has none.
df1["zero_childcare_cost"] = df1['n_children'].lt(1).map({True: 0, False: 1})

In [17]:
# Splitting the data

# Model inputs: six base columns followed by the five engineered ones.
# The column order matters: later cells address features by position.
feature_cols = [
    'total', 'median_family_income', 'num_counties_in_st',
    'n_children', 'n_parents', 'n_members',
    'per_member_cost', 'child_expense_cost', 'parent_expense_cost',
    'other_expense_cost', 'zero_childcare_cost',
]
target_col_list = ['housing', 'food', 'transportation', 'healthcare', 'othernecessities', 'childcare', 'taxes']

X = df1[feature_cols].values
y_expenses = df1[target_col_list].values

# Scaling
# NOTE(review): both scalers are fitted on the full dataset here, i.e. before
# any fold split happens later — confirm this is intended.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()
y_exp_scaled = scaler_y.fit_transform(y_expenses)

# To reuse previously saved scalers instead of fitting new ones:
# scaler_X = joblib.load('scaler_X.pkl')
# scaler_y = joblib.load('scaler_y.pkl')

# Persist the fitted scalers so inference can apply the same transforms.
joblib.dump(scaler_X, 'scaler_X.pkl')
joblib.dump(scaler_y, 'scaler_y.pkl')

['scaler_y.pkl']

In [18]:
def replace_zeros_with_ones(a, b):
    """Return copies of ``a`` and ``b`` with jointly-zero positions set to 1.

    A position is changed only when BOTH inputs are 0 there; positions where
    just one of them is 0 are left alone.

    The inputs are never modified. (The previous implementation assigned into
    ``a`` and ``b`` directly, so passing NumPy views such as ``arr[:, i]``
    silently rewrote the caller's data.) Inputs of any dimensionality are
    supported as long as both have the same shape.

    Parameters:
    a (array-like): First sequence of values.
    b (array-like): Second sequence of values, same shape as ``a``.

    Returns:
    tuple: ``(a_fixed, b_fixed)``. Each is a list when the corresponding
    input was a list, otherwise a NumPy array.

    Raises:
    ValueError: If ``a`` and ``b`` do not have the same shape.
    """
    # np.array() copies its argument, which is what protects the caller's data.
    a_fixed = np.array(a)
    b_fixed = np.array(b)
    if a_fixed.shape != b_fixed.shape:
        raise ValueError(
            f"a and b must have the same shape, got {a_fixed.shape} and {b_fixed.shape}"
        )

    both_zero = (a_fixed == 0) & (b_fixed == 0)
    a_fixed[both_zero] = 1
    b_fixed[both_zero] = 1

    # Hand lists back as lists so list-based callers see the type they passed in.
    if isinstance(a, list):
        a_fixed = a_fixed.tolist()
    if isinstance(b, list):
        b_fixed = b_fixed.tolist()
    return a_fixed, b_fixed

In [19]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    The result is expressed in percent (a value of 10.0 means 10 %).

    Each error is divided by the matching ``y_true`` entry, so any actual
    value of 0 produces a division by zero (inf or nan in the result).

    Parameters:
    y_true (array-like): Actual values.
    y_pred (array-like): Predicted values.

    Returns:
    float: MAPE value, in percent.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100

def calculateSmape(y_true, y_pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE).

    Positions where both the actual and the predicted value are 0 are counted
    as a perfect match: both are replaced by 1, so the term contributes 0 to
    the mean instead of evaluating 0/0.

    The inputs are copied before that replacement. Without the copy, the
    in-place replacement would write through to the caller's data, e.g. when
    a column view such as ``arr[:, i]`` is passed in.

    Parameters:
    y_true (array-like): Actual values.
    y_pred (array-like): Predicted values.

    Returns:
    float: SMAPE value, in percent.
    """
    # np.array() copies, so everything below operates on private data.
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    y_true, y_pred = replace_zeros_with_ones(y_true, y_pred)
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    smape_value = np.mean(numerator / denominator) * 100
    return smape_value

In [20]:
# Cross-Validation Setup
# Five shuffled folds; the fixed random_state keeps the fold assignment
# the same from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=100)

# Metrics
def print_metrics(y_true, y_pred, task_name):
    """Print the MSE and R2 of ``y_pred`` against ``y_true`` on one line.

    Args:
        y_true: Actual values.
        y_pred: Predicted values.
        task_name: Label printed in front of the two scores.
    """
    mse, r2 = (metric(y_true, y_pred) for metric in (mean_squared_error, r2_score))
    print(f"{task_name} - MSE: {mse:.4f}, R2: {r2:.4f}")

def regr_report(x, y):
    """Build a one-line text summary of regression metrics.

    MAE, R-squared, RMSE and MSE are rounded to 4 decimals; SMAPE is reported
    exactly as returned by ``calculateSmape``.

    Args:
        x: Actual values.
        y: Predicted values.

    Returns:
        str: The formatted metrics line.
    """
    mae = round(mean_absolute_error(x, y), 4)
    r2 = round(r2_score(x, y), 4)

    # MSE is computed once and reused for the RMSE.
    raw_mse = mean_squared_error(x, y)
    mse = round(raw_mse, 4)
    rmse = round(np.sqrt(raw_mse), 4)

    # SMAPE is evaluated after the other metrics, as in the original ordering.
    smape_score = calculateSmape(x, y)
    return f'MAE: {mae}, R-Squared: {r2}, RMSE: {rmse}, MSE: {mse} ,smape: {smape_score}'

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import xgboost as xgb

num_children_col = 3  # 0-based position of 'n_children' in the feature matrix X (its 4th column)
childcare_exp_col = target_col_list.index('childcare')  # position of the 'childcare' target within target_col_list

In [22]:
from tensorflow.keras.layers import Input, Conv1D, Dropout, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# L2 regularization strength (you can tune this value)
# NOTE(review): create_hybrid_model in this cell does not reference this value,
# so no layer is regularized by it as written.
l2_strength = 0.01

# Hybrid Model Definition
def create_hybrid_model(input_shape, output_shape):
    """Build the multi-output Conv1D + MLP regression model.

    Architecture: one Conv1D layer (8 filters, kernel size 3, 'same' padding,
    ReLU), flattened, followed by Dense(128) and Dense(64) with ReLU, then one
    linear Dense(1) head per entry of the module-level ``target_col_list``.

    Args:
        input_shape: Shape handed to ``Input`` (batch dimension excluded).
        output_shape: Accepted for interface compatibility but not read; the
            number of heads always equals ``len(target_col_list)``.

    Returns:
        An uncompiled Keras ``Model`` whose outputs are named
        ``target_<column>`` in ``target_col_list`` order.
    """
    inputs = Input(shape=input_shape)

    # CNN part
    conv_features = Conv1D(filters=8, kernel_size=3, padding='same', activation='relu')(inputs)
    features = Flatten()(conv_features)

    # MLP part
    for units in (128, 64):
        features = Dense(units, activation='relu')(features)

    # Multiple regression outputs: one single-unit linear head per target
    heads = []
    for col in target_col_list:
        heads.append(Dense(1, activation='linear', name=f'target_{col}')(features))

    return Model(inputs=inputs, outputs=heads)

In [24]:
# NOTE: a `global` statement at module level has no effect and creates nothing.
# These names only come into existence once cross_val_with_evaluation assigns them.
global X_train_global, X_test_global, X_val_global
global y_train_global, y_test_global, y_val_global

In [25]:
# Cross-Validation with Evaluation
def cross_val_with_evaluation(X_scaled, y_exp_scaled, kf, num_epochs=200):
    """Train and evaluate a fresh hybrid model on every fold produced by ``kf``.

    Per fold:
      1. The held-out indices are split 50/50 into a test and a validation part.
      2. A new model is built, compiled with Adam (lr 0.001) and MSE loss, and
         trained with early stopping on ``val_loss`` (patience 10, best weights
         restored).
      3. Predictions for train / validation / test are mapped back to the
         original scale with the module-level ``scaler_y``.
      4. The childcare prediction is forced to 0 for rows whose scaled
         ``n_children`` feature equals 0.
      5. Per-target metrics are printed via ``regr_report``.

    Side effects:
        The module-level ``X_*_global`` / ``y_*_global`` variables are
        overwritten with each fold's splits (2-D, before the CNN reshape), so
        after the call they hold the last fold's data.

    Args:
        X_scaled: Scaled feature matrix, shape (n_samples, n_features).
        y_exp_scaled: Scaled target matrix, one column per target.
        kf: Splitter providing ``split`` and ``get_n_splits``.
        num_epochs: Maximum number of training epochs per fold.

    Returns:
        The model trained on the final fold only.
    """
    global X_train_global, X_test_global, X_val_global
    global y_train_global, y_test_global, y_val_global

    def split_per_head(y):
        """Return one 1-D target vector per output head, in column order."""
        return [y[:, i] for i in range(y.shape[1])]

    def predict_in_original_units(fitted_model, X_cnn):
        """Predict, undo the target scaling, then apply the childcare rule."""
        stacked = np.column_stack(fitted_model.predict(X_cnn))
        restored = scaler_y.inverse_transform(stacked)
        # X_cnn carries a trailing channel axis, hence the extra 0 index.
        # NOTE(review): the test is made on the *scaled* feature, so it only
        # catches zero-children rows if the scaler maps 0 children to exactly 0.
        restored[:, childcare_exp_col] = np.where(
            X_cnn[:, num_children_col, 0] == 0, 0, restored[:, childcare_exp_col])
        return restored

    for fold, (train_index, val_index) in enumerate(kf.split(X_scaled), start=1):
        print(f"Fold {fold}/{kf.get_n_splits()}")
        X_train, y_train = X_scaled[train_index], y_exp_scaled[train_index]
        X_holdout, y_holdout = X_scaled[val_index], y_exp_scaled[val_index]

        # Halve the held-out fold into test and validation parts.
        X_test, X_val, y_test, y_val = train_test_split(
            X_holdout, y_holdout, test_size=0.5, random_state=100)

        X_train_global, X_test_global, X_val_global = X_train, X_test, X_val
        y_train_global, y_test_global, y_val_global = y_train, y_test, y_val

        # Conv1D needs a trailing channel axis: (samples, features) -> (samples, features, 1)
        X_train, X_val, X_test = (part[..., np.newaxis] for part in (X_train, X_val, X_test))

        model = create_hybrid_model((X_train.shape[1], 1), y_train.shape[1])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

        model.summary()

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        # Training
        model.fit(
            X_train, split_per_head(y_train),
            validation_data=(X_val, split_per_head(y_val)),
            epochs=num_epochs, batch_size=64, verbose=1, callbacks=[early_stopping])

        # Predictions (train, validation, test — in that order), original scale
        predicted_sets = [
            ('Train Data:', y_train, predict_in_original_units(model, X_train)),
            ('Validation Data:', y_val, predict_in_original_units(model, X_val)),
            ('Test Data:', y_test, predict_in_original_units(model, X_test)),
        ]
        # Bring the actual values back to the original scale as well.
        evaluation_sets = [
            (label, scaler_y.inverse_transform(actual_scaled), predicted)
            for label, actual_scaled, predicted in predicted_sets
        ]

        # Metrics for each target
        for i, target in enumerate(target_col_list):
            print(f'Model Results for {target.capitalize()}:')
            for label, actual, predicted in evaluation_sets:
                print(label, regr_report(actual[:, i], predicted[:, i]))
            print()

    return model

# Run the cross-validation; the value kept here is the model returned after the
# loop, i.e. the one fitted on the last fold.
model_combine = cross_val_with_evaluation(X_scaled, y_exp_scaled,kf)



Fold 1/5


Epoch 1/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 0.0629 - target_childcare_loss: 0.0096 - target_food_loss: 0.0058 - target_healthcare_loss: 0.0123 - target_housing_loss: 0.0138 - target_othernecessities_loss: 0.0081 - target_taxes_loss: 0.0049 - target_transportation_loss: 0.0085 - val_loss: 0.0137 - val_target_childcare_loss: 0.0020 - val_target_food_loss: 5.4704e-04 - val_target_healthcare_loss: 0.0048 - val_target_housing_loss: 0.0013 - val_target_othernecessities_loss: 9.0892e-04 - val_target_taxes_loss: 5.2901e-04 - val_target_transportation_loss: 0.0036
Epoch 2/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0138 - target_childcare_loss: 0.0020 - target_food_loss: 6.1131e-04 - target_healthcare_loss: 0.0050 - target_housing_loss: 0.0013 - target_othernecessities_loss: 9.0391e-04 - target_taxes_loss: 5.1930e-04 - target_transportation_loss: 0.0034 - val_loss: 0.0131 - val_target_childcare_loss:

Epoch 1/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 0.0780 - target_childcare_loss: 0.0120 - target_food_loss: 0.0038 - target_healthcare_loss: 0.0172 - target_housing_loss: 0.0074 - target_othernecessities_loss: 0.0134 - target_taxes_loss: 0.0036 - target_transportation_loss: 0.0207 - val_loss: 0.0141 - val_target_childcare_loss: 0.0022 - val_target_food_loss: 5.2098e-04 - val_target_healthcare_loss: 0.0051 - val_target_housing_loss: 0.0014 - val_target_othernecessities_loss: 9.6183e-04 - val_target_taxes_loss: 6.0226e-04 - val_target_transportation_loss: 0.0035
Epoch 2/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0134 - target_childcare_loss: 0.0020 - target_food_loss: 5.8923e-04 - target_healthcare_loss: 0.0048 - target_housing_loss: 0.0013 - target_othernecessities_loss: 9.0143e-04 - target_taxes_loss: 5.0833e-04 - target_transportation_loss: 0.0033 - val_loss: 0.0135 - val_target_childcare_loss:

Epoch 1/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0629 - target_childcare_loss: 0.0105 - target_food_loss: 0.0040 - target_healthcare_loss: 0.0132 - target_housing_loss: 0.0050 - target_othernecessities_loss: 0.0078 - target_taxes_loss: 0.0045 - target_transportation_loss: 0.0179 - val_loss: 0.0146 - val_target_childcare_loss: 0.0022 - val_target_food_loss: 6.5366e-04 - val_target_healthcare_loss: 0.0053 - val_target_housing_loss: 0.0015 - val_target_othernecessities_loss: 9.8589e-04 - val_target_taxes_loss: 4.5209e-04 - val_target_transportation_loss: 0.0036
Epoch 2/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0138 - target_childcare_loss: 0.0021 - target_food_loss: 6.0127e-04 - target_healthcare_loss: 0.0049 - target_housing_loss: 0.0014 - target_othernecessities_loss: 9.2690e-04 - target_taxes_loss: 4.7794e-04 - target_transportation_loss: 0.0034 - val_loss: 0.0145 - val_target_childcare_loss:

Epoch 1/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - loss: 0.0768 - target_childcare_loss: 0.0131 - target_food_loss: 0.0065 - target_healthcare_loss: 0.0152 - target_housing_loss: 0.0057 - target_othernecessities_loss: 0.0063 - target_taxes_loss: 0.0053 - target_transportation_loss: 0.0247 - val_loss: 0.0135 - val_target_childcare_loss: 0.0020 - val_target_food_loss: 5.5072e-04 - val_target_healthcare_loss: 0.0048 - val_target_housing_loss: 0.0013 - val_target_othernecessities_loss: 9.2278e-04 - val_target_taxes_loss: 5.3099e-04 - val_target_transportation_loss: 0.0033
Epoch 2/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0140 - target_childcare_loss: 0.0022 - target_food_loss: 6.4865e-04 - target_healthcare_loss: 0.0049 - target_housing_loss: 0.0014 - target_othernecessities_loss: 9.2600e-04 - target_taxes_loss: 5.0594e-04 - target_transportation_loss: 0.0034 - val_loss: 0.0128 - val_target_childcare_loss:

Epoch 1/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0591 - target_childcare_loss: 0.0170 - target_food_loss: 0.0035 - target_healthcare_loss: 0.0108 - target_housing_loss: 0.0068 - target_othernecessities_loss: 0.0041 - target_taxes_loss: 0.0035 - target_transportation_loss: 0.0135 - val_loss: 0.0141 - val_target_childcare_loss: 0.0021 - val_target_food_loss: 8.1531e-04 - val_target_healthcare_loss: 0.0050 - val_target_housing_loss: 0.0014 - val_target_othernecessities_loss: 8.9179e-04 - val_target_taxes_loss: 4.8319e-04 - val_target_transportation_loss: 0.0034
Epoch 2/200
[1m393/393[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0141 - target_childcare_loss: 0.0021 - target_food_loss: 5.9502e-04 - target_healthcare_loss: 0.0051 - target_housing_loss: 0.0014 - target_othernecessities_loss: 8.9081e-04 - target_taxes_loss: 5.1133e-04 - target_transportation_loss: 0.0035 - val_loss: 0.0131 - val_target_childcare_loss:

      feature1  feature2  feature3  feature4  feature1  feature2  feature3  \
0     0.075586  0.307823  0.470356      0.00       1.0       0.2  0.132114   
1     0.363579  0.418043  0.387352      1.00       1.0       1.0  0.039623   
2     0.097141  0.305326  1.000000      0.00       1.0       0.2  0.165041   
3     0.152030  0.337261  0.320158      0.25       1.0       0.4  0.090396   
4     0.331847  0.241743  0.300395      0.75       1.0       0.8  0.073475   
...        ...       ...       ...       ...       ...       ...       ...   
3138  0.290280  0.296488  0.213439      0.50       1.0       0.6  0.116743   
3139  0.338879  0.269664  0.260870      1.00       0.0       0.8  0.077772   
3140  0.215952  0.346029  0.391304      0.50       0.0       0.4  0.155492   
3141  0.304631  0.382407  0.150198      0.50       1.0       0.6  0.127704   
3142  0.189251  0.466541  0.051383      0.00       1.0       0.2  0.305744   

      feature4  feature1  feature2  feature3  
0     0.000000  

In [42]:
import pandas as pd
import numpy as np
import joblib

# Load the scalers
# Same file names as the joblib.dump calls in the scaling cell; this assumes
# the working directory has not changed since they were written.
scaler_X = joblib.load('scaler_X.pkl')
scaler_y = joblib.load('scaler_y.pkl')

def get_prediction(model_combine, input_data, scaler_X, scaler_y, num_children_col, childcare_exp_col):
    """
    Predict the expense breakdown for new input rows.

    The input is copied, enriched by ``createFeatures``, scaled with
    ``scaler_X``, given a trailing channel axis for the Conv1D layer and fed to
    the model. The per-head outputs are stacked into one array, mapped back to
    the original scale with ``scaler_y``, and the childcare column is set to 0
    for rows with no children.

    Args:
        model_combine: The trained Keras model.
        input_data: Pandas DataFrame with the columns 'total',
            'median_family_income', 'num_counties_in_st', 'n_children',
            'n_parents' and 'n_members'. It is not modified.
        scaler_X: Fitted scaler for the features.
        scaler_y: Fitted scaler for the target variables.
        num_children_col: Accepted for interface compatibility but not read;
            the no-children rule uses the 'n_children' column directly.
        childcare_exp_col: Integer column index of the 'childcare' target.

    Returns:
        Numpy Array: Predicted expense values, one row per input row.
    """
    # Feature order fed to the scaler.
    feature_order = [
        'total', 'median_family_income', 'num_counties_in_st', 'n_children',
        'n_parents', 'n_members', 'per_member_cost', 'child_expense_cost',
        'parent_expense_cost', 'other_expense_cost', 'zero_childcare_cost',
    ]

    # Work on a copy so the caller's frame keeps its original columns.
    enriched = createFeatures(input_data.copy())

    # Scale, then append the channel axis the CNN input layer expects.
    scaled = scaler_X.transform(enriched[feature_order].values)
    cnn_input = scaled[..., np.newaxis]

    # The model returns one array per output head; stack them column-wise.
    head_outputs = model_combine.predict(cnn_input)
    predictions = scaler_y.inverse_transform(np.column_stack(head_outputs))

    # No children -> childcare cost is 0 (checked on the unscaled child count).
    has_no_children = enriched['n_children'].values == 0
    predictions[:, childcare_exp_col] = np.where(
        has_no_children, 0, predictions[:, childcare_exp_col])

    return predictions

def createFeatures(input_data):
    """
    Add the engineered cost columns the model expects.

    The frame is modified in place and also returned. Columns added, in order:
    'per_member_cost', 'child_expense_cost', 'parent_expense_cost',
    'other_expense_cost' and 'zero_childcare_cost'.

    Args:
        input_data: Pandas DataFrame containing 'total', 'n_members',
            'n_children' and 'n_parents'.

    Returns:
        input_data: The same DataFrame object, with the extra columns.
    """
    def _children_flag(n_children):
        # Despite the column name: 0 when there are no children, 1 otherwise.
        return 0 if n_children < 1 else 1

    # Total cost shared equally across members, then attributed by role.
    per_member = input_data["total"] / input_data["n_members"]
    child_share = per_member * input_data["n_children"]
    parent_share = per_member * input_data["n_parents"]

    input_data["per_member_cost"] = per_member
    input_data["child_expense_cost"] = child_share
    input_data["parent_expense_cost"] = parent_share
    # Whatever part of the total is not attributed to children or parents.
    input_data["other_expense_cost"] = input_data["total"] - (child_share + parent_share)
    input_data["zero_childcare_cost"] = input_data["n_children"].map(_children_flag)

    return input_data

# Example of input data to test predictions: a single household with two
# parents and no children.
example_columns = ['total', 'median_family_income', 'num_counties_in_st', 'n_children', 'n_parents', 'n_members']
input_data = pd.DataFrame([[4541, 8790.0, 67, 0, 2, 2]], columns=example_columns)

# Assuming `model_combine` is your trained model
input_data_pred_original = get_prediction(
    model_combine,
    input_data,
    scaler_X,
    scaler_y,
    num_children_col=3,
    childcare_exp_col=5,
)

# Print predictions, one line per expense category, for the first (only) row
target_col_list = ['housing', 'food', 'transportation', 'healthcare', 'othernecessities', 'childcare', 'taxes']
for target_name, prediction in zip(target_col_list, input_data_pred_original[0]):
    print(f"{target_name}: {prediction:.2f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
housing: 681.63
food: 610.70
transportation: 1352.26
healthcare: 900.27
othernecessities: 464.47
childcare: 0.00
taxes: 579.26
