In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from scikeras.wrappers import KerasRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.metrics import RootMeanSquaredError

# Load data from a CSV file
df = pd.read_csv('dataset.csv')

# Apply label encoding to the 'state' column
label_encoder = LabelEncoder()
df['state'] = label_encoder.fit_transform(df['state'])

# 'data' contains columns 'sex', 'age', 'state', 'risk_profile'
data = df[['sex', 'age', 'state', 'risk_profile']].values

# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(data)

# 'targets' contains columns 'a1' through 'a20'
targets = df[['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 
              'a11', 'a12', 'a13', 'a14', 'a15', 'a16', 'a17', 'a18', 'a19', 'a20']].values

# Split the data into training and testing sets
split_ratio = 0.7  # 70% for training, 30% for testing
split_index = int(len(data) * split_ratio)

x_train, x_test = data[:split_index], data[split_index:]
y_train, y_test = targets[:split_index], targets[split_index:]

# Reshape the input data to match the expected input shape of the model
x_train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1])
x_test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1])

# Define the model creation function for KerasRegressor
def create_model(epochs=25, batch_size=8):
    model = Sequential()
    model.add(LSTM(units=64, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
    model.add(Dense(units=20))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=[RootMeanSquaredError()])
    return model

# Create KerasRegressor
model = KerasRegressor(model=create_model, verbose=0)

# Define the grid search parameters
param_grid = {
    'epochs': [25, 50, 75, 100], 
    'batch_size': [1, 2, 4, 8, 16]
}

# Create and fit the GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)
grid_result = grid.fit(x_train, y_train)

# Print the best parameters found
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Use the best parameters to create the final model
best_epochs = grid_result.best_params_['epochs']
best_batch_size = grid_result.best_params_['batch_size']
final_model = create_model(epochs=best_epochs, batch_size=best_batch_size)

# Train the final model
final_model.fit(x_train, y_train, epochs=best_epochs, batch_size=best_batch_size, validation_data=(x_test, y_test), verbose=0)

# Generate predictions on the test set
y_pred = final_model.predict(x_test)

# Reshape y_test and y_pred to 2D arrays
y_test = y_test.reshape(y_test.shape[0], -1)
y_pred = y_pred.reshape(y_pred.shape[0], -1)

Best: -0.005167 using {'batch_size': 2, 'epochs': 100}


In [2]:
sums = []
for row in y_pred:
    sums.append(sum(row))
#print('y_pred =', sums)
print("y_pred average =", sum(sums)/len(sums))

y_pred average = 1.0131589519038575


In [3]:
# Remove values < 0.01 and Scale predictions to get 1 (100%) as a sum of each row
num_rows = len(y_pred)
num_cols = len(y_pred[0])

for row in range(num_rows):
    for col in range(num_cols):
        if y_pred[row][col] < 0.01:
            y_pred[row][col] = 0

for row in range(num_rows):
    row_sum = sum(y_pred[row])
    for col in range(num_cols):
        y_pred[row][col] = y_pred[row][col] / row_sum

In [4]:
sums = []
for row in y_pred:
    sums.append(sum(row))
#print('y_pred =', sums)
print("y_pred average =", sum(sums)/len(sums))

y_pred average = 0.9999999984813092


In [5]:
data_for_csv = df[['sex', 'age', 'state', 'risk_profile']].values
data_train, data_test = data_for_csv[:split_index], data_for_csv[split_index:]
array1 = data_test
array2 = np.round(np.array(y_pred), 3)

result_array = np.concatenate((array1, array2), axis=1)

np.set_printoptions(suppress=True)
print(result_array)

np.savetxt('HP_LSTM_output.csv', result_array, delimiter=",", fmt='%.3f')

[[ 1.    41.    33.    ...  0.     0.     0.   ]
 [ 1.    69.    14.    ...  0.     0.013  0.   ]
 [ 1.    46.    28.    ...  0.     0.011  0.   ]
 ...
 [ 1.    64.    24.    ...  0.     0.011  0.   ]
 [ 1.    57.    11.    ...  0.     0.012  0.   ]
 [ 0.    51.    14.    ...  0.     0.012  0.   ]]
