In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Read the raw table from disk
df = pd.read_csv('dataset.csv')

# Encode the 'state' column as integer class codes.
# NOTE: this overwrites df['state'] in place; later cells read the encoded column.
label_encoder = LabelEncoder()
df['state'] = label_encoder.fit_transform(df['state'])

# Feature matrix 'data': columns 'sex', 'age', 'state', 'risk_profile'
feature_columns = ['sex', 'age', 'state', 'risk_profile']
data = df[feature_columns].values

# Target matrix 'targets': columns 'a1' through 'a20'
target_columns = [f'a{i}' for i in range(1, 21)]
targets = df[target_columns].values

# Ordered (unshuffled) hold-out split: the first 70% of rows train the model,
# the remaining 30% are kept for testing.
split_ratio = 0.7
split_index = int(len(data) * split_ratio)

X_train, y_train = data[:split_index], targets[:split_index]
X_test, y_test = data[split_index:], targets[split_index:]

In [2]:
# Pipeline: standardise the features, then fit a Random Forest regressor.
# The forest is seeded so that cross-validation scores, the selected
# hyperparameters and all downstream predictions are reproducible across
# kernel restarts (an unseeded forest gives different results on every run).
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42))
])

# Hyperparameter grid explored by GridSearchCV.
# 3 * 4 * 3 * 3 = 108 candidates, each evaluated with 5-fold CV (540 fits).
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# Exhaustive search scored by negative mean squared error; n_jobs=-1 runs the
# fits in parallel.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit every candidate on the training split and keep the best one
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'rf__max_depth': 10, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 10, 'rf__n_estimators': 200}


In [3]:
# Predict the targets for the held-out rows with the fitted grid search
y_pred = grid_search.predict(X_test)

# Root mean squared error between the true and predicted targets
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 0.07398165411189633


In [4]:
# Sum each row of y_test, then report the mean of those row totals
sums = [sum(row) for row in y_test]
print("y_test average =", sum(sums)/len(sums))

y_test average = 1.0000227272727267


In [5]:
# Sum each row of y_pred, then report the mean of those row totals
sums = [sum(row) for row in y_pred]
print("y_pred average =", sum(sums)/len(sums))

y_pred average = 1.0000452328332186


In [6]:
# Remove values < 0.01 and scale predictions so that each row sums to 1 (100%).
#
# Implemented with vectorised numpy operations instead of nested Python loops.
# A row whose predictions all fall below the threshold has a zero total; such a
# row is left as all zeros instead of being divided by zero (which would
# produce NaNs and a runtime warning).
min_share = 0.01  # predictions strictly below this value are zeroed out

y_pred = np.asarray(y_pred, dtype=float)
num_rows, num_cols = y_pred.shape

# Step 1: drop the small predictions
y_pred = np.where(y_pred < min_share, 0.0, y_pred)

# Step 2: divide every row by its own total (kept as a column for broadcasting)
row_sums = y_pred.sum(axis=1, keepdims=True)
y_pred = np.divide(y_pred, row_sums, out=np.zeros_like(y_pred), where=row_sums > 0)

In [7]:
# Sum each row of the rescaled y_pred, then report the mean of those row totals
sums = [sum(row) for row in y_pred]
print("y_pred average =", sum(sums)/len(sums))

y_pred average = 1.0


In [8]:
# Feature columns as currently stored in df, split at the same index as the
# train/test split so the test rows line up with y_pred.
data_for_csv = df[['sex', 'age', 'state', 'risk_profile']].values
data_train = data_for_csv[:split_index]
data_test = data_for_csv[split_index:]

# Left side: test-row features; right side: predictions rounded to 3 decimals
array1 = data_test
array2 = np.round(np.array(y_pred), 3)

# Place features and predictions side by side, one output row per test row
result_array = np.concatenate([array1, array2], axis=1)

# Show plain decimals rather than scientific notation
np.set_printoptions(suppress=True)
print(result_array)

# Write the combined table as comma-separated values with 3 decimals, no header
np.savetxt('HP_RF_output.csv', result_array, fmt='%.3f', delimiter=",")

[[ 1.    41.    33.    ...  0.     0.     0.   ]
 [ 1.    69.    14.    ...  0.     0.     0.   ]
 [ 1.    46.    28.    ...  0.     0.     0.   ]
 ...
 [ 1.    64.    24.    ...  0.     0.     0.   ]
 [ 1.    57.    11.    ...  0.     0.     0.012]
 [ 0.    51.    14.    ...  0.     0.     0.   ]]
