In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import math

# Read the client table from the CSV file in the working directory.
df = pd.read_csv('dataset.csv')

# Learn the set of distinct 'state' labels, then overwrite the column with
# their integer codes so it can be used as a numeric feature.
label_encoder = LabelEncoder()
label_encoder.fit(df['state'])
df['state'] = label_encoder.transform(df['state'])

# Show the last rows as a quick check of the loaded and encoded frame.
df.tail()

Unnamed: 0,client_id,sex,age,state,risk_profile,invest_goals,temp_goals,fin_exp,fin_avail,a1,...,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20
1166,5146755,1,69,25,4,14,17,10,19,0.167,...,0.0,0.095,0.0,0.02,0.0,0.0,0.181,0.0,0.0,0.0
1167,5738347,1,56,11,3,13,17,11,19,0.034,...,0.0,0.065,0.0,0.068,0.0,0.0,0.0,0.065,0.049,0.0
1168,8516725,1,64,24,4,13,17,11,19,0.178,...,0.0,0.0,0.0,0.055,0.0,0.0,0.023,0.0,0.0,0.0
1169,14240215,1,57,11,3,13,17,10,19,0.009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1170,14326270,0,51,14,3,13,17,11,19,0.184,...,0.0,0.0,0.0,0.006,0.0,0.0,0.018,0.0,0.0,0.0


In [2]:
# 'data' contains columns 'sex', 'age', 'state', 'risk_profile'
data = df[['sex', 'age', 'state', 'risk_profile']].values

# 'targets' contains columns 'a1' through 'a20'
targets = df[[f'a{i}' for i in range(1, 21)]].values

# Sequential (unshuffled) split: the leading rows train the model, the trailing rows test it.
split_ratio = 0.7  # 70% for training, 30% for testing
split_index = int(len(data) * split_ratio)

# Normalize the features to the (0, 1) range.
# The scaler is fitted on the training rows ONLY, so no min/max information from
# the test rows leaks into training. Test values that fall outside the range seen
# in training can therefore end up slightly below 0 or above 1.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data[:split_index])
data = scaler.transform(data)

X_train, X_test = data[:split_index], data[split_index:]
y_train, y_test = targets[:split_index], targets[split_index:]

print('X_train shape =', X_train.shape)
print('y_train shape =', y_train.shape)
print('X_test shape =', X_test.shape)
print('y_test shape =', y_test.shape)

X_train shape = (819, 4)
y_train shape = (819, 20)
X_test shape = (352, 4)
y_test shape = (352, 20)


In [3]:
# Build the Multiple Linear Regression model and fit it on the training split
# (fit() returns the estimator itself, so construction and training can be chained).
model = LinearRegression().fit(X_train, y_train)

# Pull out the learned parameters for inspection.
coefficients, intercept = model.coef_, model.intercept_

# Each heading is printed on its own line, followed by the corresponding array.
print("Model Coefficients (Weights):", coefficients, sep="\n")
print("\nModel Intercept (Bias):", intercept, sep="\n")

Model Coefficients (Weights):
[[ 3.84071417e-03 -1.15448903e-01  1.52802875e-02 -3.02925663e-01]
 [ 8.77513255e-04  2.98555218e-03  3.46316070e-03  3.79709380e-03]
 [-1.87962844e-02  4.40176624e-02 -4.94545209e-03 -8.19479449e-02]
 [-4.68489973e-03  4.69298281e-02  1.55984283e-03 -8.42631000e-03]
 [ 1.22953422e-03  1.33333207e-02  2.10457450e-03  4.89399375e-03]
 [ 2.62983913e-04 -2.27828258e-03 -9.85546699e-03  1.73326559e-02]
 [ 1.39341201e-03  3.51550034e-03 -5.28346729e-03  2.05382279e-02]
 [ 4.62044823e-03  1.74563507e-02  2.91920640e-03  3.33045370e-02]
 [ 8.13169204e-03  1.90538593e-02  7.72357589e-03  1.31189628e-01]
 [ 2.38897931e-03 -8.03831021e-03  3.13651955e-03  1.04340135e-01]
 [ 6.61217664e-05  6.55895962e-04 -9.09855694e-04  2.06489543e-03]
 [ 4.18140528e-03 -1.63306164e-02 -4.52739988e-03  6.44111757e-02]
 [ 7.16425336e-04  4.45403281e-04 -2.35520169e-03  1.17123861e-02]
 [ 1.42441410e-03 -7.30176248e-03  3.27143234e-03  1.82585864e-02]
 [ 1.25370113e-03  7.70615135e-0

In [4]:
# Run the fitted model on the held-out feature rows.
y_pred = model.predict(X_test)

# Sanity check: the prediction matrix must have the same shape as the ground truth.
print('y_test shape =', y_test.shape)
print('y_pred shape =', y_pred.shape)

y_test shape = (352, 20)
y_pred shape = (352, 20)


In [5]:
# Total of the target values in every test row, computed in one vectorised call
# instead of a Python-level loop; `sums` is a 1-D array with one entry per row.
sums = y_test.sum(axis=1)

# Mean of those per-row totals across the whole test set.
print("y_test average =", sums.mean())

y_test average = 1.0000227272727267


In [6]:
# Total of the predicted values in every test row, computed in one vectorised call
# instead of a Python-level loop; `sums` is a 1-D array with one entry per row.
sums = y_pred.sum(axis=1)

# Mean of those per-row totals across the whole test set.
print("y_pred average =", sums.mean())

y_pred average = 1.0000360992149215


In [7]:
# Remove values < 0.01 and Scale predictions to get 1 (100%) as a sum of each row
def _drop_small_and_renormalize(pred, min_value=0.01):
    """Zero out entries below `min_value`, then rescale each row to sum to 1.

    Parameters
    ----------
    pred : array-like of shape (n_rows, n_cols)
        Raw predictions; the input is not modified.
    min_value : float, default 0.01
        Entries strictly below this threshold are replaced by 0.

    Returns
    -------
    numpy.ndarray
        New float array of the same shape. A row in which every entry fell
        below the threshold has nothing to rescale and is returned as all
        zeros instead of triggering a division by zero.
    """
    pred = np.asarray(pred, dtype=float)
    kept = np.where(pred < min_value, 0.0, pred)
    row_sums = kept.sum(axis=1, keepdims=True)
    # Divide only where the row total is positive; other rows keep the zeros from `out`.
    return np.divide(kept, row_sums, out=np.zeros_like(kept), where=row_sums > 0)


# Recompute the raw predictions from the model rather than re-processing `y_pred`
# in place: re-running this cell then always yields the same result.
y_pred = _drop_small_and_renormalize(model.predict(X_test))

num_rows, num_cols = y_pred.shape

In [8]:
# Total of the (thresholded and rescaled) predictions in every test row, computed
# in one vectorised call instead of a Python-level loop; `sums` is a 1-D array.
sums = y_pred.sum(axis=1)

# Mean of those per-row totals across the whole test set.
print("y_pred average =", sums.mean())

y_pred average = 1.0


In [9]:
# Unscaled feature columns taken straight from the frame
# ('state' holds the integer codes written by the label encoder).
data_for_csv = df[['sex', 'age', 'state', 'risk_profile']].to_numpy()

# Same sequential cut as the model's train/test split, so the test rows line up with y_pred.
data_train = data_for_csv[:split_index]
data_test = data_for_csv[split_index:]

# Left part: raw test features; right part: predictions rounded to 3 decimals.
array1 = data_test
array2 = np.asarray(y_pred).round(3)

# Place the two parts side by side: one row per test client.
result_array = np.hstack((array1, array2))

# Print plain decimals instead of scientific notation.
np.set_printoptions(suppress=True)
print(result_array)

# Write the combined table as comma-separated values with 3 decimals per field.
np.savetxt('MLR_output.csv', result_array, delimiter=",", fmt='%.3f')

[[ 1. 41. 33. ...  0.  0.  0.]
 [ 1. 69. 14. ...  0.  0.  0.]
 [ 1. 46. 28. ...  0.  0.  0.]
 ...
 [ 1. 64. 24. ...  0.  0.  0.]
 [ 1. 57. 11. ...  0.  0.  0.]
 [ 0. 51. 14. ...  0.  0.  0.]]
