In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import math

# Read the raw client table from disk.
df = pd.read_csv('dataset.csv')

# Replace the 'state' column with integer codes. The fitted encoder is kept in
# `label_encoder` so the learned classes remain available afterwards.
label_encoder = LabelEncoder().fit(df['state'])
df['state'] = label_encoder.transform(df['state'])

# Show the last rows as a quick check of the loaded, encoded frame.
df.tail()

Unnamed: 0,client_id,sex,age,state,risk_profile,invest_goals,temp_goals,fin_exp,fin_avail,a1,...,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20
1166,5146755,1,69,25,4,14,17,10,19,0.167,...,0.0,0.095,0.0,0.02,0.0,0.0,0.181,0.0,0.0,0.0
1167,5738347,1,56,11,3,13,17,11,19,0.034,...,0.0,0.065,0.0,0.068,0.0,0.0,0.0,0.065,0.049,0.0
1168,8516725,1,64,24,4,13,17,11,19,0.178,...,0.0,0.0,0.0,0.055,0.0,0.0,0.023,0.0,0.0,0.0
1169,14240215,1,57,11,3,13,17,10,19,0.009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1170,14326270,0,51,14,3,13,17,11,19,0.184,...,0.0,0.0,0.0,0.006,0.0,0.0,0.018,0.0,0.0,0.0


In [2]:
# Build the feature matrix and target matrix, split them sequentially into
# train/test partitions, and min-max scale the features.
#
# The scaler is fitted on the TRAINING rows only. Fitting it on the full
# dataset (as was done previously) lets the min/max of the test rows influence
# the training features, which is data leakage and makes the evaluation
# optimistic.

# Feature columns, in the order used for the columns of `data`.
feature_columns = ['sex', 'age', 'state', 'risk_profile', 'invest_goals', 'temp_goals', 'fin_exp', 'fin_avail']
# Target columns 'a1' through 'a20'.
target_columns = [f'a{i}' for i in range(1, 21)]

raw_features = df[feature_columns].values
targets = df[target_columns].values

# Sequential (unshuffled) split: the first 70% of rows train, the rest test.
# `split_index` is reused further down the notebook to re-slice `df`, so the
# split must stay positional.
split_ratio = 0.7  # 70% for training, 30% for testing
split_index = int(len(raw_features) * split_ratio)

# Learn the per-column min/max from the training rows, then apply that same
# mapping to every row. Test values outside the training range may therefore
# land slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(raw_features[:split_index])
data = scaler.transform(raw_features)

X_train, X_test = data[:split_index], data[split_index:]
y_train, y_test = targets[:split_index], targets[split_index:]

print('X_train shape =', X_train.shape)
print('y_train shape =', y_train.shape)
print('X_test shape =', X_test.shape)
print('y_test shape =', y_test.shape)

X_train shape = (819, 8)
y_train shape = (819, 20)
X_test shape = (352, 8)
y_test shape = (352, 20)


In [3]:
# Fit a multi-output linear regression on the training partition.
model = LinearRegression().fit(X_train, y_train)

# Keep the fitted parameters in their own names for inspection.
coefficients, intercept = model.coef_, model.intercept_

# Print each heading followed by its values on the next line.
print("Model Coefficients (Weights):", coefficients, sep="\n")
print("\nModel Intercept (Bias):", intercept, sep="\n")

Model Coefficients (Weights):
[[ 5.11593433e-03 -1.08143730e-01  1.52287672e-02 -2.18329066e-01
  -3.78802977e-02 -5.99889953e-02 -5.32554795e-02 -9.09292727e-02]
 [ 6.79636961e-04  3.20868759e-03  3.66857546e-03  2.78723106e-03
  -1.36452257e-03  7.37959313e-03 -7.39963035e-04 -7.14360066e-03]
 [-1.79552072e-02  4.07457047e-02 -6.72166692e-03 -8.96449964e-02
  -6.19747715e-03 -4.38264886e-02  4.81114540e-02  1.18749142e-01]
 [-5.26342119e-03  4.41764313e-02  1.09322176e-03 -3.18575119e-02
   1.17022231e-02  6.07867455e-02 -3.16767233e-02  1.62438899e-02]
 [ 1.30213275e-03  1.42918257e-02  1.36624249e-03  2.79440358e-02
  -2.45807438e-02  8.55493749e-03 -2.18428772e-02  1.27535112e-02]
 [-7.58962874e-05 -1.91857922e-03 -9.26678294e-03  1.13978690e-02
   2.36676039e-03  6.24922873e-03  7.70785699e-03 -1.60344812e-02]
 [ 1.18350144e-03  4.47257808e-03 -5.11330335e-03  2.81866775e-02
  -1.35450987e-02 -1.22143960e-03  8.55578790e-03  3.30185836e-04]
 [ 4.05194363e-03  1.65869724e-02  3.50

In [4]:
# Predict the targets for the held-out test rows.
y_pred = model.predict(X_test)

# The predictions must have the same shape as the true targets.
for label, values in (('y_test', y_test), ('y_pred', y_pred)):
    print(label + ' shape =', values.shape)

y_test shape = (352, 20)
y_pred shape = (352, 20)


In [5]:
# Total each row of y_test, then report the mean of those row totals.
sums = [sum(row) for row in y_test]
print("y_test average =", sum(sums) / len(sums))

y_test average = 1.0000227272727267


In [6]:
# Total each row of y_pred, then report the mean of those row totals.
sums = [sum(row) for row in y_pred]
print("y_pred average =", sum(sums) / len(sums))

y_pred average = 1.000035432747997


In [7]:
# Remove values < 0.01 and scale predictions so each row sums to 1 (100%).
#
# Vectorized with numpy instead of nested Python loops. Two behaviours differ
# from the loop version on purpose:
#   * the raw model output is not edited element by element; `y_pred` is
#     rebound to new arrays;
#   * a row whose values are all below the threshold sums to 0. Dividing by
#     that sum would fill the row with NaN, so such rows are left as zeros.
#
# NOTE: this cell still rebinds `y_pred`, so running it a second time would
# re-apply the threshold to already-normalized values. Run it once per
# prediction.
num_rows, num_cols = y_pred.shape

# Zero every prediction below 0.01 (this also discards negative predictions).
y_pred = np.where(y_pred < 0.01, 0.0, y_pred)

# Per-row totals, kept as a column vector so they broadcast across columns.
row_sums = y_pred.sum(axis=1, keepdims=True)

# Divide only where the row total is positive; other rows keep the zeros
# supplied through `out`.
y_pred = np.divide(y_pred, row_sums, out=np.zeros_like(y_pred), where=row_sums > 0)

In [8]:
# Total each row of y_pred, then report the mean of those row totals.
sums = [sum(row) for row in y_pred]
print("y_pred average =", sum(sums) / len(sums))

y_pred average = 1.0


In [9]:
# Take the feature columns straight from `df` (not the scaled `data` array)
# and split them at the same row index used for the train/test split.
data_for_csv = df[['sex', 'age', 'state', 'risk_profile', 'invest_goals', 'temp_goals', 'fin_exp', 'fin_avail']].values
data_train = data_for_csv[:split_index]
data_test = data_for_csv[split_index:]

# Left part: test-row features. Right part: predictions rounded to 3 decimals.
array1 = data_test
array2 = np.array(y_pred).round(3)

# Join the two parts column-wise into one table.
result_array = np.hstack((array1, array2))

# Print plain decimals instead of scientific notation.
np.set_printoptions(suppress=True)
print(result_array)

# Write the table as comma-separated values with three decimals per field.
np.savetxt('MLR_output.csv', result_array, delimiter=",", fmt='%.3f')

[[ 1. 41. 33. ...  0.  0.  0.]
 [ 1. 69. 14. ...  0.  0.  0.]
 [ 1. 46. 28. ...  0.  0.  0.]
 ...
 [ 1. 64. 24. ...  0.  0.  0.]
 [ 1. 57. 11. ...  0.  0.  0.]
 [ 0. 51. 14. ...  0.  0.  0.]]
