In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
final_customer_mkt_data = pd.read_csv('customer_mkt_data_final.csv').drop(columns = ['Unnamed: 0', 'Effective To Date', 'Customer'])

In [3]:
numericals = final_customer_mkt_data.select_dtypes(include = np.number)
categoricals = final_customer_mkt_data.select_dtypes(include = object)
y = numericals['Total Claim Amount']

In [4]:
encoder = OneHotEncoder().fit(categoricals)
categoricals_array = encoder.transform(categoricals).toarray()

In [5]:
# to improve the model, we remove one column of every category to reduce the dimensions of the data keeping the same information.

final_columns = list()
to_drop_columns = list()

for columns in encoder.categories_:
    to_drop_columns.append(columns[0])
    for column in columns.tolist():
        final_columns.append(column)

In [6]:
categoricals_dataframe = pd.DataFrame(data = categoricals_array, columns = final_columns)
categoricals_dataframe = categoricals_dataframe.drop(columns = to_drop_columns)

In [7]:
X = pd.concat([numericals, categoricals_dataframe], axis = 1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.50, random_state = 42)

In [9]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

LinearRegression()

In [10]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

predictions = linear_regression.predict(X_test)

r2 = r2_score(predictions, y_test)
mse = mean_squared_error(predictions, y_test)
mae = mean_absolute_error(predictions, y_test)

In [11]:
from math import sqrt

print(f'R2: {r2}')
print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'RMSE: {sqrt(mse)}')

R2: 1.0
MSE: 9.631248835358781e-26
MAE: 2.4502104357321644e-13
RMSE: 3.103425339098523e-13
