#### 1. Data Processing

##### 1.1. X-y split.

In order to do the X-y split, we need to figure out the inputs and outputs of our model.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Load the raw marketing dataset and take a first look at it.
df = pd.read_csv('files_for_lab/csv_files/marketing_customer_analysis.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.shape)
print(df.columns)

# Run the transformations from the previous lab

# 1. Standardize column names: split 'EmploymentStatus' into two words so it
#    matches the other headers, then lower-case every column name.
df = df.rename(columns={'EmploymentStatus': 'Employment Status'})
df.columns = df.columns.str.lower()

# 2. Remove columns that are highly correlated to each other
df = df.drop(columns=['policy', 'vehicle size'])

We will assume that `total claim amount` is the output we want to predict. For an insurance company it is relevant to know which types of customer are more likely to make claims, so that it can adjust policy pricing for customers that would be considered "high-risk", i.e. more likely to make claims.

In [None]:
# Target: the claim amount, kept as a single-column dataframe.
y = df['total claim amount'].to_frame()
# Features: every remaining column.
X = df.drop(columns=['total claim amount'])

# Check that the operations ran correctly
print(y.columns)
print(X.columns)

##### 1.2. Normalize (numerical).

We need to separate the numerical columns in X from the categorical columns so that we can normalize all of the numerical data in one step:

In [None]:
# Keep only the numeric feature columns; these are the ones that get scaled.
X_num = X.select_dtypes(include=np.number)

# Check that we have selected the correct data.
# info() prints its report itself and returns None, so no print() wrapper is
# used (print(X_num.info()) would append a stray "None" to the output).
X_num.info()

Now we can normalize the data using `MinMaxScaler`:

In [None]:
# Learn the per-column minimum and maximum from the numerical features:
transformer = MinMaxScaler()
transformer.fit(X_num)

# Inspect what kind of object the fitted transformer is:
print(type(transformer))

# Per-column maxima stored on the fitted scaler (a peek at the information the transformer holds):
print(transformer.data_max_)

# Apply the scaling (the "transform" step); the result is an array, not a dataframe:
x_minmax = transformer.transform(X_num)
print(type(x_minmax))
print(x_minmax.shape)

# Wrap the scaled array in a dataframe, restoring the original column names
X_num_norm = pd.DataFrame(data=x_minmax, columns=X_num.columns)
print(X_num_norm.head())

##### 1.3. One Hot/Label Encoding (categorical).

In [None]:
# Create a dataframe with the categorical (object-dtype) columns.
# `np.object` was deprecated in NumPy 1.20 and removed in 1.24, so the builtin
# `object` (which it aliased) is used as the dtype selector instead.
# 'customer' is excluded from the features — presumably a per-row identifier
# (TODO confirm). The drop is chained rather than done in place so that the
# derived frame is never mutated after creation.
X_cat = X.select_dtypes(include=object).drop(columns='customer')

# Check that we selected the right data.
# info() prints by itself and returns None, so it is not wrapped in print().
X_cat.info()

# Fit a one-hot encoder on the categorical columns. drop='first' omits the
# first category of each feature from the encoded output.
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_cat)
print(type(encoder.categories_))
print(encoder.get_feature_names_out())

# Encode the data and convert the encoder output to a dense numpy array
encoded = encoder.transform(X_cat).toarray()

# Wrap the array in a Pandas dataframe, labelling each column with the
# feature name generated by the encoder
cat_encoded = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

# Check the encoded dataframe
print(cat_encoded.head())

##### 1.4. Concat DataFrames

In [None]:
# Join the scaled numerical features and the encoded categorical features
# side by side (column-wise) into the final feature matrix.
X = pd.concat([X_num_norm, cat_encoded], axis='columns')

print(X.head())

#### 2. Linear Regression

##### 2.1. Train-test split.

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

##### 2.2. Apply linear regression.

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows before describing the model:
predictions = model.predict(X_test)

# Inspect the shape and type of the predictions:
print(predictions.shape)
print(type(predictions))

#### 3. Model Validation

Description: R2, MSE, RMSE, MAE.

In [None]:
# Score the held-out predictions against the true claim amounts.
r2 = r2_score(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
# RMSE is derived from MSE directly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas the square root of the MSE works on every version.
RMSE = np.sqrt(MSE)
# Flatten both arrays before subtracting so that a (n, 1) vs (n,) shape
# mismatch can never silently broadcast into an (n, n) matrix.
MAE = np.mean(np.abs(np.ravel(y_test.to_numpy()) - np.ravel(predictions)))

print("r2 = ", r2)
print("RMSE = ", RMSE)
print("MSE = ", MSE)
print("MAE = ", MAE)

# Median of the true test-set claim amounts, used as a scale reference.
median_total_claim = np.median(y_test.to_numpy())
print("Median Total Claim = ", median_total_claim)

# RMSE expressed as a percentage of the median claim amount.
print(RMSE * 100 / median_total_claim)

The R² score is relatively high, which means that the model explains a large share of the variance in the total claim amount and is decent at predicting it.