#### 1. Data Processing

##### 1.1 X-y split.

In order to do the X-y split, we need to figure out the inputs and outputs of our model.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from scipy import stats

# Load the dataset and re-apply the transformations from the previous lab in one chain:
#   1. Standardize column names: split 'EmploymentStatus' into two words, then
#      lower-case every header.
#   2. Remove the columns that are highly correlated to other columns.
df = (
    pd.read_csv('files_for_lab/csv_files/marketing_customer_analysis.csv')
    .rename(columns={'EmploymentStatus': 'Employment Status'})
    .rename(columns=str.lower)
    .drop(columns=['policy', 'vehicle size'])
)

# Find more information about the dataset
#print(df.info())
#print(df.shape)
#print(df.columns)
#print(len(df['effective to date'].unique()))

We will assume that `total claim amount` is the output we want to predict. For an insurance company it is relevant to know which types of customer are more likely to make claims, so that it can, for example, adjust the policy pricing for customers that would be considered "high-risk", i.e. more likely to make claims.

In [None]:
# The target is kept as a one-column DataFrame; every other column is a candidate feature
y = df[['total claim amount']]
X = df.drop(columns='total claim amount')

# Check that the operations ran correctly
#print(y.columns)
#print(X.columns)

# Change y column to numerical data.
# `errors='ignore'` is deprecated in pandas, so the conversion is attempted
# directly and a failure is caught instead; the outcome is the same as before
# (y is left untouched when it cannot be converted).
try:
    y = y.apply(pd.to_numeric)
except (ValueError, TypeError):
    pass

##### 1.2. Normalize (numerical).

We need to separate the numerical columns in X from the categorical columns so we can normalize the data at once:

In [None]:
# Keep only the numeric feature columns of X; these are the ones that get scaled below
X_num = X.select_dtypes(include=np.number)

# Check that we have selected the correct data
#print(X_num.info())

Now we can normalize the data using `MinMaxScaler`:

In [None]:
# Learn the minimum and maximum of each column and rescale the data with them in one step.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test split
# that happens further down, so the test rows influence the scaling.
transformer = MinMaxScaler()
x_minmax = transformer.fit_transform(X_num)

# Find out what the transformer is:
#print(type(transformer))

# Show the maximum of each column (mainly to see what information the transformer holds):
#print(transformer.data_max_)

# Inspect the normalized array:
#print(type(x_minmax))
#print(x_minmax.shape)

# Wrap the numpy array in a dataframe that carries the original column names
X_num_norm = pd.DataFrame(data=x_minmax, columns=X_num.columns)
#print(X_num_norm.head())

##### 1.3. One Hot/Label Encoding (categorical).

In [None]:
# Create a dataframe with the categorical values, leaving out the customer
# identifier and the date column
X_cat = X.select_dtypes(include='object').drop(columns=['customer', 'effective to date'])

# Check that we selected the right data
#print(X_cat.info())

# Fit the encoder; drop='first' omits the first category of every column
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(X_cat)
#print(type(encoder.categories_))
#print(encoder.get_feature_names_out())

# Extract the encoded array from the encoder
encoded = encoder.transform(X_cat).toarray()
#print(type(encoded))

# Turn the numpy array into a Pandas dataframe, named after the encoded features
cat_encoded = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

# Check the encoded dataframe
#print(cat_encoded.head())
#print(len(encoder.get_feature_names_out()))

##### 1.4. Concat DataFrames

In [None]:
# Put the normalized numerical columns and the one-hot encoded columns side by side.
# Both frames were built from numpy arrays, so they share the same default 0..n-1 index.
X = pd.concat([X_num_norm, cat_encoded], axis=1)

#print(X.shape)
#print(X.head())

#### 2. Linear Regression

##### 2.1. Train-test split.

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

##### 2.2. Apply linear regression.

In [None]:
# Fit an ordinary least-squares model on the training rows
model = LinearRegression().fit(X_train, y_train)

# Get the predictions for the held-out rows before describing the model:
predictions = model.predict(X_test)

# Learn more about the predictions:
#print(predictions.shape)
#print(type(predictions))

#### 3. Model Validation

Description: R2, MSE, RMSE, MAE.

In [None]:
r2 = r2_score(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
# RMSE is derived from MSE: the `squared=False` keyword of mean_squared_error
# is deprecated and no longer accepted by recent scikit-learn releases.
RMSE = np.sqrt(MSE)
MAE = np.mean(np.abs(y_test.to_numpy() - predictions))

print("r2 = ", r2)
print("RMSE = ", RMSE)
print("MSE = ", MSE)
print("MAE = ", MAE)

median_total_claim = np.median(y_test.to_numpy())
print("Median Total Claim = ", median_total_claim)

# RMSE expressed as a percentage of the median total claim
print(RMSE * 100 / median_total_claim)

The R² score is relatively high, which suggests that the model does a decent job of predicting the total claim amount.

### 4. Modelling

Try to improve the linear regression model. We'll try the following methods to improve the model:

1. Remove the outliers in the numerical data.
2. Try the Box-Cox method on columns with more than 200 unique values.
3. Try the Box-Cox method on columns with more than 500 unique values.
4. Remove the outliers in the numerical data & apply the 2. and 3. Box-Cox methods.

In [None]:
# First, we wrap all of the steps we ran above for the linear regression into functions,
# so that we can easily re-run the model when trying out the different methods:

def split_data(df):
    """Split a cleaned dataframe into features, target and per-dtype feature subsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'total claim amount' column, which is used as the target.

    Returns
    -------
    X : pandas.DataFrame
        Every column of ``df`` except the target.
    y : pandas.Series
        The target column, converted to a numeric dtype when possible.
    X_num : pandas.DataFrame
        Independent copy of the numeric columns of ``X``.
    X_cat : pandas.DataFrame
        Independent copy of the object-dtype columns of ``X``.
    """
    y = df['total claim amount']
    # Convert the whole column at once. `errors='ignore'` is deprecated in
    # pandas, so a failed conversion is caught instead and y is left as it is.
    try:
        y = pd.to_numeric(y)
    except (ValueError, TypeError):
        pass

    X = df.drop(columns='total claim amount')
    # Explicit copies: callers assign into / drop from these frames, which must
    # never write through to X or df (nor trigger chained-assignment warnings).
    X_num = X.select_dtypes(include=np.number).copy()
    X_cat = X.select_dtypes(include='object').copy()
    return X, y, X_num, X_cat

def normalize_data(X_num):
    """Min-max scale every column of ``X_num``.

    The scaler is fitted on ``X_num`` itself. The result is a new dataframe with
    the same column names and a fresh default index (the index of ``X_num`` is
    not carried over).
    """
    scaled = MinMaxScaler().fit_transform(X_num)
    return pd.DataFrame(scaled, columns=X_num.columns)

def encode_data(X_cat):
    """One-hot encode the categorical features.

    The 'customer' and 'effective to date' columns are left out of the encoding
    when present. The caller's dataframe is not modified, so the function can be
    called more than once on the same frame.

    Parameters
    ----------
    X_cat : pandas.DataFrame
        Categorical (object-dtype) feature columns.

    Returns
    -------
    pandas.DataFrame
        Dense one-hot encoded columns named by the encoder, with a fresh default
        index. The first category of every feature is dropped (drop='first').
    """
    # Work on a new frame instead of dropping in place on the argument
    features = X_cat.drop(columns=['customer', 'effective to date'], errors='ignore')
    encoder = OneHotEncoder(handle_unknown='error', drop='first')
    encoded = encoder.fit_transform(features).toarray()
    return pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

def run_regression(X_num_norm, cat_encoded, y):
    """Fit a linear regression on the combined features and print its test metrics.

    Parameters
    ----------
    X_num_norm : pandas.DataFrame
        Normalized numerical features.
    cat_encoded : pandas.DataFrame
        One-hot encoded categorical features; concatenated column-wise with
        ``X_num_norm``, so both frames must share the same index.
    y : pandas.Series or pandas.DataFrame
        Target values, in the same row order as the features.

    Prints r2, RMSE, MSE and MAE computed on a 30% hold-out split
    (random_state=42). Returns nothing.
    """
    X = pd.concat([X_num_norm, cat_encoded], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    r2 = r2_score(y_test, predictions)
    MSE = mean_squared_error(y_test, predictions)
    # Derived from MSE: the `squared=False` keyword of mean_squared_error is
    # deprecated and no longer accepted by recent scikit-learn releases.
    RMSE = np.sqrt(MSE)
    MAE = np.mean(np.abs(y_test.to_numpy() - predictions))

    print("r2 = ", r2)
    print("RMSE = ", RMSE)
    print("MSE = ", MSE)
    print("MAE = ", MAE)


#### 4.1. Remove the outliers.

In [None]:
def remove_outliers(df, threshold=1.5, in_columns=None, skip_columns=None):
    """Drop the rows that fall outside the IQR fences of the selected columns.

    For each processed column the fences are ``Q1 - threshold * IQR`` and
    ``Q3 + threshold * IQR``; rows outside them are removed. Values lying
    exactly on a fence are kept, so a column whose IQR is 0 no longer wipes
    out the whole dataframe. Columns are processed one after another, and the
    quartiles of a column are computed on the rows that survived the filters
    of the previous columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    threshold : float
        Multiplier applied to the IQR to obtain the fences.
    in_columns : iterable of str, optional
        Columns to filter on. Defaults to the numeric columns of ``df`` itself.
    skip_columns : iterable of str, optional
        Columns of ``in_columns`` to leave untouched. Defaults to none.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that passed every filter.
    """
    # Defaults are resolved per call: the previous signature evaluated
    # `df.select_dtypes(...)` once, at definition time, against the global `df`.
    if in_columns is None:
        in_columns = df.select_dtypes(np.number).columns
    skipped = set() if skip_columns is None else set(skip_columns)

    for column in in_columns:
        if column in skipped:
            continue
        if df.empty:
            # Nothing left to filter; percentiles of an empty column are undefined
            break
        lower, upper = np.percentile(df[column], [25, 75])
        iqr = upper - lower
        lower_limit = lower - threshold * iqr
        upper_limit = upper + threshold * iqr
        df = df[df[column].between(lower_limit, upper_limit)]
    return df

# Filter the outliers on a copy of the data; four numeric columns are
# excluded from the filter.
df1 = remove_outliers(
    df.copy(),
    in_columns=df.select_dtypes(np.number).columns,
    skip_columns=[
        'number of open complaints',
        'number of policies',
        'months since policy inception',
        'months since last claim',
    ],
)

# Re-run the whole pipeline on the filtered rows
X1, y1, X1_num, X1_cat = split_data(df1)
X1_num_norm = normalize_data(X1_num)
cat1_encoded = encode_data(X1_cat)
run_regression(X1_num_norm, cat1_encoded, y1)


We can see that the model performs slightly better after removing the outliers.

#### 4.2. Box Cox transformation 1

In [None]:
# The Box Cox transformation needs input that is strictly positive and non-constant.
# Therefore we apply it only to the columns that don't hold discrete values,
# after cleaning the non-positive values out of those columns.

# Copy & split the dataframe
df2 = df.copy()
X2, y2, X2_num, X2_cat = split_data(df2)

# We should look at the number of unique values in each column:
for column in X2_num.columns:
    print(column, len(X2_num[column].unique()))

# We can see that customer lifetime value, income and monthly premium auto seem to be
# non-discrete values. However, the latter seems to have only 202 unique values, so we will
# try the Box Cox transformation with and without it.

# Box Cox transformation includes monthly premium auto
for column in X2_num.columns:
    values = X2_num[column]
    # Only the columns with continuous data are transformed
    if values.nunique(dropna=False) <= 200:
        continue
    # Blank out the non-positive values, then fill them with the mean of the rest
    positive = values.mask(values <= 0)
    X2_num[column], _ = stats.boxcox(positive.fillna(positive.mean()))

X2_num_norm = normalize_data(X2_num)
cat2_encoded = encode_data(X2_cat)
run_regression(X2_num_norm, cat2_encoded, y2)

We can see that the model performs worse when the Box Cox transformation is applied to all three columns.

#### 4.3. Box Cox transformation 2

In [None]:
# Copy & split the dataframe
df3 = df.copy()
X3, y3, X3_num, X3_cat = split_data(df3)

# Box Cox transformation excludes monthly premium auto
for column in X3_num.columns:
    values = X3_num[column]
    # Only the columns with continuous data are transformed
    if values.nunique(dropna=False) <= 500:
        continue
    # Blank out the non-positive values, then fill them with the mean of the rest
    positive = values.mask(values <= 0)
    X3_num[column], _ = stats.boxcox(positive.fillna(positive.mean()))

X3_num_norm = normalize_data(X3_num)
cat3_encoded = encode_data(X3_cat)
run_regression(X3_num_norm, cat3_encoded, y3)

The model performs roughly the same with and without the Box Cox transformation 2.

#### 4.4. Remove outliers + Box Cox transformations

In [None]:
# Copy the dataframe, filter the outliers (four numeric columns are excluded
# from the filter), then split it
df4 = remove_outliers(
    df.copy(),
    in_columns=df.select_dtypes(np.number).columns,
    skip_columns=[
        'number of open complaints',
        'number of policies',
        'months since policy inception',
        'months since last claim',
    ],
)
X4, y4, X4_num, X4_cat = split_data(df4)

# Box Cox transformation includes monthly premium auto
for column in X4_num.columns:
    values = X4_num[column]
    # Only the columns with continuous data are transformed
    if values.nunique(dropna=False) <= 200:
        continue
    # Blank out the non-positive values, then fill them with the mean of the rest
    positive = values.mask(values <= 0)
    X4_num[column], _ = stats.boxcox(positive.fillna(positive.mean()))

X4_num_norm = normalize_data(X4_num)
cat4_encoded = encode_data(X4_cat)
run_regression(X4_num_norm, cat4_encoded, y4)

The model performs better when removing the outliers before doing the Box Cox transformation 1.

Let's try it out with the second Box Cox transformation:

In [None]:
# Copy the dataframe, filter the outliers (four numeric columns are excluded
# from the filter), then split it
df5 = remove_outliers(
    df.copy(),
    in_columns=df.select_dtypes(np.number).columns,
    skip_columns=[
        'number of open complaints',
        'number of policies',
        'months since policy inception',
        'months since last claim',
    ],
)
X5, y5, X5_num, X5_cat = split_data(df5)

# Box Cox transformation excludes monthly premium auto
for column in X5_num.columns:
    values = X5_num[column]
    # Only the columns with continuous data are transformed
    if values.nunique(dropna=False) <= 500:
        continue
    # Blank out the non-positive values, then fill them with the mean of the rest
    positive = values.mask(values <= 0)
    X5_num[column], _ = stats.boxcox(positive.fillna(positive.mean()))

X5_num_norm = normalize_data(X5_num)
cat5_encoded = encode_data(X5_cat)
run_regression(X5_num_norm, cat5_encoded, y5)

We see that the model performs the same with both Box Cox transformations as long as the outliers were removed beforehand.