# PROJECT -  A2Z CUSTOMER SEGMENTATION

## Introduction
Data herein presented pertains to a fictional insurance company in Portugal, A2Z Insurance. The goal is to develop a customer segmentation that will enable the Marketing Department to better understand the different customers' profiles and develop adequate marketing strategies. <br>
This project is done within the scope of the **Data Mining** curricular unit of the Master's Degree in **Data Science and Advanced Analytics**.

#### Group elements:
* Ivan Jure Parać (20210689)
* Nuno de Bourbon e Carvalho Melo (20210681)
* Stuart Gallina Ottersen (20210703)


## Table of Contents
* [Data exploration](#data-exploration)
* [Data preprocessing](#data-preprocessing)

***

<h2><center>BOILERPLATE</center></h2>

***

In [None]:
# uncomment next line of code to install package required for KPrototypes
# !pip install kmodes

In [None]:
# import major libraries/modules
import pyreadstat
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn.metrics as metrics

# others
from math import ceil
from regressors import stats
from scipy.cluster import hierarchy
from scipy.stats import chi2_contingency
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, AffinityPropagation, OPTICS, MeanShift
from sklearn.feature_selection import RFE

#from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
#from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# load SAS file with the insurance company data
df, meta = pyreadstat.read_sas7bdat('a2z_insurance.sas7bdat')

# save copy of the original dataframe
original_df = df.copy()

<a class="anchor" id="data-exploration"></a>

***

<h2><center>DATA EXPLORATION</center></h2>

***

**Section overview**
* First look at the dataset.
* Setting customer ID number as index.
* Removal of duplicate rows. 

In [None]:
# first look at the dataframe
df.head()

In [None]:
# check number of rows and columns
print("Number of observations:", df.shape[0])
print("Number of features:", df.shape[1])
print("Features:", list(df.columns))

In [None]:
# describe the data
df.describe(include = "all").T

In [None]:
# more information about the data
df.info()

In [None]:
# set customer ID as index
df.CustID = df.CustID.astype("int")
df.set_index("CustID", inplace=True)

In [None]:
# check for duplicated rows
print("Number of duplicates:", df.duplicated().sum())

# remove duplicate rows
df.drop_duplicates(inplace = True)
print("Removing duplicates...")
print("Number of duplicates:", df.duplicated().sum())

In [None]:
# check number of rows and columns again
print("Number of observations:", df.shape[0])
print("Number of features:", df.shape[1])

df.head()

<a class="anchor" id="data-preprocessing"></a>

***

<h2><center>DATA PREPROCESSING</center></h2>

***

### Checking data types

In [None]:
# checking data types

# extract the first run of digits from EducDeg and store it as float
# (raw string avoids the invalid "\d" escape sequence; expand=False returns
# a Series instead of a one-column DataFrame)
df.EducDeg = df.EducDeg.str.extract(r"(\d+)", expand = False).astype("float")
# education degree mapper
educ_mapper = {1: "Basic", 2: "High School", 3: "BSc/MSc", 4: "PhD"}

# check data types
df.dtypes

### Removing outliers

In [None]:
# define metric and non-metric features
metric_features = df.columns.drop(["EducDeg", "GeoLivArea", "Children"])
non_metric_features = ["EducDeg", "GeoLivArea", "Children"]

# boxplot of metric features
sns.set()

fig, axes = plt.subplots(2, ceil(len(metric_features) / 2), figsize = (20, 11))

# iterate through axes objects and associate each box plot
for ax, feat in zip(axes.flatten(), metric_features):
    sns.boxplot(x = df[feat], ax = ax)
    
plt.show()

In [None]:
# BirthYear 1028 assumed to be a typo
# 0 and 9 are fairly close in a qwerty keyboard, replaced with 1928
df.loc[df.BirthYear == 1028, "BirthYear"] = 1928

# conditions to remove outliers
filters = ((df.FirstPolYear.ge(2017)),
           (df.MonthSal.ge(20000)),
           (df.CustMonVal.le(-2000)),
           (df.CustMonVal.ge(1500)),
           (df.ClaimsRate.ge(4)),
           (df.PremMotor.ge(3000)),
           (df.PremHousehold.ge(1600)),
           (df.PremHealth.ge(5000)),
           (df.PremWork.ge(300)))

# number of observations before outliers removal
len_df = len(df)

# a row is an outlier when it satisfies at least one of the conditions
# (comparisons against NaN evaluate to False, so rows with missing values are kept)
outlier_mask = pd.concat(filters, axis = 1).any(axis = 1)

# create a separate dataframe for the outliers and remove them from the main dataframe
# a single combined mask replaces the DataFrame.append loop (append was removed in pandas 2.0)
# and avoids indexing an already-filtered df with masks built on the full dataframe
df_outliers = df[outlier_mask].copy()
df = df[~outlier_mask]

# determine number of outliers removed
n_outliers = len(df_outliers)
pc_removed = round(n_outliers/len_df*100, 2)
print(f"Number of outliers removed: {n_outliers} ({pc_removed}% of all observations)")

In [None]:
# boxplots of metric features after removing outliers
sns.set()

# Prepare figure. Create individual axes where each box plot will be placed
fig, axes = plt.subplots(2, ceil(len(metric_features) / 2), figsize=(20, 11))

# Plot data
# Iterate across axes objects and associate each box plot (hint: use the ax argument):
for ax, feat in zip(axes.flatten(), metric_features):
    sns.boxplot(x=df[feat], ax=ax)
    
plt.show()

In [None]:
# new look at the data after removing outliers
df.describe(include = "all").T

### Dealing with missing values

In [None]:
# check feature completeness
# number and percentage of NaN values per feature
nr_nans = df.isna().sum()
pc_nans = df.isna().mean()*100
feature_nans = pd.concat([nr_nans, pc_nans], axis = 1)
feature_nans.rename(columns = {0: "nr", 1: "%"}, inplace = True)

# not enough missing values in any single feature to merit its exclusion
print("Missing values per feature:\n", feature_nans)

In [None]:
# check row completeness
# max number of NaN values in a row and number of rows with that many NaN
max_row_nan = df.isnull().sum(axis = 1).max()
print(f"\nMaximum number of NaN values per row: {max_row_nan} "
      f"({len(df[df.isnull().sum(axis = 1) == max_row_nan])} observations)")

# a row with 4 missing values has ~30% of its data missing (13 features)
# inspecting these rows
max_nan_rows = df[df.isnull().sum(axis = 1) == max_row_nan]
display(max_nan_rows)

# removing these rows - no information about Premiums
df.drop(max_nan_rows.index, inplace = True)

In [None]:
# check outliers dataframe for NaN values
outliers_nan_before = df_outliers.isna().sum()

# only 3 NaN - 1 PremMotor, 1 PremHealth, 1 PremLife
# assuming no info about premiums means no premium is paid
# the filled columns are assigned back explicitly: calling fillna(inplace=True) on a
# column accessed as an attribute is chained assignment and may only modify a temporary copy
outlier_prem_cols = ["PremMotor", "PremHealth", "PremLife"]
df_outliers[outlier_prem_cols] = df_outliers[outlier_prem_cols].fillna(0)

# check if NaN values were correctly imputed
outliers_nan_after = df_outliers.isna().sum()
outliers_nan = pd.concat([outliers_nan_before, outliers_nan_after], axis = 1)
outliers_nan.rename(columns = {0: "before", 1: "after"}, inplace = True)

print("Missing values in the outliers' dataframe:")
outliers_nan

In [None]:
# remove rows with missing FirstPolYear, missing BirthYear (44 rows)
# or missing EducDeg (2 rows - only 2 NaN remaining after removing missing years)
# dropna returns a new dataframe, so the assignments below do not act on a slice
df = df.dropna(subset = ["FirstPolYear", "BirthYear", "EducDeg"])

# replace NaN in Premiums with 0
# assumes no info about Premiums means no premium is paid
# assigned back explicitly instead of column.fillna(inplace=True), which is chained
# assignment and may only modify a temporary copy
nan_premium_cols = ["PremMotor", "PremHealth", "PremLife", "PremWork"]
df[nan_premium_cols] = df[nan_premium_cols].fillna(0)

In [None]:
# simple linear regression model to impute missing MonthSal values
# based on high correlation between age and salary

# independent, X, and dependent, y, variables
# rows with any missing value are excluded from model fitting (computed once)
complete_rows = df.dropna()
X = complete_rows.BirthYear
y = complete_rows.MonthSal

# split train and validation data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 15)

# reshaping needed because of single feature
X_train = np.array(X_train).reshape(-1, 1)
X_val = np.array(X_val).reshape(-1, 1)

# create and fit model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# predict salary of the validation set
y_pred = lin_reg.predict(X_val)

# evaluate predictions
stats.summary(clf = lin_reg, X = X_train, y = y_train)
mse = metrics.mean_squared_error(y_val, y_pred)
# RMSE derived from the MSE: the squared=False argument of mean_squared_error
# is deprecated in recent scikit-learn releases
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_val, y_pred)

print("\nMean square error:", round(mse, 2))
print("Root mean square error:", round(rmse, 2))
print("Mean absolute error:", round(mae, 2))

In [None]:
# predict MonthSal NaN values
missing_sal = df.MonthSal.isna()

# skip when nothing is missing (e.g. when this cell is re-run after imputation):
# predict() rejects an input with zero samples
if missing_sal.any():
    X_test = np.array(df.loc[missing_sal, "BirthYear"]).reshape(-1, 1)
    y_pred = lin_reg.predict(X_test)

    # impute values to MonthSal NaN
    df.loc[missing_sal, "MonthSal"] = y_pred

In [None]:
# multiple linear regression model to impute missing MonthSal values
# use all features but the MonthSal to train the model

# define independent and dependent variables
#X = df.dropna().drop(["MonthSal"], axis = 1)
#y = df.dropna().MonthSal

# split train and test data
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)

# scale train and test data
#scaler = MinMaxScaler().fit(X_train)
#X_train_scaled = scaler.transform(X_train)
#X_test_scaled = scaler.transform(X_test)

# create and fit model
#lin_model = LinearRegression()
#lin_model.fit(X_train_scaled, y_train)

# predict y
#y_pred = lin_model.predict(X_test_scaled)

# evaluate the predictions of the linear reg model
#xlabels = X_train.columns
#stats.summary(clf = lin_model, X = X_train_scaled, y = y_train, xlabels = xlabels)
#mse = metrics.mean_squared_error(y_test, y_pred)
#rmse = metrics.mean_squared_error(y_test, y_pred, squared = False)
#mae = metrics.mean_absolute_error(y_test, y_pred)

#print(mse)
#print(rmse)
#print(mae)

In [None]:
# logistic regression to impute missing Children values

# independent, X, and dependent, y, variables
X = df.dropna().drop(columns = "Children")
y = df.dropna().Children

# split data into train (70%) and validation (30%) datasets
# 70% have children, 30% dont, decided to stratify
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 5, stratify = y)

# scale features using MinMaxScaler() with parameters from X_train
scaler = MinMaxScaler().fit(X_train)
# scale the training set
X_train_scaled = scaler.transform(X_train)
# scale the test set
X_val_scaled = scaler.transform(X_val)

In [None]:
# selecting features for logistic regression of Children

# recursive feature elimination
# try every possible number of features (1 .. number of predictors) and keep
# the validation accuracy obtained with each of them
numfeats_list = np.arange(1, len(df.columns))
scores = {}

for n_feats in numfeats_list:
    log_reg = LogisticRegression()
    # n_features_to_select must be passed by keyword: positional use raises
    # a TypeError in scikit-learn >= 1.0
    rfe = RFE(estimator = log_reg, n_features_to_select = int(n_feats))

    X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)
    X_val_rfe = rfe.transform(X_val_scaled)
    log_reg.fit(X_train_rfe, y_train)

    scores[int(n_feats)] = log_reg.score(X_val_rfe, y_val)

# number of features with the highest validation accuracy
best_num_feats = max(scores, key = scores.get)
rfe = RFE(estimator = log_reg, n_features_to_select = best_num_feats)
X_rfe = rfe.fit_transform(X = X_train_scaled, y = y_train)
selected_features = pd.Series(rfe.support_, index = X_train.columns, name = "RFE")

# compute correlation between Children and other features
correlations = pd.Series(df.corr().Children, name = "Correlation")

# compute Lasso coefficients
reg = LassoCV()
reg.fit(X_train_scaled, y_train)
lasso_coef = pd.Series(reg.coef_, index = X_train.columns, name = "Lasso")

# concatenate features selected by rfe, correlations, and lasso coefficients
selection_df = pd.concat([selected_features, correlations, lasso_coef], axis = 1).drop("Children")

# plot correlation and lasso coefficients
coef_names = ["Correlation", "Lasso"]

sns.set(font_scale = 1.4)
sns.set_style("white")
fig, axes = plt.subplots(1, ceil(len(coef_names)), figsize = (22, 10))

for ax, coef in zip(axes.flatten(), coef_names):
    sns.barplot(data = selection_df,
                x = coef,
                y = selection_df.index,
                hue = "RFE",
                palette = ["darkgray", "palevioletred"],
                order = selection_df.sort_values(coef).index,
                ax = ax)
    # dotted reference line at zero, hidden from the legend
    ax.axvline(x = 0, linestyle = ":", color = "darkgray", label = "_nolegend_")
    ax.set_xlabel(coef, fontsize = 16)
    ax.legend(title = "RFE", loc = "upper right", fontsize = 14)

plt.show()

In [None]:
# imputing missing Children values
# conclusion from feature selection: use only BirthYear

# independent, X, and dependent, y, variables
X = df.dropna().BirthYear
y = df.dropna().Children

# split data into train (70%) and validation (30%) datasets
# 70% have children, 30% dont, decided to stratify
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 5, stratify = y)

# reshaping needed because of single feature
X_train = np.array(X_train).reshape(-1, 1)
X_val = np.array(X_val).reshape(-1, 1)

# scale features using MinMaxScaler() with parameters from X_train
scaler = MinMaxScaler().fit(X_train)
# scale the training set
X_train_scaled = scaler.transform(X_train)
# scale the test set
X_val_scaled = scaler.transform(X_val)

# create a logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# predict y
y_pred = log_reg.predict(X_val_scaled)

# evaluate the predictions of the logistic reg model
conf_matrix = metrics.confusion_matrix(y_val, y_pred)
accuracy = round(metrics.accuracy_score(y_val, y_pred)*100, 2)
precision = round(metrics.precision_score(y_val, y_pred)*100, 2)
recall = round(metrics.recall_score(y_val, y_pred)*100, 2)
f1 = round(metrics.f1_score(y_val, y_pred)*100, 2)

print("Confusion matrix:\n", conf_matrix)
print("Accuracy:", accuracy, "%")
print("Precision:", precision, "%")
print("Recall:", recall, "%")
print("F1 score:", f1, "%")

In [None]:
# attempting children prediction using KNN
# overall results were worse than with logistic regression

#data = df.dropna().drop(["Children", "GeoLivArea"], axis = 1)
#target = df.dropna().Children

#X_train, X_val, y_train, y_val = train_test_split(data, target, train_size=0.70, stratify = target, random_state=5)

#modelKNN = KNeighborsClassifier()
#modelKNN.fit(X = X_train, y = y_train)
#labels_train = modelKNN.predict(X_train)
#labels_val = modelKNN.predict(X_val)

#conf_matrix = metrics.confusion_matrix(y_val, labels_val)
#accuracy = round(metrics.accuracy_score(y_val, labels_val)*100, 2)
#precision = round(metrics.precision_score(y_val, labels_val)*100, 2)
#recall = round(metrics.recall_score(y_val, labels_val)*100, 2)
#f1 = round(metrics.f1_score(y_val, labels_val)*100, 2)

#print("Confusion matrix:\n", conf_matrix)
#print("Accuracy:", accuracy, "%")
#print("Precision:", precision, "%")
#print("Recall:", recall, "%")
#print("F1 score:", f1, "%") 

In [None]:
# predict Children NaN values
missing_children = df.Children.isna()

# skip when nothing is missing (e.g. when this cell is re-run after imputation):
# the scaler and the classifier reject an input with zero samples
if missing_children.any():
    X_test = df.loc[missing_children, "BirthYear"]
    X_test = np.array(X_test).reshape(-1, 1)
    X_test_scaled = scaler.transform(X_test)
    y_pred = log_reg.predict(X_test_scaled)

    # impute values to Children NaN
    df.loc[missing_children, "Children"] = y_pred

In [None]:
# check if all NaN values were dealt with
df.isna().sum()

### Data transformation and cross-field validation

In [None]:
# get rows where policy was made before birth
incoherent_dates_nr = len(df[df.BirthYear > df.FirstPolYear])
# percentage relative to the current dataset size
# (the previous version referenced an undefined name, incoherent_dates)
incoherent_dates_pc = round(incoherent_dates_nr/len(df)*100, 1)
print(f"Number of people with a policy before birth: {incoherent_dates_nr} "
      f"({incoherent_dates_pc}% of the dataset)")

In [None]:
# such high number of inconsistencies suggests systematic error
# assumption: in these cases BirthYear and FirstPolYear were introduced in the wrong fields

# swap FirstPolYear and BirthYear values when birth is after first policy creation
row_filter = df.BirthYear > df.FirstPolYear
df.loc[row_filter, ["FirstPolYear", "BirthYear"]] = df.loc[row_filter, ["BirthYear", "FirstPolYear"]].values

In [None]:
# add customer Age (in 2016)
# age is more intuitive than birth year
curr_year = 2016
cust_ages = curr_year - df.BirthYear
df.insert(2, "Age", cust_ages)

# BirthYear is intentionally kept here: later cells still read it
# (FirstPolAge and Generation are derived from it) and it is dropped
# together with the other redundant features after the correlation analysis

In [None]:
# get some stats regarding the Age column
df.Age.describe()

In [None]:
# cross-validate EducDeg and Age
# minimum age is 18 - everyone can have an education up to High School
# minimum age of 20 for a BSc
# minimum age of 23 for a PhD (skipping MSc and finishing in 3 years, UK or outside of the EU)
print("Minimum age associated to each Education Degree:")
educdeg_min_age = df.groupby("EducDeg").Age.min().rename(index = educ_mapper)
educdeg_min_age

# no incoherences in EducDeg

# **Cleaning up the dataset**

## Data transformation (and more cross-field validation)

In [None]:
# create an Age column (present year considered to be 2016)
curr_year = 2016

# insert ages as the 3rd feature, after BirthYear
# guard: an earlier cell may already have created Age, and DataFrame.insert
# raises a ValueError when the column already exists
if "Age" not in df.columns:
    df.insert(2, "Age", curr_year - df.BirthYear)

# plot distribution of ages
plt.figure(figsize=(10, 8))
plt.title("Customer ages")
sns.axes_style("dark")
sns.violinplot(y=df["Age"])
plt.show()

# get some stats regarding the Age column
df.Age.describe()

In [None]:
# check if EducDeg makes sense according to Age
# min age is 18 so everyone can have an education up to High School
# check youngest people with a BSc/MSc
print("Minimum age associated to each Education Degree:")
print(df.groupby("EducDeg").Age.min())

# finishing a BSc (EducDeg = 3) at 20 yo is possible if starting at 17
# finishing a PhD (EducDeg = 4) at 23 yo is possible if skipping MSc and finishing PhD in 3 years (UK or outside of EU)
# no incoherences in EducDeg

In [None]:
# create a FirstPolAge column (present year considered to be 2016)
first_pol_age = df.FirstPolYear - df.BirthYear
df.insert(1, "FirstPolAge", first_pol_age)

# plot distribution of age of first policy
plt.figure(figsize=(10, 8))
plt.title("Age of first policy")
sns.axes_style("dark")
sns.violinplot(y=df["FirstPolAge"])
plt.show()

# get some stats regarding the Age column
df.FirstPolAge.describe()

In [None]:
# create a Generation column based on birth year
# could use LabelEncoder or OrdinalEncoder here but I couldn't make it work??????????????
df.loc[(df.BirthYear >= 1928) & (df.BirthYear <= 1945), "Generation"] = 1 # Silent Gen
df.loc[(df.BirthYear >= 1946) & (df.BirthYear <= 1964), "Generation"] = 2 # Baby Boomer
df.loc[(df.BirthYear >= 1965) & (df.BirthYear <= 1980), "Generation"] = 3 # Gen X
df.loc[(df.BirthYear >= 1981) & (df.BirthYear <= 1995), "Generation"] = 4 # Millennial
df.loc[(df.BirthYear >= 1996) & (df.BirthYear <= 2010), "Generation"] = 5 # Gen Z

# generation mapper
gen_mapper = {1: "Silent Gen",
              2: "Baby Boomer",
              3: "Gen X",
              4: "Millennial",
              5: "Gen Z"}

# convert Generation data to categorical
df.Generation = df.Generation.astype("category")

# get some stats regarding the Generation column
df.Generation.describe()

In [None]:
# count number of customers per generation
gen_count = df.groupby("Generation").size().sort_values(ascending = False)
gen_count.rename(index = gen_mapper, inplace = True)

# plot number of customers per generation
plt.figure(figsize=(10,8))
plt.title("Number of customers per generation")
sns.axes_style("dark")
sns.barplot(x = gen_count.index, y = gen_count.values, order = gen_count.index)
plt.show()

In [None]:
# create a YearSal column
# Premiums are also expressed as yearly values
# could be interesting to try YearSal as a categorical variable???
df.insert(5, "YearSal", df.MonthSal*12)

In [None]:
# create a PremTotal column
premium_cols = ["PremMotor", "PremHousehold", "PremHealth", "PremLife", "PremWork"]
df["PremTotal"] = df[premium_cols].sum(axis = 1)

# get some stats regarding the PremTotal column
print(df.PremTotal.describe())

# deal with PremTotal outliers??????????????
sns.boxplot(x = df.PremTotal)

In [None]:
# no observations where PremTotal is higher than the yearly salary
len(df[df.PremTotal > df.YearSal])

In [None]:
# what to do when no premiums were paid in 2016?????
# it could mean that the insurance was cancelled and they are no longer customers

# 12 observations with no Premiums paid in 2016
print(f"{len(df[df.PremTotal == 0])} customer(s) paid no Premium and were removed")
# removed these 12 observations as they likely represent past customers
df = df[~(df.PremTotal == 0)]

In [None]:
# create a CustYears column
# number of years a customer has been a customer
df["CustYears"] = curr_year - df.FirstPolYear

# get some stats regarding the CustYears column
df.CustYears.describe()

In [None]:
# CustMonVal, PremHousehold, PremHealth, PremLife, PremWork may require further processing
# histograms
skewed_metric_features = ["CustMonVal", "PremHousehold", "PremHealth", "PremLife", "PremWork", "PremTotal"]

sns.set()

# Prepare figure. Create individual axes where each box plot will be placed
fig, axes = plt.subplots(2, 3, figsize=(20, 11))

# Plot data
# Iterate across axes objects and associate each box plot (hint: use the ax argument):
for ax, feat in zip(axes.flatten(), skewed_metric_features):
    sns.histplot(x=df[feat], ax=ax)
    
plt.show()

In [None]:
# summing the module of the minimum value to all observations

# storing pre-transforming premiums for later use
no_transform_household = df.PremHousehold.copy()
no_transform_health = df.PremHealth.copy()
no_transform_life = df.PremLife.copy()
no_transform_work = df.PremWork.copy()
no_transform_total = df.PremTotal.copy()

# applying transformation to normalize distributions
df.PremHousehold = np.sqrt(df.PremHousehold + 75)
df.PremHealth = np.sqrt(df.PremHealth + 2.11)
df.PremLife = np.sqrt(df.PremLife + 7)
df.PremWork = np.sqrt(df.PremWork + 12)
df.PremTotal = np.sqrt(df.PremTotal)

sns.set()

fig, axes = plt.subplots(2, 3, figsize = (20, 11))

for ax, feat in zip(axes.flatten(), skewed_metric_features):
    sns.histplot(x=df[feat], ax=ax)
    
plt.show()

In [None]:
# percentage of observations discarded
num_obs_discarded = len_df-len(df)
pc_obs_discarded = round(num_obs_discarded/len_df*100, 2)
print(f"Number of observations discarded: {num_obs_discarded} ({pc_obs_discarded} %)")

In [None]:
# new look at the dataframe
df

In [None]:
# create a heatmap showing correlation between all metric attributes
# pearson, spearman ????????????????
plt.subplots(figsize=(15,12))
mask = np.triu(np.ones_like(df.corr(), dtype=bool))
corr_heatmap = sns.heatmap(df.corr(), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
corr_heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16);

In [None]:
# high correlations:
# FirstPoLYear redundant with CustYears, remove FirstPoLYear as the latter is more intuitive to work with
# Age redundant with BirthYear, remove BirthYear as Age is more intuitive to work with
# MonthSal redundant with YearSal, remove MonthSal as Premiums are also yearly values
# CustMonVal redundant with ClaimsRate, and ClaimsRate not correlated with anything else, remove ClaimsRate
# 
df.drop(columns = ["FirstPolYear", "BirthYear", "ClaimsRate", "MonthSal"], inplace = True)

In [None]:
# create a heatmap showing correlation between the metric attributes selected
plt.subplots(figsize=(15,12))
mask = np.triu(np.ones_like(df.corr(), dtype=bool))
corr_heatmap = sns.heatmap(df.corr(), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
corr_heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16);

In [None]:
# bidimensional representation of metric attributes
sns.pairplot(df)

In [None]:
# updating metric and non-metric features
metric_features = df.select_dtypes(include = np.number).columns.tolist()
non_metric_features = df.columns.drop(metric_features).tolist()

print("Metric Features:", metric_features)
print("Non-metric Features:", non_metric_features)

In [None]:
# explore EducDeg categorical feature
sns.set()

fig, axes = plt.subplots(6, 2, figsize=(15, 25))

for ax, feat in zip(axes.flatten(), metric_features):
    sns.boxplot(x=df["EducDeg"], y=df[feat], ax=ax)
    
plt.show()

In [None]:
# we do not know much about GeoLivArea or how relevant it might be
# explore GeoLivArea categorical feature
sns.set()

fig, axes = plt.subplots(6, 2, figsize=(15, 25))

for ax, feat in zip(axes.flatten(), metric_features):
    sns.boxplot(x=df["GeoLivArea"], y=df[feat], ax=ax)
    
plt.show()

In [None]:
# explore Children categorical feature
sns.set()

fig, axes = plt.subplots(6, 2, figsize=(15, 25))

for ax, feat in zip(axes.flatten(), metric_features):
    sns.boxplot(x=df["Children"], y=df[feat], ax=ax)
    
plt.show()

# kinda weird that median age of people with children is lower than for people without?

In [None]:
# explore Generation categorical feature
sns.set()

fig, axes = plt.subplots(6, 2, figsize=(15, 25))

for ax, feat in zip(axes.flatten(), metric_features):
    sns.boxplot(x=df["Generation"], y=df[feat], ax=ax)
    
plt.show()

In [None]:
# Children and EducDeg appear to have some discriminating power
# Generation also appears to be interesting if we use age as categorical
# GeoLivArea does not seem to be particularly useful
df.drop(columns = "GeoLivArea", inplace = True)

# should we retrieve the 1 observation removed because GeoLivArea was NaN??????????

In [None]:
df

In [None]:
# reorganizing dataframe to be more intuitive
df = df.loc[:, ["Generation",
                "Age",
                "FirstPolAge",
                "CustYears",
                "EducDeg",
                "Children",
                "YearSal",
                "CustMonVal",
                "PremMotor",
                "PremHousehold",
                "PremHealth",
                "PremLife",
                "PremWork",
                "PremTotal"]]

df

## Scale metric features and encode categories as binary features

In [None]:
# updating metric and non-metric features

# converting to float to have all variables scaled if needed
df.EducDeg = df.EducDeg.astype("float")
df.Generation = df.Generation.astype("float")
df.Children = df.Children.astype("float")

metric_features = df.select_dtypes(include = np.number).columns.tolist()
non_metric_features = df.columns.drop(metric_features).tolist()

print("Metric Features:", metric_features)
print("Non-metric Features:", non_metric_features)

In [None]:
# scale metric features and store them as df_scaled
# NOTE(review): the original note here said better results were obtained with
# MinMaxScaler, yet StandardScaler is the scaler actually applied below -
# confirm which one is intended
# create a dataframe with the scaled metric variables (df itself is left unscaled)
df_scaled = df.copy()
scaler = StandardScaler()
scaled_feat = scaler.fit_transform(df_scaled[metric_features])
df_scaled[metric_features] = scaled_feat

In [None]:
# there are two dataframes at this point
# the dataframe obtained prior to normalisation

df.PremHousehold = no_transform_household
df.PremHealth = no_transform_health
df.PremLife = no_transform_life
df.PremWork = no_transform_work
df.PremTotal = no_transform_total

df.describe(include="all").T

In [None]:
# the dataframe with the scaled metric variables
df_scaled.describe().T

# Sociodemographic clustering

Attempted sociodemographic clustering using different algorithms.
KPrototypes was used first to handle Children as a categorical variable, and clusters were agglomerated based on the visualisation of a dendrogram.
AgglomerativeClustering and KMeans produce the exact same clusters, and these are very similar to those of KPrototypes. However, there is a perfect split between people with and without children (KPrototypes also splits them into two groups, but they are not pure).

In [None]:
# attempting sociodemographic clustering
# include age, generation, yearly salary, education degree, and children
# explicit copy so that the dtype change below does not act on a slice of df
df_sociodem = df[["Age", "Generation", "YearSal", "EducDeg", "Children"]].copy()
df_sociodem["Children"] = df_sociodem["Children"].astype("category")
df_sociodem.dtypes

In [None]:
df_sociodem

## Sociodemographic Clustering

For sociodemographic clustering, we used the following algorithms (and features):
1. KPrototypes (Age/Generation, EducDeg, Children)
2. Agglomerative Clustering (Age/Generation, EducDeg, Children)
3. Agglomerative Clustering (Age/Generation, EducDeg)
4. KMeans (Age/Generation, EducDeg, Children)
5. KMeans (Age/Generation, EducDeg)

Tried DBScan and Mean Shift but because they are density based the weight they give to the Children binary value makes it so that only two clusters are found. Discarding Children and using only Age and EducDeg did not lead to better solutions.

Agglomerative Clustering and KMeans, because they are distance-based clustering algorithms, end up giving a lot of weight to binary variables, in this case Children. When analysing the results and the previous box plots, we concluded that while presence or absence of Children had some impact in a couple of the Premiums, that did not justify the weight they were receiving using these algorithms. For that reason, we tried both of them after discarding the feature Children. However, we felt uncomfortable losing Children as the boxplot suggests that it affects how much people pay for their Health insurance, which is the only Premium that does not appear to significantly change with EducDeg and it is not at all correlated to Age.

We opted for KMeans (k = 7) as, even though it attributes significant weight to children (being a binary variable), the results are similar to KPrototypes (k = 6), with the added advantage of allowing the identification of the sociodemographic group with the highest CMV by far. So overweighing children did not appear to bias the final conclusions. Plus, KPrototypes is excruciatingly slow.

In KPrototypes, we tried 3 approaches:
1. Create a high number of clusters (15) and agglomerate based on distance (as determined via a dendrogram)
2. Create 4 clusters directly
3. Create 6 clusters directly

In [None]:
# delete this cell if not using hierarchical clustering

def hierarchical_clusters(df, n_clusters = 2, threshold = None, affinity = "euclidean", linkage = "ward"):
    """Fit an AgglomerativeClustering model and build a linkage matrix from it.

    Parameters
    ----------
    df : data passed as-is to ``AgglomerativeClustering.fit``.
    n_clusters : number of clusters to find; ignored when ``threshold`` is
        given, because the estimator accepts only one of the two.
    threshold : distance threshold forwarded as ``distance_threshold``.
    affinity : distance metric used to compute the linkage.
    linkage : linkage criterion.

    Returns
    -------
    tuple ``(labels, distances, linkage_matrix)`` where ``linkage_matrix``
    stacks, column-wise, the merged children, the merge distances and the
    number of samples under each merge (presumably meant for plotting with
    ``hierarchy.dendrogram`` - confirm against callers).
    """
    import inspect

    # newer scikit-learn releases renamed the ``affinity`` keyword to ``metric``;
    # use whichever keyword the installed version accepts
    init_params = inspect.signature(AgglomerativeClustering).parameters
    metric_kwarg = "metric" if "metric" in init_params else "affinity"
    model_kwargs = {metric_kwarg: affinity, "linkage": linkage}

    if threshold is None:
        # distances_ is only populated on request when no threshold is set
        model_kwargs["n_clusters"] = n_clusters
        model_kwargs["compute_distances"] = True
    else:
        # exactly one of n_clusters / distance_threshold may be set
        model_kwargs["n_clusters"] = None
        model_kwargs["distance_threshold"] = threshold

    # determine clusters
    clusters = AgglomerativeClustering(**model_kwargs)
    clusters.fit(df)

    # retrieve cluster labels and distances
    labels = clusters.labels_
    distances = clusters.distances_

    # count the samples under each merge node: indices below n_samples are
    # original observations, larger ones refer to previously created merges
    counts = np.zeros(clusters.children_.shape[0])
    n_samples = len(labels)

    for i, merge in enumerate(clusters.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([clusters.children_, distances, counts]).astype(float)

    return (labels, distances, linkage_matrix)

In [None]:
# automate categorical feature detection

def elbow_plot(df, nmax_clusters, categorical = None):
    """Plot the KPrototypes cost for every k from 2 to ``nmax_clusters``.

    Parameters
    ----------
    df : data passed as-is to ``KPrototypes.fit``.
    nmax_clusters : largest number of clusters to evaluate (inclusive).
    categorical : positional indices of the categorical columns of ``df``;
        defaults to ``[2]`` (the third column), the value previously hard-coded.
    """
    if categorical is None:
        categorical = [2]

    n_clusters = np.arange(2, nmax_clusters + 1)
    cost_vals = []

    for n in n_clusters:
        kp_clusters = KPrototypes(n_clusters = n, init = "Huang", random_state = 15)
        kp_clusters.fit(df, categorical = list(categorical))
        cost_vals.append(kp_clusters.cost_)

    # cost as a function of the number of clusters
    plt.subplots(figsize=(8, 8))
    sns.lineplot(x = n_clusters, y = cost_vals)

___
___
KP 1 - Age, EducDeg, Children (k = 4)

In [None]:
#kp1_sociodem = pd.concat([df_scaled[["Age", "EducDeg"]], df["Children"]], axis = 1)

In [None]:
#elbow_plot(kp1_sociodem, 15)

In [None]:
#kp1_clusters = KPrototypes(n_clusters = 4, init = "Huang", random_state = 15)
#kp1_clusters.fit(kp1_sociodem, categorical = [2])
#kp1_labels = kp1_clusters.labels_
#kp1_centroids = kp1_clusters.cluster_centroids_
#kp1_linkage = hierarchy.linkage(kp1_centroids, method = "ward")

In [None]:
#hierarchy.dendrogram(kp1_linkage, color_threshold = 2.0);

In [None]:
#df["Cluster"] = kp1_labels
#df.groupby("Cluster").mean().sort_values("Age")

___
___
KP2 - Age, EducDeg, Children (k = 6)

In [None]:
#kp2_clusters = KPrototypes(n_clusters = 6, init = "Huang", random_state = 15)
#kp2_clusters.fit(kp1_sociodem, categorical = [2])
#kp2_labels = kp2_clusters.labels_
#kp2_centroids = kp2_clusters.cluster_centroids_

In [None]:
#kp2_linkage = hierarchy.linkage(kp2_centroids, method = "ward")
#hierarchy.dendrogram(kp2_linkage, color_threshold = 2.0);

In [None]:
#df["Cluster"] = kp2_labels
#df.groupby("Cluster").mean().sort_values("Age")

___
___
KMeans 2 - Age, EducDeg, Children

KMeans + Hierarchical Clustering produced very similar results, so might as well go with this (simpler) approach.

In [None]:
km1_sociodem = df_scaled[["Age", "EducDeg", "Children"]]
# km2_sociodem = pd.concat([df_scaled[["Age", "EducDeg"]], df["Children"]], axis = 1)

In [None]:
nmax_clusters = 15
n_clusters = np.arange(2, nmax_clusters + 1)
inertia_vals = []
    
for n in n_clusters:
    km_clusters = KMeans(n_clusters = n, random_state = 15)
    km_clusters.fit(km1_sociodem)
    inertia = km_clusters.inertia_
    inertia_vals.append(inertia)
        
plt.subplots(figsize=(8, 8))
sns.lineplot(x = np.arange(2, nmax_clusters + 1), y = inertia_vals)

# elbow plot indicates 6-8 clusters

In [None]:
# final KMeans solution with 7 clusters
# random_state fixed (same seed as the elbow-plot loop) so that the cluster
# labels are reproducible between runs
km1_clusters = KMeans(n_clusters = 7, random_state = 15)
km1_clusters.fit(km1_sociodem)
km1_labels = km1_clusters.labels_

# attach the labels to the unscaled dataframe and profile each cluster by its means
df["Cluster"] = km1_labels
df.groupby("Cluster").mean().sort_values("Age")