In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The dataset analysed is called "Sweden Insurance". It contains:
* the variable x, the explanatory variable, called "Number of Claims"
* and the variable y, the explained variable, called "Total Payments".

Here, we are going to construct the model that best explains Total Payments using Number of Claims as the input variable.
Let's do this!

First, we load main libraries as well as dataset:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
# Apply seaborn's 'darkgrid' style to the plots drawn after this call.
sbn.set_style('darkgrid')

In [None]:
# Location of the CSV inside the Kaggle input directory.
DATA_PATH = r'/kaggle/input/auto-insurance-in-sweden/swedish_insurance.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Print the column dtypes, non-null counts and number of rows of the loaded frame.
dataset.info()

After loading the dataset, we can see that there are no null or empty values. The next task is checking for and removing duplicated rows:

In [None]:
# Keep only the first occurrence of each fully identical row.
is_repeated_row = dataset.duplicated()
dataset = dataset[~is_repeated_row]
# Show the summary again so the row count can be compared with the one before de-duplication.
dataset.info()

In [None]:
# Give the raw 'X' / 'Y' columns descriptive names.
column_names = {'X': 'Num_Claims', 'Y': 'Total_payments'}
dataset = dataset.rename(columns = column_names)
# Predictor and target are kept as single-column DataFrames (2-D).
X = dataset['Num_Claims'].to_frame()
y = dataset['Total_payments'].to_frame()

Since the number of rows is unchanged (63), we can conclude that there were no duplicate rows in the initial dataset.
Secondly, we are using a pairplot for:
* Checking the relationship between the two variables.
* Checking the distribution of X and y.

In [None]:
# Pairwise plots of every column in `dataset`: relationships between columns plus each column's own distribution.
sbn.pairplot(dataset)

We can see in the previous plot that: 
* A linear relationship may exist.
* The variables are not normally distributed.
* There are some values that could be considered as outliers.

1. Baseline model. 

Prior to any modification of the dataset, we create what could be considered a naive or baseline regressor. That is to say, a model fitted on the "as is" version of the dataset with no hyperparameters modified.

We load required functions from Sklearn.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

Now, we create a Kfold element and cross validation score:


In [None]:
# 2-fold cross-validation repeated 15 times (30 R2 scores); the fixed seed keeps the splits reproducible.
cv = RepeatedKFold(n_splits = 2, n_repeats = 15, random_state = 0)
# R2 of a plain linear regression on the untouched data, one score per fold.
baseline_score = cross_val_score(LinearRegression(), X, y, scoring = 'r2', cv = cv, n_jobs = -1)
# Pass the scores explicitly as `y`: the meaning of the first positional argument of
# seaborn.boxplot has changed between releases, whereas `y=` always yields a vertical box.
sbn.boxplot(y = baseline_score).set_title('R2 baseline model')
print('Average R2 of baseline model: ', np.mean(baseline_score))

Results of the baseline model: R2 is 77% on average and the IQR ranges between 50% and 90% approximately.

Outliers. Here we use a boxplot in order to double-check what seemed to be outliers in the pairplot. 

In [None]:
# Reshape to long form ('variable' / 'value' columns) so each original column gets its own box.
long_dataset = dataset.melt()
sbn.boxplot(data = long_dataset, x = 'variable', y = 'value')

As we spotted in the previous graph, there are some X values that could be considered as outliers. 

One strategy can consist in erasing those values in order to better fit a model. However, since the dataset has only 63 rows, we should first consider what % we would be missing in case of dropping them.

As a rule of thumb, we could consider as an outlier any point beyond the 75th percentile. However, I have considered different thresholds:


In [None]:
# For each candidate quantile threshold, compute the percentage of rows that would be
# discarded if every X value strictly above that quantile were treated as an outlier.
number_outliers = []
limits = [0.75, 0.80, 0.90, 0.95, 0.99]
for limit in limits:
    # Boolean flag per row: True where X lies above the threshold.
    are_outliers = (X > np.quantile(X, limit)).to_numpy()
    # The mean of a boolean array is the share of True values.
    outlier_share = float(are_outliers.mean())
    # Scale to percent BEFORE rounding: round(share, 2) * 100 can reintroduce
    # floating-point noise such as 7.000000000000001.
    number_outliers.append(round(100.0 * outlier_share, 0))

number_outliers

The previous result means that if we consider and erase data values beyond: 
1. 75th percentile, we sacrifice 24% of our dataset.
2. 80th percentile, we sacrifice 21% of our dataset.
3. 90th percentile, we sacrifice 11% of our dataset.
4. 95th percentile, we sacrifice 6% of our dataset.
5. 99th percentile, we sacrifice 2% of our dataset.

Here, we consider as outliers the values beyond the 95th percentile of Number of Claims, meaning that we are sacrificing 6% of the available data.

In [None]:
# Flag the rows whose X value lies strictly above the 95th percentile of X.
is_outlier = (X > np.quantile(X, 0.95)).to_numpy()
# Indexing with the 2-D boolean mask yields flat 1-D arrays holding only the rows that are kept;
# the same mask is applied to y so that X and y stay aligned.
X_noout = X.to_numpy()[~is_outlier]
y_noout = y.to_numpy()[~is_outlier]


In [None]:
# Same resampling scheme as the baseline so that the two score distributions are comparable.
cv = RepeatedKFold(n_repeats = 15, n_splits = 2, random_state = 0)
# The trimmed arrays are 1-D; reshape both into single-column 2-D arrays before fitting.
features_noout = X_noout.reshape(-1, 1)
target_noout = y_noout.reshape(-1, 1)
nooutlier_score = cross_val_score(LinearRegression(), features_noout, target_noout, cv = cv, scoring = 'r2', n_jobs = -1)
# Side-by-side boxes: first entry is the baseline, second the no-outliers model.
comparison_ax = sbn.boxplot(data = [baseline_score, nooutlier_score])
comparison_ax.set_title('R2 of baseline model (left), and no-outliers model (right)')

As we can see in the previous boxplot, eliminating extreme values in this example results in underfitting of the model.
For that reason, we will use the complete dataset when building the different models.

The following step will consist in finding the best regressors and comparing their goodness.

2. Lasso model

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for the Lasso.
# alpha = 0 is deliberately left out: it is plain ordinary least squares (already covered by the
# baseline LinearRegression), and scikit-learn advises against using Lasso with alpha = 0
# for numerical reasons.
params_lasso = {'alpha': [0.05, 0.10, 0.20, 0.30, 0.50, 0.70, 1, 2, 3, 4, 5, 6, 8, 10, 12, 15, 18, 19, 20]}
# Exhaustive search over the grid, scored by R2 with 5-fold cross-validation;
# refit = True retrains the best configuration on the whole dataset.
grid_lasso = GridSearchCV(estimator = Lasso(), param_grid = params_lasso, scoring = 'r2', cv = 5, refit = True, verbose = 3, n_jobs = -1)
grid_lasso.fit(X, y)
print('Best Lasso parameter: ', grid_lasso.best_params_)

3. RandomForestRegressor:

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Hyperparameter grid for the random forest.
# The criterion names 'mse' / 'mae' were renamed to 'squared_error' / 'absolute_error' in
# scikit-learn 1.0 and the old spellings were removed in 1.2, so the new names are used here
# (requires scikit-learn >= 1.0).
params_forest = {'n_estimators': [50, 100, 150],
                 'criterion': ['squared_error', 'absolute_error'],
                 'max_depth': [2, 5, 10, 20, 30],
                 'min_samples_split': [2, 5, 8, 10, 15, 20, 25, 30, 50],
                 'min_samples_leaf': [2, 3, 4, 5, 6, 10, 20]
                }
# A fixed random_state makes the forest, and therefore the selected parameters, reproducible
# from one run to the next.
grid_forest = GridSearchCV(estimator = RandomForestRegressor(random_state = 0), param_grid = params_forest, scoring = 'r2', cv = 5, refit = True, verbose = 3, n_jobs = -1)
# The forest expects a 1-D target; flattening the single-column frame avoids the
# DataConversionWarning raised for column-vector targets.
grid_forest.fit(X, y.to_numpy().ravel())
print('Best Random Forest parameter: ', grid_forest.best_params_)


Now that we have constructed the three models (baseline, Lasso and RandomForest), we compare them:

In [None]:
# Candidate models, using the hyperparameters hard-coded from the grid searches.
# 'absolute_error' is the current name of the former 'mae' criterion (renamed in scikit-learn 1.0,
# old name removed in 1.2); random_state makes the forest's scores reproducible.
models = [
    ('Baseline', LinearRegression()),
    ('Lasso', Lasso(alpha = 18)),
    ('RandomForest', RandomForestRegressor(criterion = 'absolute_error', max_depth = 5, min_samples_leaf = 3, min_samples_split = 15, n_estimators = 50, random_state = 0)),
]
results = []
names = []
scoring = 'r2'
# One splitter shared by all models: with an integer random_state it produces the same
# splits on every use, so each model is scored on identical folds.
kfold = RepeatedKFold(n_splits = 2, n_repeats = 10, random_state = 0)
# 1-D target: avoids the column-vector warning from the forest and does not change the
# scores of the linear models.
y_flat = y.to_numpy().ravel()
for name, model in models:
    cv_results = cross_val_score(model, X, y_flat, cv = kfold, scoring = scoring)
    results.append(cv_results)
    names.append(name)
    print('R2 of: ',name, '= ',cv_results.mean())

In [None]:
# One column per model, one row per cross-validation score.
# The columns are labelled at construction time: DataFrame.set_axis(..., inplace=True) was
# removed in pandas 2.0 and raises a TypeError there.
results_final = pd.DataFrame(results, index = names).T
# Long form ('variable' = model name, 'value' = R2) so that each model gets its own box.
sbn.boxplot(x = 'variable', y = 'value', data = pd.melt(results_final))

We have considered three possible models: 
1. Simple Linear Regression
2. Lasso Regression
3. Random Forest Regression

and by searching the best parameters and conducting a kfold cross validation, we can conclude that, in this case, the baseline model is the best one in terms of R2. 