In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("../../data_files/csv_files/marketing_customer_analysis.csv")
data.columns = [i.lower().replace(" ","_") for i in data.columns]
y_data = data.total_claim_amount
X_data = data.drop(["total_claim_amount", "customer"], axis=1)

### Base Model : Only considering monthly_premium_auto

In [2]:
X_data = X_data[['customer_lifetime_value', 'income', 'monthly_premium_auto', 
       'coverage', 'education', 'employmentstatus', 'location_code',
       'marital_status', 'vehicle_class']]
X_categorical = X_data.select_dtypes(exclude=np.number)

In [3]:
y = y_data
X = X_data[["monthly_premium_auto"]]
X = scaler = StandardScaler().fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [4]:
lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

In [5]:
print(f'Base model rmse: {rmse}')
print(f'Error percentage: {error_percentage}')
print(f'Base model r2 score: {r2}')

Base model rmse: 223.17213469912434
Error percentage: 0.5214128846017893
Base model r2 score: 0.38427731771068707


### Model 2 : All numerical columns

In [6]:
def transformation(col):
    return np.log(col +1)

In [7]:
y_data = data.total_claim_amount
X_data = data.drop(["total_claim_amount", "customer"], axis=1)
X_data = X_data.select_dtypes(include=np.number)
X_data["customer_lifetime_value"] = X_data.customer_lifetime_value.map(transformation)

In [8]:
y = y_data
X = X_data
#X = scaler = StandardScaler().fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

print(f'Base model rmse: {rmse}')
print(f'Error percentage: {error_percentage}')
print(f'Base model r2 score: {r2}')

Base model rmse: 199.36770609505658
Error percentage: 0.4657969099574735
Base model r2 score: 0.5086229809228611


### Model 3 : Only numerical columns with some correlation

In [10]:
def transformation(col):
    return np.log(col +1)
X_data["customer_lifetime_value"] = X_data.customer_lifetime_value.map(transformation)

In [11]:
y = y_data
X = X_data.select_dtypes(include=np.number)
X = scaler = StandardScaler().fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
## Fit and test the model

lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

In [13]:
print(f'Rmse: {np.round(rmse,10)}')
print(f'Error percentage: {np.round(error_percentage,10)}')
print(f'Base model r2 score: {np.round(r2,10)}')

Rmse: 199.3647455523
Error percentage: 0.465789993
Base model r2 score: 0.5086375744


### Model 4 : Dropping outliers in monthly_premium_auto

In [14]:
def get_outliers_indices(column_name, data = data):
    q1 = np.percentile(data[column_name], 25)
    q3 = np.percentile(data[column_name], 75) 
    iqr = q3-q1
    condition_1 = data[column_name] > q1 - 1.5* iqr
    condition_2 = data[column_name] < (q3 + 1.5 * iqr)
    
    ## You need to negate the interception of both conditions
    return (data[column_name][~(condition_1 & condition_2)].index)

In [15]:
y_data = data.total_claim_amount
X_data = data.drop(["total_claim_amount", "customer"], axis=1)
X_data = X_data.select_dtypes(include=np.number)
X_data["customer_lifetime_value"] = X_data.customer_lifetime_value.map(transformation)
droping_indices = get_outliers_indices("monthly_premium_auto")

In [16]:
y = y_data
X = X_data.select_dtypes(include=np.number)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## I only wanted to drop in the training data so the testing data is not compromised
X_train = X_train.drop(droping_indices, errors='ignore')
y_train = y_train.drop(droping_indices, errors='ignore')

## It is easier to scale after the droping, because the drop is made within the pandas library
X_train = scaler = StandardScaler().fit_transform(X_train, y_train)
X_test = scaler = StandardScaler().fit_transform(X_test, y_test)

In [17]:
## Fit and test the model

lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

In [18]:
print(f'Rmse: {np.round(rmse,10)}')
print(f'Error percentage: {np.round(error_percentage,5)}')
print(f'R2 score: {np.round(r2,5)}')

Rmse: 210.5927019346
Error percentage: 0.49202
R2 score: 0.45173


### Model 5 :  Droping from the target column

In [19]:
y_data = data.total_claim_amount
X_data = data.drop(["total_claim_amount", "customer"], axis=1)
X_data = X_data.select_dtypes(include=np.number)
X_data["customer_lifetime_value"] = X_data.customer_lifetime_value.map(transformation)
droping_indices = get_outliers_indices("total_claim_amount")

In [20]:
y = y_data
X = X_data.select_dtypes(include=np.number)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## I only wanted to drop in the training data so the testing data is not compromised
X_train = X_train.drop(droping_indices, errors='ignore')
y_train = y_train.drop(droping_indices, errors='ignore')

## It is easier to scale after the droping, because the drop is made within the pandas library
X_train = scaler = StandardScaler().fit_transform(X_train, y_train)
X_test = scaler = StandardScaler().fit_transform(X_test, y_test)

In [21]:
## Fit and test the model

lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

In [22]:
print(f'Rmse: {np.round(rmse,2)}')
print(f'Error percentage: {np.round(error_percentage,4)}')
print(f'R2 score: {np.round(r2,4)}')

Rmse: 218.17
Error percentage: 0.5097
R2 score: 0.4116


### Model 6 : Adding the categorical values

In [23]:
X_categorical.head()
encoder = OneHotEncoder(drop="first").fit(X_categorical)
encoded = encoder.transform(X_categorical)

In [24]:
y = y_data
X = X_data.select_dtypes(include=np.number)
X = pd.concat([X, pd.DataFrame(encoded.todense())], axis=1)
X = scaler = StandardScaler().fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [25]:
lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

In [26]:
print(f'Base model rmse: {rmse}')
print(f'Error percentage: {error_percentage}')
print(f'Base model r2 score: {r2}')

Base model rmse: 138.34262758309882
Error percentage: 0.32321969142224477
Base model r2 score: 0.7633986285465483


### Model 7 : Drop categorical with some collinearity

In [27]:
encoder = OneHotEncoder(drop="first").fit(X_categorical)
encoded = encoder.transform(X_categorical)

y = y_data
X = X_data.select_dtypes(include=np.number)
X = pd.concat([X, pd.DataFrame(encoded.todense()).drop([6,12],axis=1)], axis=1)
X = scaler = StandardScaler().fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

print(f'Base model rmse: {rmse}')
print(f'Error percentage: {error_percentage}')
print(f'Base model r2 score: {r2}')

Base model rmse: 138.3609055245826
Error percentage: 0.3232623954731178
Base model r2 score: 0.7633361044817406


### Model 8 : Change test_size to 40

In [28]:
encoder = OneHotEncoder(drop="first").fit(X_categorical)
encoded = encoder.transform(X_categorical)
X_categorical.head()
encoder = OneHotEncoder(drop="first").fit(X_categorical)
encoded = encoder.transform(X_categorical)
pd.DataFrame(encoded.todense())

y = y_data
X = X_data.select_dtypes(include=np.number)
X = pd.concat([X, pd.DataFrame(encoded.todense())], axis=1)
X = scaler = StandardScaler().fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)
lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

print(f'Model rmse: {rmse}')
print(f'Error percentage: {error_percentage}')
print(f'Base model r2 score: {r2}')

Model rmse: 137.85119864129055
Error percentage: 0.31999711360883104
Base model r2 score: 0.7666184937340421


### Model 9 : Change test_size to 58

In [29]:
X_categorical.head()
encoder = OneHotEncoder(drop="first").fit(X_categorical)
encoded = encoder.transform(X_categorical)
X_categorical.head()
encoder = OneHotEncoder(drop="first").fit(X_categorical)
encoded = encoder.transform(X_categorical)
pd.DataFrame(encoded.todense())

y = y_data
X = X_data.select_dtypes(include=np.number)
X = pd.concat([X, pd.DataFrame(encoded.todense())], axis=1)
X = scaler = StandardScaler().fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.58, random_state=42)
lm = LinearRegression().fit(X=X_train,y=y_train)
prediction = lm.predict(X_test)

mse = metrics.mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
error_percentage = rmse / y_test.mean()
r2 = metrics.r2_score(y_test,prediction)

print(f'Model rmse: {rmse}')
print(f'Error percentage: {error_percentage}')
print(f'Base model r2 score: {r2}')

Model rmse: 138.2932152388424
Error percentage: 0.3208512536707973
Base model r2 score: 0.7705486910243695
