In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

%matplotlib inline

pd.set_option('display.max_columns', None)

In [2]:
customer_df = pd.read_csv('files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv')
customer_df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,Suburban,Married,69,32,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,Suburban,Single,94,13,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,Suburban,Married,108,18,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,Suburban,Married,106,18,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,Rural,Single,73,12,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


# Cleaning

In [3]:
new_columns = []
for i in customer_df.columns:
    i = i.lower().replace(' ', '_')
    new_columns.append(i)

customer_df.columns = new_columns
customer_df.columns

Index(['customer', 'state', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'employmentstatus', 'gender',
       'income', 'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size'],
      dtype='object')

In [4]:
customer_df['employment_status'] = customer_df.employmentstatus
customer_df.drop(['employmentstatus'],axis=1)

Unnamed: 0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,gender,income,location_code,marital_status,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size,employment_status
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,F,56274,Suburban,Married,69,32,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize,Employed
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,F,0,Suburban,Single,94,13,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize,Unemployed
2,AI49188,Nevada,12887.431650,No,Premium,Bachelor,2/19/11,F,48767,Suburban,Married,108,18,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize,Employed
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,M,0,Suburban,Married,106,18,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize,Unemployed
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,M,43836,Rural,Single,73,12,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize,Employed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,23405.987980,No,Basic,Bachelor,2/10/11,M,71941,Urban,Married,73,18,89,0,2,Personal Auto,Personal L1,Offer2,Web,198.234764,Four-Door Car,Medsize,Employed
9130,PK87824,California,3096.511217,Yes,Extended,College,2/12/11,F,21604,Suburban,Divorced,79,14,28,0,1,Corporate Auto,Corporate L3,Offer1,Branch,379.200000,Four-Door Car,Medsize,Employed
9131,TD14365,California,8163.890428,No,Extended,Bachelor,2/6/11,M,0,Suburban,Single,85,9,37,3,2,Corporate Auto,Corporate L2,Offer1,Branch,790.784983,Four-Door Car,Medsize,Unemployed
9132,UP19263,California,7524.442436,No,Extended,College,2/3/11,M,21941,Suburban,Married,96,34,3,0,3,Personal Auto,Personal L2,Offer3,Branch,691.200000,Four-Door Car,Large,Employed


In [5]:
customer_df.effective_to_date = pd.to_datetime(customer_df.effective_to_date)

In [6]:
customer_df['month'] = pd.DatetimeIndex(customer_df['effective_to_date']).month
customer_df['day'] = pd.DatetimeIndex(customer_df['effective_to_date']).day
customer_df['week'] = pd.DatetimeIndex(customer_df['effective_to_date']).week
customer_df.head()

Unnamed: 0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,location_code,marital_status,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size,employment_status,month,day,week
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2011-02-24,Employed,F,56274,Suburban,Married,69,32,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize,Employed,2,24,8
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,2011-01-31,Unemployed,F,0,Suburban,Single,94,13,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize,Unemployed,1,31,5
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2011-02-19,Employed,F,48767,Suburban,Married,108,18,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize,Employed,2,19,7
3,WW63253,California,7645.861827,No,Basic,Bachelor,2011-01-20,Unemployed,M,0,Suburban,Married,106,18,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize,Unemployed,1,20,3
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2011-02-03,Employed,M,43836,Rural,Single,73,12,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize,Employed,2,3,5


# Data categories separation

In [7]:
categorical = customer_df.select_dtypes(include=np.object)
categorical.columns

Index(['customer', 'state', 'response', 'coverage', 'education',
       'employmentstatus', 'gender', 'location_code', 'marital_status',
       'policy_type', 'policy', 'renew_offer_type', 'sales_channel',
       'vehicle_class', 'vehicle_size', 'employment_status'],
      dtype='object')

In [8]:
numerical = customer_df.select_dtypes(include=np.number)
numerical.columns

Index(['customer_lifetime_value', 'income', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'total_claim_amount',
       'month', 'day', 'week'],
      dtype='object')

In [9]:
numerical = numerical.drop('total_claim_amount',axis=1)
numerical.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,month,day,week
0,2763.519279,56274,69,32,5,0,1,2,24,8
1,6979.535903,0,94,13,42,0,8,1,31,5
2,12887.43165,48767,108,18,38,0,2,2,19,7
3,7645.861827,0,106,18,65,0,7,1,20,3
4,2813.692575,43836,73,12,44,0,1,2,3,5


In [10]:
def numerical_types(df, types):
    num_discrete = []
    num_continuous = []
    num = df.select_dtypes(include=np.number)
    for i in num.columns:
        if 'number' in i:
            num_discrete.append(i)
        else:
            num_continuous.append(i)
    if types == 'discrete':
        x = num[num_discrete]
    else:
        x = num[num_continuous]
    
    return x

In [13]:
discrete_df = numerical_types(customer_df,'discrete')
continuous_df = numerical_types(customer_df,'continuous')

In [14]:
categorical = categorical.drop(['customer'],axis=1)
new_categorical_dum = pd.get_dummies(categorical)
new_categorical_dum

Unnamed: 0,state_Arizona,state_California,state_Nevada,state_Oregon,state_Washington,response_No,response_Yes,coverage_Basic,coverage_Extended,coverage_Premium,education_Bachelor,education_College,education_Doctor,education_High School or Below,education_Master,employmentstatus_Disabled,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,gender_F,gender_M,location_code_Rural,location_code_Suburban,location_code_Urban,marital_status_Divorced,marital_status_Married,marital_status_Single,policy_type_Corporate Auto,policy_type_Personal Auto,policy_type_Special Auto,policy_Corporate L1,policy_Corporate L2,policy_Corporate L3,policy_Personal L1,policy_Personal L2,policy_Personal L3,policy_Special L1,policy_Special L2,policy_Special L3,renew_offer_type_Offer1,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Large,vehicle_size_Medsize,vehicle_size_Small,employment_status_Disabled,employment_status_Employed,employment_status_Medical Leave,employment_status_Retired,employment_status_Unemployed
0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
2,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
3,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
4,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0
9130,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
9131,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
9132,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0


In [15]:
new_categorical = new_categorical_dum

# Scaler

In [16]:
def scaling_methods(df,scaler):
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
    if scaler == 'standard':
        scaler = StandardScaler()
    elif scaler == 'minmax':
        scaler = MinMaxScaler()
    elif scaler == 'normalize':
        scaler = Normalizer()
    X_scaler = scaler.fit_transform(df)
    X_scaler = pd.DataFrame(X_scaler)
    X_scaler.columns = df.columns
    return X_scaler

In [17]:
standard_continuous_df = scaling_methods(continuous_df,'standard')
standard_continuous_df.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,total_claim_amount,month,day,week
0,-0.762878,0.612827,-0.703925,1.678099,-1.543287,-0.16964,1.075304,0.986776,0.207521
1,-0.149245,-1.239617,0.022691,-0.208186,-0.217334,2.400737,-0.929969,1.795716,-0.13092
2,0.710636,0.36571,0.429596,0.288205,-0.36068,0.455734,1.075304,0.408962,0.094707
3,-0.052263,-1.239617,0.371467,0.288205,0.606907,0.329769,-0.929969,0.524524,-0.356548
4,-0.755575,0.20339,-0.587666,-0.307465,-0.145661,-1.018843,1.075304,-1.440045,-0.13092


In [18]:
standard_discrete_df = scaling_methods(discrete_df,'standard')
standard_discrete_df.head()

Unnamed: 0,number_of_open_complaints,number_of_policies
0,-0.42225,-0.822648
1,-0.42225,2.10616
2,-0.42225,-0.404247
3,-0.42225,1.687759
4,-0.42225,-0.822648


# New scaled DF

In [19]:
new_data = pd.concat([standard_continuous_df, standard_discrete_df, new_categorical], axis=1)
new_data.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,total_claim_amount,month,day,week,number_of_open_complaints,number_of_policies,state_Arizona,state_California,state_Nevada,state_Oregon,state_Washington,response_No,response_Yes,coverage_Basic,coverage_Extended,coverage_Premium,education_Bachelor,education_College,education_Doctor,education_High School or Below,education_Master,employmentstatus_Disabled,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,gender_F,gender_M,location_code_Rural,location_code_Suburban,location_code_Urban,marital_status_Divorced,marital_status_Married,marital_status_Single,policy_type_Corporate Auto,policy_type_Personal Auto,policy_type_Special Auto,policy_Corporate L1,policy_Corporate L2,policy_Corporate L3,policy_Personal L1,policy_Personal L2,policy_Personal L3,policy_Special L1,policy_Special L2,policy_Special L3,renew_offer_type_Offer1,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Large,vehicle_size_Medsize,vehicle_size_Small,employment_status_Disabled,employment_status_Employed,employment_status_Medical Leave,employment_status_Retired,employment_status_Unemployed
0,-0.762878,0.612827,-0.703925,1.678099,-1.543287,-0.16964,1.075304,0.986776,0.207521,-0.42225,-0.822648,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
1,-0.149245,-1.239617,0.022691,-0.208186,-0.217334,2.400737,-0.929969,1.795716,-0.13092,-0.42225,2.10616,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
2,0.710636,0.36571,0.429596,0.288205,-0.36068,0.455734,1.075304,0.408962,0.094707,-0.42225,-0.404247,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
3,-0.052263,-1.239617,0.371467,0.288205,0.606907,0.329769,-0.929969,0.524524,-0.356548,-0.42225,1.687759,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
4,-0.755575,0.20339,-0.587666,-0.307465,-0.145661,-1.018843,1.075304,-1.440045,-0.13092,-0.42225,-0.822648,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0


# Model Prediction

In [20]:
def models_errors_process(df,list_models,y):
    from sklearn import linear_model
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    X = df.drop([y],axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, df[y], test_size=0.4, random_state=100)
    errors = []
    for model_name in list_models:
        if model_name == 'linear':
            lm = linear_model.LinearRegression()
            model = lm.fit(X_train,y_train)
            predictions_test = lm.predict(X_test)
            predictions_train = lm.predict(X_train)

        elif model_name == 'kneighbors':
            knn = KNeighborsRegressor(n_neighbors=4)
            model = knn.fit(X_train, y_train)
            predictions_test = knn.predict(X_test)
            predictions_train = knn.predict(X_train)
            
        elif model_name == 'mlpregressor':
            model = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)
            predictions_test = model.predict(X_test)
            prediction_train = model.predict(X_test)
        
        MAE_train = mean_absolute_error(y_train,predictions_train)
        MSE_train = mean_squared_error(y_train,predictions_train)
        RMSE_train = np.sqrt(MSE_train)
        R2_train = r2_score(y_train,predictions_train)

        MAE_test = mean_absolute_error(y_test,predictions_test)
        MSE_test = mean_squared_error(y_test,predictions_test)
        RMSE_test = np.sqrt(MSE_test)
        R2_test = r2_score(y_test,predictions_test)
        
        errors.append((MAE_train,MSE_train,RMSE_train,R2_train,
                 MAE_test, MSE_test, RMSE_test, R2_test))

    errors = np.array(errors)
    result_df = pd.DataFrame(errors, columns=['MAE_train','MSE_train','RMSE_train','R2_train','MAE_test','MSE_test','RMSE_test', 'R2_test']).T
    result_df.columns = list_models

    return result_df

f = new_data.copy()
models_errors_process(f,['linear','kneighbors','mlpregressor'],'total_claim_amount')

Unnamed: 0,linear,kneighbors,mlpregressor
MAE_train,0.327608,0.300451,0.300451
MSE_train,0.23567,0.219936,0.219936
RMSE_train,0.485459,0.468973,0.468973
R2_train,0.772989,0.788145,0.788145
MAE_test,0.326064,0.388424,0.367139
MSE_test,0.217117,0.331738,0.25958
RMSE_test,0.465958,0.575967,0.50949
R2_test,0.769416,0.647686,0.724319
