In [1]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the raw marketing customer-analysis CSV into `data`.
# NOTE(review): this is an absolute path on one machine, so the cell will not run
# elsewhere as-is — consider a configurable data directory.
csv_path = "/Users/mariasoriano/lab-customer-analysis-round-2/files_for_lab/csv_files/marketing_customer_analysis.csv"
data = pd.read_csv(csv_path)
# Trailing expression: show (rows, columns) of the loaded frame.
data.shape

(10910, 26)

In [3]:
def standarizate_cols(df):
    '''
    Standardizes the column names of a DataFrame.
    - Sets column names to lowercase.
    - Replaces each space with '_'.

    The rename is applied to ``df`` itself and the same object is returned,
    so every existing reference to that frame also sees the new names.

    Args:
        df: The dataframe whose column names are to be standardized.

    Returns:
        The same dataframe object, with standardized column names.
    '''
    # Build the new names from `df` itself rather than from the notebook-level
    # `data` variable, so the function works for whichever frame is passed in.
    # regex=False: the space is a literal character, not a pattern.
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    return df

In [4]:
# standarizate_cols renames the columns on the frame it receives and returns that
# same object, so `data_standarized` and `data` are one frame with the new names.
data_standarized = standarizate_cols(df=data)

In [5]:
# Drop every row that has a missing value in any of the columns listed below.
# `data_standarized` is the same object as `data` (see the call that created it).
data1 = data_standarized.dropna(
    subset=[
        'response',
        'months_since_last_claim',
        'vehicle_class',
        'vehicle_size',
        'vehicle_type',
    ]
)

In [6]:
# Keep only the numeric columns of the cleaned frame, then preview the first five rows.
numerical = data1.select_dtypes(include="number")
numerical.head(5)

Unnamed: 0,unnamed:_0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
2,2,14947.9173,22139,100,34.0,31,0.0,2,480.0
3,3,22332.43946,49078,97,10.0,3,0.0,2,484.013411
10,10,5154.764074,82730,127,13.0,31,0.0,1,442.521087
13,13,5454.587929,66334,69,25.0,25,0.0,4,331.2
16,16,5326.677654,76717,66,26.0,10,0.0,6,300.528579


In [7]:
# Keep only the object-dtype (text) columns of the cleaned frame, then preview five rows.
categorical = data1.select_dtypes(include="object")
categorical.head(5)

Unnamed: 0,customer,state,response,coverage,education,effective_to_date,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size,vehicle_type
2,LZ68649,Washington,No,Basic,Bachelor,2/10/11,Employed,M,Suburban,Single,Personal Auto,Personal L3,Offer3,Call Center,SUV,Medsize,A
3,XL78013,Oregon,Yes,Extended,College,1/11/11,Employed,M,Suburban,Single,Corporate Auto,Corporate L3,Offer2,Branch,Four-Door Car,Medsize,A
10,HG93801,Arizona,No,Extended,High School or Below,1/2/11,Employed,M,Urban,Married,Corporate Auto,Corporate L3,Offer2,Branch,SUV,Large,A
13,KR82385,California,No,Basic,Master,1/26/11,Employed,M,Suburban,Single,Personal Auto,Personal L3,Offer4,Call Center,Two-Door Car,Medsize,A
16,FH51383,California,No,Basic,High School or Below,2/7/11,Employed,F,Urban,Married,Personal Auto,Personal L3,Offer4,Call Center,Two-Door Car,Large,A


In [8]:
from sklearn.preprocessing import OneHotEncoder

def hot_encoder(col):
    '''
    One-hot encode categorical features.

    Args:
        col: A DataFrame holding the column(s) to encode (2-D input,
            e.g. ``df[['state']]``).

    Returns:
        A DataFrame of indicator columns, one per (input column, category)
        pair, labelled ``"<column>_<category>"``. The result gets a fresh
        default integer index; the index of ``col`` is not carried over.
    '''
    enc = OneHotEncoder(handle_unknown='ignore')
    # The encoder returns a sparse matrix; densify it for the DataFrame.
    onehotlabels = enc.fit_transform(col).toarray()
    # enc.categories_ is a list with one array per input column. Handing that list
    # directly to `columns=` makes pandas build tuple-valued (MultiIndex) labels and
    # breaks when several columns are encoded at once, so build flat string labels.
    labels = [
        f'{name}_{category}'
        for name, categories in zip(col.columns, enc.categories_)
        for category in categories
    ]
    return pd.DataFrame(onehotlabels, columns=labels)

# Model Improvement
- Concat categorical features
- Define a function to check model's performance
- Apply MinMaxScaler transformation

### Concat categorical features

In [9]:
# Remove the `customer` column before encoding (it looks like a unique ID per row).
# Rebinding the name instead of using `inplace=True` avoids the SettingWithCopyWarning
# raised by the in-place drop, and errors='ignore' keeps the cell re-runnable.
categorical = categorical.drop(columns=['customer'], errors='ignore')

# One-hot encode each remaining column separately, then join all encoded blocks with
# a single concat instead of growing a DataFrame inside the loop.
encoded_blocks = [hot_encoder(categorical[[col]]) for col in categorical.columns]
concat_categorical = (
    pd.concat(encoded_blocks, axis=1) if encoded_blocks else pd.DataFrame()
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Define a function to check model's performance

In [10]:
# NOTE(review): this repeats the `standarizate_cols` definition from cell In [3] and,
# being later in the notebook, replaces it; one of the two copies can be removed.
def standarizate_cols(df):
    '''
    Standardizes the column names of a DataFrame.
    - Sets column names to lowercase.
    - Replaces each space with '_'.

    The rename is applied to ``df`` itself and the same object is returned,
    so every existing reference to that frame also sees the new names.

    Args:
        df: The dataframe whose column names are to be standardized.

    Returns:
        The same dataframe object, with standardized column names.
    '''
    # Build the new names from `df` itself rather than from the notebook-level
    # `data` variable, so the function works for whichever frame is passed in.
    # regex=False: the space is a literal character, not a pattern.
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    return df

### Apply MinMaxScaler transformation
Before these improvements, the results were the following:

**train R2: 0.5249336498476844 / test R2: 0.48896093636449667**

The test R2 obtained below is higher, so we can see that the model has improved this time.

In [12]:
def LinearRegressionPerformance(X, y, test_size=0.33, random_state=1):
    '''
    Fit a linear regression on a train split and print its metrics on the test split.

    Args:
        X: The feature matrix.
        y: The target values, one row per row of X.
        test_size: Fraction of the rows held out for evaluation (default 0.33).
        random_state: Seed used for the train/test split (default 1).

    Returns:
        None. R2, MSE, RMSE and MAE computed on the test split are printed.
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    y_true = y_test

    # The `squared` keyword of mean_squared_error is deprecated in newer scikit-learn
    # releases, so it is not used here. MSE is the default call; RMSE takes the root
    # of each output's MSE and averages them, which is the value `squared=False` gave.
    mse = mean_squared_error(y_true, y_pred)
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    rmse = float(np.sqrt(per_output_mse).mean())

    print("The R2 is: ",r2_score(y_true, y_pred))
    print("The MSE is: ",mse)
    print("The RMSE is: ",rmse)
    print("The MAE is: ",mean_absolute_error(y_true, y_pred))

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Target is the total claim amount (kept as a one-column frame); the features are
# all remaining numeric columns.
# NOTE(review): `unnamed:_0` mirrors the row index in the preview above and is still
# among the features — confirm that this is intended.
y = numerical[['total_claim_amount']]
X = numerical.drop(columns=['total_claim_amount'])

# Min-max scale the numeric features and wrap the result back into a DataFrame.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
MinMaxtransformer = MinMaxScaler()
minmax_X = MinMaxtransformer.fit_transform(X)
X_minmax = pd.DataFrame(data=minmax_X, columns=X.columns)

# Final feature matrix: one-hot categorical columns followed by the scaled numeric ones.
# Both frames carry a default integer index, so they line up row by row.
X = pd.concat(objs=[concat_categorical, X_minmax], axis=1)

LinearRegressionPerformance(X, y)

The R2 is:  0.7589350425914109
The MSE is:  20440.654931832883
The RMSE is:  142.97081846248514
The MAE is:  99.58571624400001


