# Import dependencies and load files

In [497]:
import pandas as pd
import urchin_clean as uc
import urchin_prep as up

In [498]:
df=pd.read_csv(r"C:\Users\MichaelTaylo_c9zoof3\Documents\GitHub\Ironhack-Labs\Case Studies\Customer Analysis Case Study\Data\Data_Marketing_Customer_Analysis_Round3.csv")

# Data cleanup

## Remove duplicate records

In [499]:
df=up.e_remove_duplicates(df)

Rows before drop 10602
Rows after drop 10602


## Split numerical and categorical data

In [500]:
numericals=df._get_numeric_data()
categoricals=df.select_dtypes(object)

## Check missing values

In [501]:
uc.a_count_missing_values(numericals)

Unnamed: 0,column,# missing values,% missing values
0,customer_lifetime_value,0,0.0
1,income,0,0.0
2,monthly_premium_auto,0,0.0
3,months_since_last_claim,0,0.0
4,months_since_policy_inception,0,0.0
5,number_of_open_complaints,0,0.0
6,number_of_policies,0,0.0
7,total_claim_amount,0,0.0


In [502]:
uc.a_count_missing_values(categoricals)

Unnamed: 0,column,# missing values,% missing values
0,region,0,0.0
1,response,0,0.0
2,coverage,0,0.0
3,education,0,0.0
4,effective_to_date,0,0.0
5,employment_status,0,0.0
6,gender,0,0.0
7,location_code,0,0.0
8,marital_status,0,0.0
9,policy_type,0,0.0


## Check categoricals for consistent naming

In [503]:
uc.c_check_categorical(categoricals)

{'region': array(['arizona', 'california', 'washington', 'oregon', 'nevada'],
       dtype=object),
 'response': array(['no', 'yes'], dtype=object),
 'coverage': array(['basic', 'extended', 'premium'], dtype=object),
 'education': array(['college', 'bachelor', 'high school or below', 'doctor', 'master'],
       dtype=object),
 'effective_to_date': array(['2/18/11', '1/18/11', '2/10/11', '1/11/11', '1/17/11', '2/14/11',
        '2/24/11', '1/19/11', '1/4/11', '1/2/11', '2/7/11', '1/31/11',
        '1/26/11', '2/28/11', '1/16/11', '2/26/11', '2/23/11', '1/15/11',
        '2/2/11', '2/15/11', '1/24/11', '2/21/11', '2/22/11', '1/7/11',
        '1/28/11', '2/8/11', '2/12/11', '2/20/11', '1/5/11', '2/19/11',
        '1/3/11', '2/3/11', '1/22/11', '1/23/11', '2/5/11', '2/13/11',
        '1/25/11', '2/16/11', '2/1/11', '1/27/11', '1/12/11', '1/20/11',
        '2/6/11', '2/11/11', '1/21/11', '1/29/11', '1/9/11', '2/9/11',
        '2/27/11', '1/1/11', '2/17/11', '2/25/11', '1/13/11', '1/6/11',
 

## Drop column(s) that aren't needed

In [504]:
uc.b_drop_by_input(categoricals,['effective_to_date'])

Unnamed: 0,region,response,coverage,education,employment_status,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,arizona,no,basic,college,employed,m,suburban,married,corporate auto,corporate l3,offer3,agent,four-door car,medsize
1,california,no,basic,college,unemployed,f,suburban,single,personal auto,personal l3,offer4,call center,four-door car,medsize
2,washington,no,basic,bachelor,employed,m,suburban,single,personal auto,personal l3,offer3,call center,suv,medsize
3,oregon,yes,extended,college,employed,m,suburban,single,corporate auto,corporate l3,offer2,branch,four-door car,medsize
4,oregon,no,premium,bachelor,medical leave,f,suburban,married,personal auto,personal l2,offer1,branch,four-door car,medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10597,nevada,no,premium,bachelor,unemployed,f,suburban,married,personal auto,personal l1,offer3,web,luxury car,medsize
10598,oregon,no,basic,college,employed,f,urban,married,personal auto,personal l3,offer2,branch,four-door car,medsize
10599,arizona,no,extended,bachelor,employed,f,rural,married,corporate auto,corporate l3,offer1,web,luxury suv,medsize
10600,california,no,premium,college,employed,f,urban,divorced,personal auto,personal l1,offer1,branch,suv,medsize


# Remove columns not correlated with DV

In [505]:
def remove_IV_not_correlated_with_DV(numeric_dataframe,y_name,thresh,inplace=False):
    c= abs(numeric_dataframe.corr())
    #c

    #fig, ax = plt.subplots(figsize=(14,14))
    #sns.heatmap(c, annot=True);

    #dependent variable
    c_last = c[y_name].sort_values(ascending=False)
    #c_last
    c_thr = thresh
    cols_to_keep = list(c_last[c_last > c_thr].index)[1:] + [list(c_last[c_last > c_thr].index)[0]]
    cols_to_remove = list(c_last[c_last < c_thr].index)[1:]
    print("Features to drop",len(cols_to_remove))
    
    if inplace==False:
        return cols_to_keep, cols_to_remove
    else:
        print("Dropped:",cols_to_remove)
        return numeric_dataframe[cols_to_keep]

In [506]:
numericals_updated=remove_IV_not_correlated_with_DV(numericals,"total_claim_amount",.2,inplace=True)

Features to drop 4
Dropped: ['number_of_open_complaints', 'number_of_policies', 'months_since_last_claim', 'months_since_policy_inception']


# Do X-Y data split

In [507]:
def prep_xy(df_used,y_name,size_val=.3):
    
    #Prep X-Y Split
    y=df_used[y_name]
    x=df_used.drop(labels=y_name,axis=1)

    #Create train-test data using 'size_val' percent of test data
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=size_val,random_state=123)

    #Create transformer to normalize data
    transformer=col_transformer(x_train)

    #Normalize/scale data
    x_train_scaled=transformer.fit_transform(x_train)
    x_test_scaled=transformer.transform(x_test)
    feature_names=transformer.get_feature_names_out()
    x_train_scaled=pd.DataFrame(data=x_train_scaled,columns=feature_names)
    x_test_scaled=pd.DataFrame(data=x_test_scaled,columns=feature_names)
    return x_train,y_train,x_train_scaled,x_test_scaled,y_test,x_test

#Create ColumnTransformer to enable transforming of both numerical & categorical data
def col_transformer(x): 

    numerical_columns_selector = selector(dtype_exclude=object)
    categorical_columns_selector = selector(dtype_include=object)

    numerical_columns = numerical_columns_selector(x)
    categorical_columns = categorical_columns_selector(x)


    categorical_preprocessor = OneHotEncoder(drop='first',handle_unknown="ignore")
    numerical_preprocessor = StandardScaler()


    preprocessor = ColumnTransformer([('cat', categorical_preprocessor, categorical_columns),
                                      ('num', numerical_preprocessor, numerical_columns)],sparse_threshold=0)
    return preprocessor

In [508]:
df_combined=pd.concat([numericals_updated,categoricals],axis=1)

In [509]:
x_train,y_train,x_train_scaled,x_test_scaled,y_test,x_test=prep_xy(df_combined,'total_claim_amount',size_val=.3)

In [510]:
def sm_OLS(x_train_scaled,x_test_scaled,y_train):
    y_train=y_train.values.reshape(-1,1)
    x_train_const_scaled = sm.add_constant(x_train_scaled) # adding a constant so that formula understands y=mx+b that b<>0
    model = sm.OLS(y_train, x_train_const_scaled).fit()
    y_pred_train = model.predict(x_train_const_scaled) 

    x_test_const_scaled = sm.add_constant(x_test_scaled) # adding a constant so that formula understands y=mx+b that b<>0
    y_pred_test = model.predict(x_test_const_scaled) 
    return model,y_pred_train,y_pred_test

In [511]:
def gather_results(model,y_pred_train,y_train):
    m_mse=mse(y_train,y_pred_train)
    rmse=math.sqrt(mse(y_train,y_pred_train))
    r_mae=mae(y_train,y_pred_train)
    r_squared=model.rsquared
    r_squared_adj=model.rsquared_adj
    df_data={'MSE':[round(m_mse,2)],'RMSE':[round(rmse,3)],'R-Squared':[round(r_squared,3)],'R-Squared-Adj':[round(r_squared_adj,3)]}
    temp_df=pd.DataFrame(df_data)
    #print(model.summary())
    return temp_df


# Check regression models

## No data changes

In [512]:
model,y_pred_train,y_pred_test=sm_OLS(x_train_scaled,x_test_scaled,y_train)
gather_results(model,y_pred_train,y_train)

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


Unnamed: 0,MSE,RMSE,R-Squared,R-Squared-Adj
0,19416.35,139.343,0.775,0.774


### Standard result

In [513]:
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.775
Model:,OLS,Adj. R-squared:,0.774
Method:,Least Squares,F-statistic:,591.4
Date:,"Mon, 11 Jul 2022",Prob (F-statistic):,0.0
Time:,16:19:50,Log-Likelihood:,-47167.0
No. Observations:,7421,AIC:,94420.0
Df Residuals:,7377,BIC:,94730.0
Df Model:,43,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,141.7397,14.577,9.723,0.000,113.164,170.316
cat__region_california,8.2844,4.667,1.775,0.076,-0.864,17.433
cat__region_nevada,9.0637,6.390,1.418,0.156,-3.462,21.590
cat__region_oregon,-0.1275,4.824,-0.026,0.979,-9.583,9.328
cat__region_washington,7.8825,6.693,1.178,0.239,-5.238,21.003
cat__response_yes,-26.1795,5.212,-5.023,0.000,-36.397,-15.962
cat__coverage_extended,-15.6104,4.587,-3.403,0.001,-24.603,-6.618
cat__coverage_premium,-19.8357,8.754,-2.266,0.023,-36.996,-2.675
cat__education_college,-9.9342,4.226,-2.351,0.019,-18.219,-1.649

0,1,2,3
Omnibus:,3152.632,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38170.024
Skew:,1.7,Prob(JB):,0.0
Kurtosis:,13.577,Cond. No.,1.24e+16


## Using RFE

In [494]:
from sklearn.feature_selection import RFE
def use_RFE_linear_regression(x_train,x_test,n_features=5,lm_type=LinearRegression(),inplace=False):
    lm = lm_type

    selector = RFE(lm, n_features_to_select= n_features, step = 1) # Step is how many features to add or drop everytime
    selector.fit(x_train, y_train)

    kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
    kept_features = list(x_train.iloc[:,kept_features].columns)
    removed_features=x_train.copy()
    removed_features.drop(labels=kept_features,axis=1,inplace=True)
    print("Features to drop:",removed_features.columns)
    print("Count",len(removed_features.columns))

    if inplace==True:
        x_train = selector.transform(x_train)
        x_test  = selector.transform(x_test)

        x_train = pd.DataFrame(x_train, columns=kept_features)
        x_test  = pd.DataFrame(x_test, columns=kept_features)
        return x_train, x_test
    else:
        return kept_features, removed_features
    


### RFE Model result

In [495]:
#Run RFE
x_train_scaled, x_test_scaled=use_RFE_linear_regression(x_train_scaled,x_test_scaled,n_features=15,lm_type=LinearRegression(),inplace=True)
#Run OLS
model,y_pred_train,y_pred_test=sm_OLS(x_train_scaled,x_test_scaled,y_train)
#Get results
gather_results(model,y_pred_train,y_train)

Features to drop: Index(['cat__region_california', 'cat__region_nevada', 'cat__region_oregon',
       'cat__region_washington', 'cat__education_college',
       'cat__education_doctor', 'cat__education_master',
       'cat__employment_status_medical leave', 'cat__marital_status_married',
       'cat__policy_type_personal auto', 'cat__policy_type_special auto',
       'cat__policy_corporate l2', 'cat__policy_corporate l3',
       'cat__policy_personal l1', 'cat__policy_personal l2',
       'cat__policy_personal l3', 'cat__policy_special l2',
       'cat__policy_special l3', 'cat__renew_offer_type_offer2',
       'cat__renew_offer_type_offer3', 'cat__renew_offer_type_offer4',
       'cat__sales_channel_branch', 'cat__sales_channel_call center',
       'cat__sales_channel_web', 'cat__vehicle_class_luxury car',
       'cat__vehicle_class_luxury suv', 'cat__vehicle_class_two-door car',
       'cat__vehicle_size_medsize', 'cat__vehicle_size_small',
       'num__customer_lifetime_value'],
   

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


Unnamed: 0,MSE,RMSE,R-Squared,R-Squared-Adj
0,19505.26,139.661,0.774,0.774


In [496]:
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.774
Model:,OLS,Adj. R-squared:,0.774
Method:,Least Squares,F-statistic:,1692.0
Date:,"Mon, 11 Jul 2022",Prob (F-statistic):,0.0
Time:,16:19:39,Log-Likelihood:,-47184.0
No. Observations:,7421,AIC:,94400.0
Df Residuals:,7405,BIC:,94510.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,131.5828,7.218,18.230,0.000,117.434,145.732
cat__response_yes,-23.8664,5.043,-4.733,0.000,-33.752,-13.981
cat__coverage_extended,-14.4676,3.861,-3.747,0.000,-22.037,-6.899
cat__coverage_premium,-17.5994,6.572,-2.678,0.007,-30.482,-4.717
cat__education_high school or below,18.1178,3.633,4.986,0.000,10.995,25.240
cat__employment_status_employed,-21.0904,5.908,-3.570,0.000,-32.671,-9.510
cat__employment_status_retired,-14.0204,11.390,-1.231,0.218,-36.347,8.307
cat__employment_status_unemployed,69.5935,6.447,10.794,0.000,56.955,82.232
cat__gender_m,16.4263,3.268,5.027,0.000,10.021,22.832

0,1,2,3
Omnibus:,3167.382,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38348.003
Skew:,1.71,Prob(JB):,0.0
Kurtosis:,13.598,Cond. No.,19.9


## Using Lasso

In [514]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=Lasso(alpha=0.05)

model.fit(x_train_scaled, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(x_train_scaled, y_train)}, Test -> {model.score(x_test_scaled, y_test)}")

Lasso: Train -> 0.7750970668457369, Test -> 0.7572609920013418


In [515]:
lasso_df=pd.DataFrame(model.feature_names_in_,columns=['col_name'])

In [516]:
lasso_df['coef']=abs(model.coef_)

In [517]:
lasso_sorted=lasso_df.sort_values(by='coef', ascending=False)
lasso_sorted_drop=lasso_sorted.loc[lasso_sorted['coef']<1]
        


In [518]:
lasso_sorted_drop

for x in lasso_sorted_drop['col_name']:
    x_train_scaled.drop(x,axis=1,inplace=True)
    x_test_scaled.drop(x,axis=1,inplace=True)

In [519]:
#Run OLS after lasso changes made
model,y_pred_train,y_pred_test=sm_OLS(x_train_scaled,x_test_scaled,y_train)
#See results
gather_results(model,y_pred_train,y_train)

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


Unnamed: 0,MSE,RMSE,R-Squared,R-Squared-Adj
0,19417.78,139.348,0.775,0.774


In [520]:
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.775
Model:,OLS,Adj. R-squared:,0.774
Method:,Least Squares,F-statistic:,795.8
Date:,"Mon, 11 Jul 2022",Prob (F-statistic):,0.0
Time:,16:20:02,Log-Likelihood:,-47167.0
No. Observations:,7421,AIC:,94400.0
Df Residuals:,7388,BIC:,94630.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,136.9424,10.779,12.704,0.000,115.812,158.073
cat__region_california,8.4513,3.649,2.316,0.021,1.299,15.604
cat__region_nevada,9.1848,5.682,1.616,0.106,-1.955,20.324
cat__region_washington,8.0306,6.026,1.333,0.183,-3.783,19.844
cat__response_yes,-26.1047,5.187,-5.033,0.000,-36.272,-15.937
cat__coverage_extended,-14.8049,3.981,-3.719,0.000,-22.608,-7.002
cat__coverage_premium,-17.9990,6.994,-2.573,0.010,-31.709,-4.289
cat__education_college,-9.9440,4.220,-2.356,0.018,-18.217,-1.671
cat__education_doctor,-10.5385,8.921,-1.181,0.238,-28.027,6.950

0,1,2,3
Omnibus:,3154.582,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38278.505
Skew:,1.701,Prob(JB):,0.0
Kurtosis:,13.594,Cond. No.,25.4


In [522]:
def top_features(model):
    features_importance = pd.DataFrame(data={
    'Importance': abs(model.params[1:]),
    'Coefficient': model.params[1:], 'P-value': round(model.pvalues[1:],4)})
    features_importance = features_importance.sort_values(by='Importance', ascending=False)
    return features_importance

keep_going=True

while keep_going==True:
    model,y_pred_train,y_pred_test=sm_OLS(x_train_scaled,x_test_scaled,y_train)
    #remove stepwise feature.  if done is written, stop removing
    features_importance=top_features(model)
    temp_features=features_importance.sort_values(by='P-value', ascending=False)
    worst_feature=temp_features.iloc[0,:]
    print(worst_feature)
    worst_feature=str(worst_feature.name)
    if temp_features.iloc[0,2]<=.05:
        keep_going=False
    else:
        x_train_scaled.drop(labels=worst_feature,axis=1,inplace=True)
        x_test_scaled.drop(labels=worst_feature,axis=1,inplace=True)




  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


Importance     2.721009
Coefficient    2.721009
P-value        0.626200
Name: cat__renew_offer_type_offer4, dtype: float64
Importance     5.747291
Coefficient    5.747291
P-value        0.591200
Name: cat__employment_status_medical leave, dtype: float64
Importance     9.685931
Coefficient   -9.685931
P-value        0.519000
Name: cat__vehicle_class_luxury suv, dtype: float64
Importance     3.546251
Coefficient   -3.546251
P-value        0.441300
Name: cat__policy_type_personal auto, dtype: float64
Importance     3.775099
Coefficient   -3.775099
P-value        0.298500
Name: cat__sales_channel_branch, dtype: float64
Importance     5.294681
Coefficient    5.294681
P-value        0.258300
Name: cat__renew_offer_type_offer3, dtype: float64
Importance     5.474667
Coefficient   -5.474667
P-value        0.256400
Name: cat__marital_status_married, dtype: float64
Importance     10.554034
Coefficient   -10.554034
P-value         0.236300
Name: cat__education_doctor, dtype: float64
Importance   

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


### Lasso model result

In [523]:
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.774
Model:,OLS,Adj. R-squared:,0.774
Method:,Least Squares,F-statistic:,1694.0
Date:,"Mon, 11 Jul 2022",Prob (F-statistic):,0.0
Time:,16:21:18,Log-Likelihood:,-47181.0
No. Observations:,7421,AIC:,94390.0
Df Residuals:,7405,BIC:,94500.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,127.3001,6.900,18.448,0.000,113.774,140.827
cat__response_yes,-27.4798,4.981,-5.517,0.000,-37.243,-17.717
cat__coverage_extended,-14.6633,3.858,-3.801,0.000,-22.227,-7.100
cat__coverage_premium,-17.1981,6.568,-2.618,0.009,-30.073,-4.323
cat__education_high school or below,17.6488,3.635,4.855,0.000,10.523,24.775
cat__employment_status_employed,-19.8268,5.416,-3.661,0.000,-30.444,-9.209
cat__employment_status_unemployed,71.9185,5.956,12.076,0.000,60.244,83.593
cat__gender_m,16.2372,3.266,4.972,0.000,9.835,22.639
cat__location_code_suburban,379.8030,4.641,81.841,0.000,370.706,388.900

0,1,2,3
Omnibus:,3159.35,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38168.081
Skew:,1.706,Prob(JB):,0.0
Kurtosis:,13.574,Cond. No.,20.3


In [524]:
gather_results(model,y_pred_train,y_train)

Unnamed: 0,MSE,RMSE,R-Squared,R-Squared-Adj
0,19487.71,139.598,0.774,0.774
