In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
#import the data and review features for incorrect datatypes or any additional cleanup
auto_df = pd.read_csv('../exports/subset_auto_data.csv')
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95226 entries, 0 to 95225
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    95226 non-null  int64  
 1   Date_start_contract   95226 non-null  object 
 2   Date_last_renewal     95226 non-null  object 
 3   Date_next_renewal     95226 non-null  object 
 4   Date_birth            95226 non-null  object 
 5   Date_driving_licence  95226 non-null  object 
 6   Distribution_channel  95226 non-null  int64  
 7   Seniority             95226 non-null  int64  
 8   Policies_in_force     95226 non-null  int64  
 9   Max_policies          95226 non-null  int64  
 10  Max_products          95226 non-null  int64  
 11  Lapse                 95226 non-null  int64  
 12  Payment               95226 non-null  int64  
 13  Premium               95226 non-null  float64
 14  Cost_claims_year      95226 non-null  float64
 15  N_claims_year      

In [4]:
#remove date columns as we have already pulled the relevant information from them (creating an age column based on the dates)
auto_df.drop(columns=['Date_start_contract', 'Date_last_renewal', 'Date_next_renewal', 'Date_birth', 'Date_driving_licence'], inplace=True)
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95226 entries, 0 to 95225
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    95226 non-null  int64  
 1   Distribution_channel  95226 non-null  int64  
 2   Seniority             95226 non-null  int64  
 3   Policies_in_force     95226 non-null  int64  
 4   Max_policies          95226 non-null  int64  
 5   Max_products          95226 non-null  int64  
 6   Lapse                 95226 non-null  int64  
 7   Payment               95226 non-null  int64  
 8   Premium               95226 non-null  float64
 9   Cost_claims_year      95226 non-null  float64
 10  N_claims_year         95226 non-null  int64  
 11  N_claims_history      95226 non-null  int64  
 12  R_Claims_history      95226 non-null  float64
 13  Type_risk             95226 non-null  int64  
 14  Area                  95226 non-null  int64  
 15  Second_driver      

In [5]:
#encode the Type of fuel which is a discrete categorical column to binary values of 0 or 1
mapping = {'D': 0, 'P' : 1}
auto_df['Type_fuel'] = auto_df['Type_fuel'].map(mapping)
auto_df['Type_fuel'].value_counts()

Type_fuel
0    63600
1    31626
Name: count, dtype: int64

In [6]:
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95226 entries, 0 to 95225
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    95226 non-null  int64  
 1   Distribution_channel  95226 non-null  int64  
 2   Seniority             95226 non-null  int64  
 3   Policies_in_force     95226 non-null  int64  
 4   Max_policies          95226 non-null  int64  
 5   Max_products          95226 non-null  int64  
 6   Lapse                 95226 non-null  int64  
 7   Payment               95226 non-null  int64  
 8   Premium               95226 non-null  float64
 9   Cost_claims_year      95226 non-null  float64
 10  N_claims_year         95226 non-null  int64  
 11  N_claims_history      95226 non-null  int64  
 12  R_Claims_history      95226 non-null  float64
 13  Type_risk             95226 non-null  int64  
 14  Area                  95226 non-null  int64  
 15  Second_driver      

In [7]:
auto_df.head()

Unnamed: 0,ID,Distribution_channel,Seniority,Policies_in_force,Max_policies,Max_products,Lapse,Payment,Premium,Cost_claims_year,...,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Type_fuel,Length,Weight,Age
0,3,0,15,1,2,1,0,0,380.2,0.0,...,0,2013,85,1229,16030.0,5,1,3.999,1105,40.70089
1,3,0,15,1,2,1,0,0,393.5,0.0,...,0,2013,85,1229,16030.0,5,1,3.999,1105,41.702943
2,3,0,15,1,2,1,0,0,393.5,0.0,...,0,2013,85,1229,16030.0,5,1,3.999,1105,42.702259
3,3,0,15,1,2,1,0,0,395.47,0.0,...,0,2013,85,1229,16030.0,5,1,3.999,1105,43.701574
4,6,0,13,1,2,2,0,0,250.52,0.0,...,0,2006,87,1598,9927.0,4,1,4.25,1055,49.604381


In [8]:
auto_df.corr()

Unnamed: 0,ID,Distribution_channel,Seniority,Policies_in_force,Max_policies,Max_products,Lapse,Payment,Premium,Cost_claims_year,...,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Type_fuel,Length,Weight,Age
ID,1.0,-0.141352,-0.131236,-0.041383,-0.072223,-0.062136,0.026554,0.036552,0.018593,0.010053,...,-0.050622,-0.002373,0.032966,0.039695,0.027341,0.004226,-0.038285,0.041228,0.034695,-0.211144
Distribution_channel,-0.141352,1.0,-0.154123,-0.034913,-0.066688,-0.035902,0.033192,0.044633,0.050479,0.005359,...,0.039099,0.021067,0.005357,0.017564,0.005371,0.00917,-0.030005,0.025189,0.018951,0.014236
Seniority,-0.131236,-0.154123,1.0,0.025876,0.241914,0.221048,-0.064314,-0.171295,-0.070993,-0.02267,...,-0.078553,-0.103561,-0.061969,-0.025191,-0.058219,-0.043236,0.054761,-0.048144,-0.044039,0.18973
Policies_in_force,-0.041383,-0.034913,0.025876,1.0,0.775061,0.05789,0.197834,0.050518,-0.050662,0.018259,...,0.070416,-0.057969,0.036666,0.075918,0.082588,-0.022386,-0.029834,0.079602,0.084252,0.028558
Max_policies,-0.072223,-0.066688,0.241914,0.775061,1.0,0.262791,0.176415,0.024031,-0.037896,0.011065,...,0.039764,-0.058188,0.04797,0.087367,0.087161,-0.02624,-0.031938,0.092189,0.095449,0.07775
Max_products,-0.062136,-0.035902,0.221048,0.05789,0.262791,1.0,0.015709,-0.01766,-0.012741,6.5e-05,...,-0.003113,-0.017774,0.015312,0.019306,0.017073,0.000332,0.006552,0.015271,0.014075,0.08283
Lapse,0.026554,0.033192,-0.064314,0.197834,0.176415,0.015709,1.0,0.07399,0.065049,-0.010191,...,-0.015062,-0.114471,-0.003849,0.051477,-0.003142,-0.046332,-0.007886,0.016269,0.010764,-0.024334
Payment,0.036552,0.044633,-0.171295,0.050518,0.024031,-0.01766,0.07399,1.0,0.187958,0.028597,...,0.02953,0.002497,0.036117,0.027865,0.021526,-0.002354,-0.02184,0.037889,0.030398,-0.129698
Premium,0.018593,0.050479,-0.070993,-0.050662,-0.037896,-0.012741,0.065049,0.187958,1.0,0.047511,...,0.107512,0.287014,0.282626,0.123152,0.340708,0.030763,-0.073945,0.195998,0.216801,-0.101747
Cost_claims_year,0.010053,0.005359,-0.02267,0.018259,0.011065,6.5e-05,-0.010191,0.028597,0.047511,1.0,...,0.010297,0.012815,0.019439,0.013736,0.022459,-0.006611,-0.009033,0.018452,0.018193,-0.021823


In [9]:
#Run describe to see the distribution of data to understand if scaling is needed
auto_df.describe()

Unnamed: 0,ID,Distribution_channel,Seniority,Policies_in_force,Max_policies,Max_products,Lapse,Payment,Premium,Cost_claims_year,...,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Type_fuel,Length,Weight,Age
count,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,...,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0,95226.0
mean,26561.150862,0.554218,6.655336,1.419654,1.795518,1.048128,0.219908,0.347384,333.527729,165.532681,...,0.136265,2005.195587,99.341524,1719.102724,19551.29772,4.46272,0.332115,4.252007,1266.898137,47.177423
std,15350.049873,0.661895,6.212029,0.879651,1.108,0.231117,0.460368,0.476142,134.34137,1549.798278,...,0.343072,5.678827,30.09155,409.410671,8219.591153,0.869386,0.470975,0.39322,264.044713,12.790737
min,3.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,40.4,0.0,...,0.0,1953.0,12.0,125.0,270.46,0.0,0.0,1.978,137.0,18.075291
25%,13337.25,0.0,3.0,1.0,1.0,1.0,0.0,0.0,254.81,0.0,...,0.0,2002.0,75.0,1422.0,14270.08,4.0,0.0,3.999,1089.0,37.10883
50%,26326.0,0.0,4.0,1.0,1.0,1.0,0.0,0.0,301.65,0.0,...,0.0,2005.0,98.0,1665.0,18330.87,5.0,0.0,4.23,1239.0,46.600958
75%,40000.75,1.0,9.0,2.0,2.0,1.0,0.0,1.0,370.71,0.0,...,0.0,2008.0,112.0,1956.0,23000.0,5.0,1.0,4.443,1399.0,56.410678
max,53502.0,3.0,40.0,17.0,17.0,4.0,7.0,1.0,2993.34,260853.24,...,1.0,2018.0,580.0,6788.0,220675.8,6.0,1.0,8.218,5455.0,98.529774


In [15]:
#realized I had an errant value in Distribution channel
#subset df to exclude category of 3 which was used to represent missing values
auto_df = auto_df[auto_df['Distribution_channel'] != 3]
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 92194 entries, 0 to 95225
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    92194 non-null  int64  
 1   Distribution_channel  92194 non-null  int64  
 2   Seniority             92194 non-null  int64  
 3   Policies_in_force     92194 non-null  int64  
 4   Max_policies          92194 non-null  int64  
 5   Max_products          92194 non-null  int64  
 6   Lapse                 92194 non-null  int64  
 7   Payment               92194 non-null  int64  
 8   Premium               92194 non-null  float64
 9   Cost_claims_year      92194 non-null  float64
 10  N_claims_year         92194 non-null  int64  
 11  N_claims_history      92194 non-null  int64  
 12  R_Claims_history      92194 non-null  float64
 13  Type_risk             92194 non-null  int64  
 14  Area                  92194 non-null  int64  
 15  Second_driver         92

In [11]:
#run describe again to confirm things are looking as they should.
auto_df.describe()

Unnamed: 0,ID,Distribution_channel,Seniority,Policies_in_force,Max_policies,Max_products,Lapse,Payment,Premium,Cost_claims_year,...,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Type_fuel,Length,Weight,Age
count,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,...,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0,92194.0
mean,27046.920429,0.473784,6.577673,1.417923,1.789173,1.047335,0.220882,0.349188,333.862704,167.190353,...,0.136224,2005.20292,99.406057,1719.937881,19570.46007,4.462069,0.331052,4.253175,1267.699384,47.054487
std,15303.454875,0.499315,6.144852,0.87781,1.103817,0.229065,0.460588,0.476716,134.280356,1572.591026,...,0.343028,5.666898,30.059321,408.547611,8203.110028,0.870156,0.470594,0.393921,263.699429,12.764341
min,3.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,40.53,0.0,...,0.0,1953.0,12.0,125.0,270.46,0.0,0.0,1.978,137.0,18.075291
25%,13846.5,0.0,3.0,1.0,1.0,1.0,0.0,0.0,255.1,0.0,...,0.0,2002.0,75.0,1422.0,14291.0,4.0,0.0,3.999,1089.0,37.021218
50%,27237.0,0.0,4.0,1.0,1.0,1.0,0.0,0.0,302.09,0.0,...,0.0,2005.0,98.0,1685.0,18340.0,5.0,0.0,4.235,1240.0,46.428474
75%,40446.0,1.0,9.0,2.0,2.0,1.0,0.0,1.0,371.1375,0.0,...,0.0,2008.0,112.0,1968.0,23030.0,5.0,1.0,4.448,1400.0,56.243669
max,53502.0,1.0,40.0,17.0,17.0,4.0,7.0,1.0,2993.34,260853.24,...,1.0,2018.0,580.0,6788.0,220675.8,6.0,1.0,8.218,5455.0,98.529774


In [12]:
#since you are just looking to predict passenger cars, lets subset the data by that risk type

In [33]:
auto_df_sub = auto_df[auto_df['Type_risk'] == 3]
auto_df_sub.drop(columns=['Type_risk'], inplace=True)
print(auto_df_sub['N_doors'].value_counts())
auto_df_sub.info()

N_doors
5    53758
3    13136
4    10863
2     1743
6       70
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 79570 entries, 0 to 95225
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    79570 non-null  int64  
 1   Distribution_channel  79570 non-null  int64  
 2   Seniority             79570 non-null  int64  
 3   Policies_in_force     79570 non-null  int64  
 4   Max_policies          79570 non-null  int64  
 5   Max_products          79570 non-null  int64  
 6   Lapse                 79570 non-null  int64  
 7   Payment               79570 non-null  int64  
 8   Premium               79570 non-null  float64
 9   Cost_claims_year      79570 non-null  float64
 10  N_claims_year         79570 non-null  int64  
 11  N_claims_history      79570 non-null  int64  
 12  R_Claims_history      79570 non-null  float64
 13  Area                  79570 non-null 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  auto_df_sub.drop(columns=['Type_risk'], inplace=True)


In [34]:
# Additionally, you will only be looking at cars with 4 or 5 doors (which would be either sedan or hatchbacks)
auto_df_sub = auto_df_sub[(auto_df_sub['N_doors'] == 4) | (auto_df_sub['N_doors'] == 5)]
auto_df_sub['N_doors'].value_counts()

N_doors
5    53758
4    10863
Name: count, dtype: int64

In [35]:
auto_df_sub.corr()

Unnamed: 0,ID,Distribution_channel,Seniority,Policies_in_force,Max_policies,Max_products,Lapse,Payment,Premium,Cost_claims_year,...,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Type_fuel,Length,Weight,Age
ID,1.0,-0.032234,-0.122575,-0.045663,-0.076755,-0.06649,0.028191,0.029619,0.004007,0.00942,...,-0.051039,-0.024025,0.03136,0.050535,0.026788,-0.034606,-0.045603,0.053029,0.036978,-0.207231
Distribution_channel,-0.032234,1.0,-0.280167,-0.056723,-0.125496,-0.066324,0.059692,0.086029,0.071059,0.01143,...,0.057626,0.022927,0.008903,0.016242,0.004151,-0.011407,-0.045749,0.009949,0.004497,-0.028744
Seniority,-0.122575,-0.280167,1.0,0.040362,0.267334,0.222197,-0.063754,-0.163647,-0.05393,-0.022894,...,-0.07696,-0.073657,-0.04285,-0.026411,-0.040631,0.006634,0.059204,-0.038498,-0.035131,0.179091
Policies_in_force,-0.045663,-0.056723,0.040362,1.0,0.770935,0.052147,0.171389,0.044945,-0.054152,0.021712,...,0.080629,-0.051692,0.028408,0.056185,0.075818,-0.039094,-0.034525,0.036197,0.05471,0.044434
Max_policies,-0.076755,-0.125496,0.267334,0.770935,1.0,0.267565,0.156257,0.017977,-0.03693,0.012476,...,0.048058,-0.045414,0.044586,0.063579,0.084392,-0.041979,-0.027347,0.052217,0.070177,0.087739
Max_products,-0.06649,-0.066324,0.222197,0.052147,0.267565,1.0,0.012313,-0.013272,-0.005058,0.001987,...,-0.004064,-0.011806,0.024965,0.022486,0.026477,-0.004803,0.009016,0.024163,0.021429,0.082574
Lapse,0.028191,0.059692,-0.063754,0.171389,0.156257,0.012313,1.0,0.069387,0.066926,-0.008506,...,-0.016449,-0.11561,-0.001942,0.058984,0.001022,-0.040826,-0.021266,0.017422,0.01371,-0.019197
Payment,0.029619,0.086029,-0.163647,0.044945,0.017977,-0.013272,0.069387,1.0,0.174642,0.029381,...,0.024759,-0.006941,0.019629,0.026384,0.009822,-0.017437,-0.027067,0.034803,0.026098,-0.120105
Premium,0.004007,0.071059,-0.05393,-0.054152,-0.03693,-0.005058,0.066926,0.174642,1.0,0.046224,...,0.109582,0.288364,0.260006,0.105378,0.317765,0.010059,-0.073249,0.182998,0.223132,-0.09549
Cost_claims_year,0.00942,0.01143,-0.022894,0.021712,0.012476,0.001987,-0.008506,0.029381,0.046224,1.0,...,0.011899,0.011911,0.013831,0.011439,0.017578,0.000135,-0.011552,0.015939,0.018799,-0.018018


In [113]:
auto_df_sub.Premium.describe()

count    64621.000000
mean       337.405497
std        138.665053
min         40.710000
25%        255.880000
50%        302.560000
75%        374.970000
max       2993.340000
Name: Premium, dtype: float64

In [131]:
test = auto_df_sub['Premium'].quantile([.05,.95])
print(test)
auto_df_sub[auto_df_sub['Premium'] > 498.80].shape

0.05    202.96
0.95    598.68
Name: Premium, dtype: float64


(6462, 24)

In [135]:
cut_auto_df_sub = auto_df_sub[(auto_df_sub['Premium'] <= test[.95]) & (auto_df_sub['Premium'] >= test[.05])]
print(cut_auto_df_sub.shape)
cut_auto_df_sub.corr()


(58160, 24)


Unnamed: 0,ID,Distribution_channel,Seniority,Policies_in_force,Max_policies,Max_products,Lapse,Payment,Premium,Cost_claims_year,...,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Type_fuel,Length,Weight,Age
ID,1.0,-0.032803,-0.120961,-0.042161,-0.073153,-0.063847,0.029065,0.030573,0.027782,0.009608,...,-0.050121,-0.025692,0.034306,0.054258,0.030241,-0.035733,-0.047829,0.054206,0.038379,-0.207169
Distribution_channel,-0.032803,1.0,-0.281441,-0.056711,-0.129597,-0.06879,0.058272,0.083791,0.078334,0.011241,...,0.052795,0.016537,0.009244,0.016464,0.000808,-0.015421,-0.048669,0.007756,0.001299,-0.025958
Seniority,-0.120961,-0.281441,1.0,0.038277,0.27303,0.218034,-0.06478,-0.165165,-0.074925,-0.023553,...,-0.075069,-0.070616,-0.045725,-0.029434,-0.045126,0.00749,0.059789,-0.039151,-0.037312,0.173797
Policies_in_force,-0.042161,-0.056711,0.038277,1.0,0.759481,0.050573,0.183247,0.060584,-0.013779,0.021719,...,0.089098,-0.046625,0.030014,0.061817,0.084297,-0.040015,-0.033916,0.040912,0.062838,0.042242
Max_policies,-0.073153,-0.129597,0.27303,0.759481,1.0,0.271317,0.162269,0.025985,-0.016665,0.011975,...,0.052266,-0.038149,0.04744,0.067459,0.092614,-0.042007,-0.027571,0.055618,0.076576,0.086408
Max_products,-0.063847,-0.06879,0.218034,0.050573,0.271317,1.0,0.011638,-0.011999,-0.022888,-0.00237,...,-0.00539,-0.015978,0.025705,0.024119,0.024406,-0.008158,0.007305,0.024791,0.02136,0.081926
Lapse,0.029065,0.058272,-0.06478,0.183247,0.162269,0.011638,1.0,0.066557,0.067608,-0.007321,...,-0.020372,-0.120243,-0.0058,0.05775,-0.002933,-0.038635,-0.020275,0.014397,0.010337,-0.018212
Payment,0.030573,0.083791,-0.165165,0.060584,0.025985,-0.011999,0.066557,1.0,0.202832,0.029098,...,0.018743,-0.018541,0.016696,0.025245,0.007037,-0.02112,-0.022815,0.030841,0.019905,-0.119888
Premium,0.027782,0.078334,-0.074925,-0.013779,-0.016665,-0.022888,0.067608,0.202832,1.0,0.029622,...,0.127392,0.253571,0.203327,0.073609,0.246747,0.003312,-0.074472,0.153309,0.182306,-0.11517
Cost_claims_year,0.009608,0.011241,-0.023553,0.021719,0.011975,-0.00237,-0.007321,0.029098,0.029622,1.0,...,0.009869,0.005216,0.006116,0.007157,0.009329,-0.00098,-0.009871,0.011124,0.013253,-0.016775


In [134]:
cut_auto_df_sub.describe()

Unnamed: 0,ID,Distribution_channel,Seniority,Policies_in_force,Max_policies,Max_products,Lapse,Payment,Premium,Cost_claims_year,...,Second_driver,Year_matriculation,Power,Cylinder_capacity,Value_vehicle,N_doors,Type_fuel,Length,Weight,Age
count,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,...,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0,58160.0
mean,27303.032325,0.47206,6.387895,1.372558,1.739305,1.045839,0.215715,0.359732,323.40598,153.478819,...,0.146217,2005.556482,100.567056,1685.109044,19592.972126,4.830055,0.337208,4.230295,1245.692074,46.977918
std,15290.86787,0.499223,6.068774,0.803856,1.037174,0.224062,0.447748,0.479926,85.332321,1429.256498,...,0.353327,5.377061,27.848077,369.723736,7475.07358,0.375588,0.472761,0.298047,216.782522,12.64731
min,3.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,202.96,0.0,...,0.0,1953.0,25.0,400.0,270.46,4.0,0.0,1.978,349.0,18.135524
25%,14182.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,260.27,0.0,...,0.0,2002.0,80.0,1399.0,14606.0,5.0,0.0,4.02,1090.0,37.01232
50%,27504.0,0.0,4.0,1.0,1.0,1.0,0.0,0.0,302.555,0.0,...,0.0,2005.0,100.0,1598.0,18450.0,5.0,0.0,4.249,1232.0,46.260096
75%,40716.0,1.0,8.0,2.0,2.0,1.0,0.0,1.0,364.745,0.0,...,0.0,2009.0,114.0,1896.0,22850.0,5.0,1.0,4.436,1371.0,56.019849
max,53502.0,1.0,40.0,16.0,17.0,4.0,5.0,1.0,598.68,236285.18,...,1.0,2018.0,580.0,5967.0,129040.0,5.0,1.0,5.26,2643.0,98.529774


In [43]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [53]:
#create the X and Y variables for your training and testing
X = auto_df_sub.drop(columns=['Premium'])
X = sm.add_constant(X)
y = auto_df_sub['Premium']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47)

In [54]:
#run an OLS model without scaling the data to see what the initial prediction is with all features
model1 = sm.OLS(y_train, X_train)

model1_res = model1.fit()

model1_res.summary()

0,1,2,3
Dep. Variable:,Premium,R-squared:,0.242
Model:,OLS,Adj. R-squared:,0.242
Method:,Least Squares,F-statistic:,672.5
Date:,"Thu, 05 Sep 2024",Prob (F-statistic):,0.0
Time:,09:11:39,Log-Likelihood:,-301040.0
No. Observations:,48465,AIC:,602100.0
Df Residuals:,48441,BIC:,602300.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9364.3759,245.470,-38.149,0.000,-9845.500,-8883.252
ID,4.896e-06,3.71e-05,0.132,0.895,-6.78e-05,7.76e-05
Distribution_channel,9.8386,1.167,8.433,0.000,7.552,12.125
Seniority,0.1555,0.117,1.324,0.185,-0.075,0.386
Policies_in_force,-14.4047,1.105,-13.031,0.000,-16.571,-12.238
Max_policies,-4.7617,0.956,-4.979,0.000,-6.636,-2.887
Max_products,-5.2929,2.590,-2.043,0.041,-10.370,-0.216
Lapse,27.7936,1.291,21.525,0.000,25.263,30.325
Payment,43.7661,1.181,37.054,0.000,41.451,46.081

0,1,2,3
Omnibus:,29238.208,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,696451.825
Skew:,2.474,Prob(JB):,0.0
Kurtosis:,20.9,Cond. No.,16300000.0


<h1> MODEL 2

In [80]:
#recreate X and y, but scale before adding a constant to see if the results change at all
X = auto_df_sub.drop(columns=['Premium'])
y = auto_df_sub['Premium']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47)

In [81]:
#Scale the X data for model preparation
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [82]:
#add a constant to both the X_train_scaled and X_test_scaled variables to rerun the OLS model
X_train_scaled = sm.add_constant(X_train_scaled)
X_test_scaled = sm.add_constant(X_test_scaled)

In [83]:
#train a model on the dataset with the constant
model2 = sm.OLS(y_train, X_train_scaled)
model2_res = model2.fit()

model2_res.summary()

0,1,2,3
Dep. Variable:,Premium,R-squared:,0.242
Model:,OLS,Adj. R-squared:,0.242
Method:,Least Squares,F-statistic:,672.5
Date:,"Thu, 05 Sep 2024",Prob (F-statistic):,0.0
Time:,10:55:01,Log-Likelihood:,-301040.0
No. Observations:,48465,AIC:,602100.0
Df Residuals:,48441,BIC:,602300.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,336.9329,0.548,614.897,0.000,335.859,338.007
x1,0.0750,0.568,0.132,0.895,-1.039,1.189
x2,4.9093,0.582,8.433,0.000,3.768,6.050
x3,0.9512,0.718,1.324,0.185,-0.456,2.359
x4,-12.0839,0.927,-13.031,0.000,-13.901,-10.266
x5,-5.0297,1.010,-4.979,0.000,-7.010,-3.050
x6,-1.2099,0.592,-2.043,0.041,-2.370,-0.049
x7,12.3990,0.576,21.525,0.000,11.270,13.528
x8,20.9281,0.565,37.054,0.000,19.821,22.035

0,1,2,3
Omnibus:,29238.208,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,696451.825
Skew:,2.474,Prob(JB):,0.0
Kurtosis:,20.9,Cond. No.,5.97


<h1> Model 3

In [84]:
#Select the features that show the highest correlation & make most sense for predicting premium & run an 
# OLS model on it

X = auto_df_sub[['R_Claims_history', 'Year_matriculation', 'Value_vehicle']]
y = auto_df_sub['Premium']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = sm.add_constant(X_train_scaled)
X_test_scaled = sm.add_constant(X_test_scaled)

In [85]:
print(X_train_scaled)

[[ 1.         -0.6163156  -0.85207402 -0.99077381]
 [ 1.         -0.6163156   1.83132795 -1.36115702]
 [ 1.         -0.4901758   0.22128677 -0.62939993]
 ...
 [ 1.         -0.6163156   2.01022142  1.28213403]
 [ 1.         -0.6163156  -1.56764788  0.42357825]
 [ 1.         -0.6163156   1.83132795  1.3591437 ]]


In [86]:

model3 = sm.OLS(y_train, X_train_scaled)
model3_res = model3.fit()

model3_res.summary()

0,1,2,3
Dep. Variable:,Premium,R-squared:,0.17
Model:,OLS,Adj. R-squared:,0.17
Method:,Least Squares,F-statistic:,3317.0
Date:,"Thu, 05 Sep 2024",Prob (F-statistic):,0.0
Time:,10:55:06,Log-Likelihood:,-303230.0
No. Observations:,48465,AIC:,606500.0
Df Residuals:,48461,BIC:,606500.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,336.9329,0.573,587.855,0.000,335.810,338.056
x1,17.5473,0.575,30.523,0.000,16.421,18.674
x2,34.1942,0.584,58.587,0.000,33.050,35.338
x3,36.5306,0.584,62.580,0.000,35.386,37.675

0,1,2,3
Omnibus:,28551.4,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,625184.636
Skew:,2.42,Prob(JB):,0.0
Kurtosis:,19.916,Cond. No.,1.22


<h1> Model 4

In [87]:
#lets try using Lasso Regression with the original features included

In [91]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [92]:
X = auto_df_sub.drop(columns=['Premium'])
y = auto_df_sub['Premium']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [152]:
model4 = Lasso(alpha = .141747)
model4.fit(X_train_scaled, y_train)

y_train_pred = model4.predict(X_train_scaled)

training_mse = mean_squared_error(y_train, y_train_pred)
print(np.sqrt(training_mse))
r2 = r2_score(y_train, y_train_pred)
print(r2)
print(training_mse)

120.60285879332116
0.24200478026393057
14545.049549121763


<h1> Model 4.5

In [104]:
#try implementing GridSearchCV to determine how much it effects the model
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': np.logspace(-4, 4, 100)}

lasso_mod = Lasso()
grid_search = GridSearchCV(lasso_mod, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

best_alpha = grid_search.best_params_['alpha']

print(best_alpha)

0.14174741629268062


<h1> Model 5 - Ridge Regression

In [106]:
from sklearn.linear_model import Ridge

In [107]:
X = auto_df_sub.drop(columns=['Premium'])
y = auto_df_sub['Premium']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [108]:
param_grid = {'alpha': np.logspace(-4, 4, 100)}

ridge_mod = Ridge()
grid_search = GridSearchCV(ridge_mod, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)


In [109]:
best_alpha = grid_search.best_params_['alpha']
print(best_alpha)

138.48863713938746


In [111]:
ridge = Ridge(alpha=138.488)

ridge.fit(X_train_scaled, y_train)
y_train_pred = ridge.predict(X_train_scaled)

print(r2_score(y_train, y_train_pred))

0.24203347702826417


<h1> Model 6 - Random Forest Regression

In [160]:
from sklearn.ensemble import RandomForestRegressor

In [161]:
X = auto_df_sub.drop(columns=['Premium'])
y = auto_df_sub['Premium']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=40)

rf = RandomForestRegressor(random_state=47, n_estimators=1000)

rf.fit(X_train, y_train)

y_pred_train = rf.predict(X_train)

In [162]:
mse = mean_squared_error(y_train, y_pred_train)
print(r2_score(y_train, y_pred_train))
print(np.sqrt(mse))
print(mse)

0.9449953589152541
32.35884358115055
1047.0947579093684


In [164]:
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(r2_score(y_test, y_pred))
print(np.sqrt(mse))
print(mse)

0.5749065922716776
91.74329785083972
8416.832700547893


<h1> Model 6.5 Grid Search with Random Forest Regressor

In [156]:
X = auto_df_sub.drop(columns=['Premium'])
y = auto_df_sub['Premium']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=45)

param_grid = {'n_estimators' : [100, 200, 300, 500, 1000],
              'max_depth' : [5, 10, 15, 20, 25, 30, None]}

rf_model = RandomForestRegressor()
grid_search = GridSearchCV(rf_model, param_grid, cv=5)

grid_search.fit(X_train, y_train)


In [158]:
best_rf = grid_search.best_estimator_

feature_importances = best_rf.feature_importances_


importance_df = pd.DataFrame({
    'Feature' : X.columns,
    'Importance': feature_importances
})

# Sort by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df


Unnamed: 0,Feature,Importance
17,Value_vehicle,0.15467
14,Year_matriculation,0.128611
22,Age,0.121584
0,ID,0.085314
11,R_Claims_history,0.061122
21,Weight,0.054609
20,Length,0.053521
15,Power,0.048825
16,Cylinder_capacity,0.047924
2,Seniority,0.042417


In [159]:
grid_search.best_params_


{'max_depth': None, 'n_estimators': 1000}

In [165]:
#Although the best parameters indicate no max_depth and large estimator, I will need to play around with the
#hyperparameters to try and find a good bias-variance trade off without overfitting the model.
#submitting this now but will conitnue to work on this model.