In [170]:
import pandas as pd 
import numpy as np
import warnings
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

warnings.filterwarnings('ignore')

In [171]:
df = pd.read_csv('insurance_claims.csv')
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,


In [172]:
df.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', '_c39'],
      dtype='object')

In [173]:
input_columns = ['months_as_customer', 'age', 'umbrella_limit', 'insured_zip', 'auto_make', 'auto_model', 'auto_year','policy_annual_premium']
df1 = df[input_columns]
df1.head()

Unnamed: 0,months_as_customer,age,umbrella_limit,insured_zip,auto_make,auto_model,auto_year,policy_annual_premium
0,328,48,0,466132,Saab,92x,2004,1406.91
1,228,42,5000000,468176,Mercedes,E400,2007,1197.22
2,134,29,5000000,430632,Dodge,RAM,2007,1413.14
3,256,41,6000000,608117,Chevrolet,Tahoe,2014,1415.74
4,228,44,6000000,610706,Accura,RSX,2009,1583.91


In [174]:
df1.isnull().sum()

months_as_customer       0
age                      0
umbrella_limit           0
insured_zip              0
auto_make                0
auto_model               0
auto_year                0
policy_annual_premium    0
dtype: int64

In [175]:
# numeric_cols = ['months_as_customer', 'age', 'umbrella_limit', 'auto_year', 'policy_annual_premium']
numeric_cols = ['months_as_customer', 'age', 'umbrella_limit', 'auto_year']
scaler = MinMaxScaler()
df1[numeric_cols] = scaler.fit_transform(df1[numeric_cols])
df1.head()

Unnamed: 0,months_as_customer,age,umbrella_limit,insured_zip,auto_make,auto_model,auto_year,policy_annual_premium
0,0.68476,0.644444,0.090909,466132,Saab,92x,0.45,1406.91
1,0.475992,0.511111,0.545455,468176,Mercedes,E400,0.6,1197.22
2,0.279749,0.222222,0.545455,430632,Dodge,RAM,0.6,1413.14
3,0.534447,0.488889,0.636364,608117,Chevrolet,Tahoe,0.95,1415.74
4,0.475992,0.555556,0.636364,610706,Accura,RSX,0.7,1583.91


In [176]:
categorical_cols = ['auto_make', 'auto_model']
le = LabelEncoder()
df1[categorical_cols] = df1[categorical_cols].apply(lambda x: le.fit_transform(x))
df1.head()

Unnamed: 0,months_as_customer,age,umbrella_limit,insured_zip,auto_make,auto_model,auto_year,policy_annual_premium
0,0.68476,0.644444,0.090909,466132,10,1,0.45,1406.91
1,0.475992,0.511111,0.545455,468176,8,12,0.6,1197.22
2,0.279749,0.222222,0.545455,430632,4,30,0.6,1413.14
3,0.534447,0.488889,0.636364,608117,3,34,0.95,1415.74
4,0.475992,0.555556,0.636364,610706,0,31,0.7,1583.91


In [177]:
X = df1.iloc[:,:-1]
X

Unnamed: 0,months_as_customer,age,umbrella_limit,insured_zip,auto_make,auto_model,auto_year
0,0.684760,0.644444,0.090909,466132,10,1,0.45
1,0.475992,0.511111,0.545455,468176,8,12,0.60
2,0.279749,0.222222,0.545455,430632,4,30,0.60
3,0.534447,0.488889,0.636364,608117,3,34,0.95
4,0.475992,0.555556,0.636364,610706,0,31,0.70
...,...,...,...,...,...,...,...
995,0.006263,0.422222,0.090909,431289,6,6,0.55
996,0.594990,0.488889,0.090909,608177,13,28,1.00
997,0.271399,0.333333,0.363636,442797,11,19,0.05
998,0.956159,0.955556,0.545455,441714,1,5,0.15


In [178]:
Y = df1.iloc[:,-1]
Y

0      1406.91
1      1197.22
2      1413.14
3      1415.74
4      1583.91
        ...   
995    1310.80
996    1436.79
997    1383.49
998    1356.92
999     766.19
Name: policy_annual_premium, Length: 1000, dtype: float64

In [179]:
sc = StandardScaler()
X = sc.fit_transform(X)
X

array([[ 1.07813958,  0.99083647, -0.47947582, ...,  0.85646614,
        -1.64574255, -0.1834404 ],
       [ 0.2089946 ,  0.33407345,  1.69798022, ...,  0.36053902,
        -0.65747047,  0.31549088],
       [-0.60800168, -1.08891308,  1.69798022, ..., -0.63131523,
         0.95970204,  0.31549088],
       ...,
       [-0.64276748, -0.54161057,  0.8269978 , ...,  1.1044297 ,
        -0.02857005, -1.5139238 ],
       [ 2.20802805,  2.52328351,  1.69798022, ..., -1.37520591,
        -1.28637088, -1.18130295],
       [ 2.19064515,  2.3043625 , -0.47947582, ...,  0.36053902,
        -0.65747047,  0.31549088]])

In [185]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=43)

In [192]:
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse,r2)

63037.16238638945 -0.15117768985182134


In [188]:
print("Accuracy Score for RandomForestRegressor is : ",rfr.score(X_test,y_test)*100,"%")

Accuracy Score for RandomForestRegressor is :  -17.224999724272337 %


In [193]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse,r2)

55214.07176461426 -0.008313273870097504
