# Training and Testing Model Predictions

In [1]:
#importing packages
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#importing datasets
insurance_data_train = pd.read_csv('Data/train.csv')
insurance_data_test = pd.read_csv('Data/test.csv')

insurance_data_train

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1
1,2,Male,76,1,3,0,1-2 Year,No,33536,26,183,0
2,3,Male,47,1,28,0,> 2 Years,Yes,38294,26,27,1
3,4,Male,21,1,11,1,< 1 Year,No,28619,152,203,0
4,5,Female,29,1,41,1,< 1 Year,No,27496,152,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...
381104,381105,Male,74,1,26,1,1-2 Year,No,30170,26,88,0
381105,381106,Male,30,1,37,1,< 1 Year,No,40016,152,131,0
381106,381107,Male,21,1,30,1,< 1 Year,No,35118,160,161,0
381107,381108,Female,68,1,14,0,> 2 Years,Yes,44617,124,74,0


In [2]:
insurance_data_test

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,381110,Male,25,1,11.0,1,< 1 Year,No,35786.0,152.0,53
1,381111,Male,40,1,28.0,0,1-2 Year,Yes,33762.0,7.0,111
2,381112,Male,47,1,28.0,0,1-2 Year,Yes,40050.0,124.0,199
3,381113,Male,24,1,27.0,1,< 1 Year,Yes,37356.0,152.0,187
4,381114,Male,27,1,28.0,1,< 1 Year,No,59097.0,152.0,297
...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,Female,26,1,37.0,1,< 1 Year,No,30867.0,152.0,56
127033,508143,Female,38,1,28.0,0,1-2 Year,Yes,28700.0,122.0,165
127034,508144,Male,21,1,46.0,1,< 1 Year,No,29802.0,152.0,74
127035,508145,Male,71,1,28.0,1,1-2 Year,No,62875.0,26.0,265


In [3]:
#changing datatypes to integers and appending them to the dataset

insurance_data_train['Vehicle_Age_Encoded']  = insurance_data_train['Vehicle_Age'] \
                                            .replace({'1-2 Year': 1, '< 1 Year': 0, '> 2 Years': 3})

insurance_data_train['Gender_Encoded']  = insurance_data_train['Gender'] \
                                            .replace({'Male': 1, 'Female': 2})

insurance_data_train['Vehicle_Damage_Enoded']  = insurance_data_train['Vehicle_Damage'] \
                                            .replace({'Yes': 1, 'No': 0})

#dropping the changed colums
insurance_data_train.drop(['Gender', 'Vehicle_Age', 'Vehicle_Damage'],axis=1, inplace=True)

insurance_data_train

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Vehicle_Age_Encoded,Gender_Encoded,Vehicle_Damage_Enoded
0,1,44,1,28,0,40454,26,217,1,3,1,1
1,2,76,1,3,0,33536,26,183,0,1,1,0
2,3,47,1,28,0,38294,26,27,1,3,1,1
3,4,21,1,11,1,28619,152,203,0,0,1,0
4,5,29,1,41,1,27496,152,39,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
381104,381105,74,1,26,1,30170,26,88,0,1,1,0
381105,381106,30,1,37,1,40016,152,131,0,0,1,0
381106,381107,21,1,30,1,35118,160,161,0,0,1,0
381107,381108,68,1,14,0,44617,124,74,0,3,2,1


In [4]:
insurance_data_test['Vehicle_Age_Encoded']  = insurance_data_test['Vehicle_Age'] \
                                            .replace({'1-2 Year': 1, '< 1 Year': 0, '> 2 Years': 3})

insurance_data_test['Gender_Encoded']  = insurance_data_test['Gender'] \
                                            .replace({'Male': 1, 'Female': 2})

insurance_data_test['Vehicle_Damage_Enoded']  = insurance_data_test['Vehicle_Damage'] \
                                            .replace({'Yes': 1, 'No': 0})

insurance_data_test.drop(['Gender', 'Vehicle_Age', 'Vehicle_Damage'],axis=1, inplace=True)

insurance_data_test

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Vehicle_Age_Encoded,Gender_Encoded,Vehicle_Damage_Enoded
0,381110,25,1,11.0,1,35786.0,152.0,53,0,1,0
1,381111,40,1,28.0,0,33762.0,7.0,111,1,1,1
2,381112,47,1,28.0,0,40050.0,124.0,199,1,1,1
3,381113,24,1,27.0,1,37356.0,152.0,187,0,1,1
4,381114,27,1,28.0,1,59097.0,152.0,297,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,26,1,37.0,1,30867.0,152.0,56,0,2,0
127033,508143,38,1,28.0,0,28700.0,122.0,165,1,2,1
127034,508144,21,1,46.0,1,29802.0,152.0,74,0,1,0
127035,508145,71,1,28.0,1,62875.0,26.0,265,1,1,0


In [5]:
#seperating input data from output data, also removing Driving license and id column 
X_train = insurance_data_train.drop(columns=['id','Driving_License','Response'])
X_train

Unnamed: 0,Age,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Vehicle_Age_Encoded,Gender_Encoded,Vehicle_Damage_Enoded
0,44,28,0,40454,26,217,3,1,1
1,76,3,0,33536,26,183,1,1,0
2,47,28,0,38294,26,27,3,1,1
3,21,11,1,28619,152,203,0,1,0
4,29,41,1,27496,152,39,0,2,0
...,...,...,...,...,...,...,...,...,...
381104,74,26,1,30170,26,88,1,1,0
381105,30,37,1,40016,152,131,0,1,0
381106,21,30,1,35118,160,161,0,1,0
381107,68,14,0,44617,124,74,3,2,1


In [6]:
X_test = insurance_data_test.drop(columns=['id','Driving_License'])
X_test

Unnamed: 0,Age,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Vehicle_Age_Encoded,Gender_Encoded,Vehicle_Damage_Enoded
0,25,11.0,1,35786.0,152.0,53,0,1,0
1,40,28.0,0,33762.0,7.0,111,1,1,1
2,47,28.0,0,40050.0,124.0,199,1,1,1
3,24,27.0,1,37356.0,152.0,187,0,1,1
4,27,28.0,1,59097.0,152.0,297,0,1,0
...,...,...,...,...,...,...,...,...,...
127032,26,37.0,1,30867.0,152.0,56,0,2,0
127033,38,28.0,0,28700.0,122.0,165,1,2,1
127034,21,46.0,1,29802.0,152.0,74,0,1,0
127035,71,28.0,1,62875.0,26.0,265,1,1,0


In [7]:
#seperating output data from input data
y_train = insurance_data_train['Response']

y_train

0         1
1         0
2         1
3         0
4         0
         ..
381104    0
381105    0
381106    0
381107    0
381108    0
Name: Response, Length: 381109, dtype: int64

In [8]:
#initializing DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [9]:
#inputing data fromm the test dataset to test model
predictions = model.predict(X_test)

#model can now predict response based on input data
predict = pd.DataFrame(predictions)
insurance_data_test['Response'] = predict
insurance_data_test

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Vehicle_Age_Encoded,Gender_Encoded,Vehicle_Damage_Enoded,Response
0,381110,25,1,11.0,1,35786.0,152.0,53,0,1,0,0
1,381111,40,1,28.0,0,33762.0,7.0,111,1,1,1,0
2,381112,47,1,28.0,0,40050.0,124.0,199,1,1,1,0
3,381113,24,1,27.0,1,37356.0,152.0,187,0,1,1,0
4,381114,27,1,28.0,1,59097.0,152.0,297,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,26,1,37.0,1,30867.0,152.0,56,0,2,0,0
127033,508143,38,1,28.0,0,28700.0,122.0,165,1,2,1,1
127034,508144,21,1,46.0,1,29802.0,152.0,74,0,1,0,0
127035,508145,71,1,28.0,1,62875.0,26.0,265,1,1,0,0


In [11]:
y_test = insurance_data_test['Response']

y_test

0         0
1         0
2         0
3         0
4         0
         ..
127032    0
127033    1
127034    0
127035    0
127036    0
Name: Response, Length: 127037, dtype: int64

In [13]:
#testing the accuracy of the prediction, of which score range from 0 - 1
# 0 representing least accuracy and 1 representing highest accuracy.
score = accuracy_score(y_test, predictions)

score

1.0