In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [4]:
train['Vehicle_Age'].value_counts()

Vehicle_Age
1-2 Year     5982678
< 1 Year     5044145
> 2 Years     477975
Name: count, dtype: int64

In [5]:
x  = train.iloc[:,[1,2,3,5,6,7,8]]
x.head()

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium
0,Male,21,1,0,1-2 Year,Yes,65101.0
1,Male,43,1,0,> 2 Years,Yes,58911.0
2,Female,25,1,1,< 1 Year,No,38043.0
3,Female,35,1,0,1-2 Year,Yes,2630.0
4,Female,36,1,1,1-2 Year,No,31951.0


In [6]:
test = pd.read_csv('test.csv')
test = test.iloc[:,[1,2,3,5,6,7,8]]
test.head()

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium
0,Female,20,1,0,< 1 Year,No,2630.0
1,Male,47,1,0,1-2 Year,Yes,37483.0
2,Male,47,1,0,1-2 Year,Yes,2630.0
3,Female,22,1,1,< 1 Year,No,24502.0
4,Male,51,1,0,1-2 Year,No,34115.0


In [7]:
y = train.iloc[:,-1]
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Response, dtype: int64

> **Preprocessing**

In [8]:
bins = [0, 18, 30, 50, 65, np.inf]
labels = ['Teenager', 'Young Adult', 'Adult', 'Senior', 'Elderly']


x['Age'] = pd.cut(x['Age'], bins=bins, labels=labels)
x.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Age'] = pd.cut(x['Age'], bins=bins, labels=labels)


Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium
0,Male,Young Adult,1,0,1-2 Year,Yes,65101.0
1,Male,Adult,1,0,> 2 Years,Yes,58911.0
2,Female,Young Adult,1,1,< 1 Year,No,38043.0
3,Female,Adult,1,0,1-2 Year,Yes,2630.0
4,Female,Adult,1,1,1-2 Year,No,31951.0


In [9]:
test['Age'] = pd.cut(test['Age'], bins=bins, labels=labels)
test.head()

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium
0,Female,Young Adult,1,0,< 1 Year,No,2630.0
1,Male,Adult,1,0,1-2 Year,Yes,37483.0
2,Male,Adult,1,0,1-2 Year,Yes,2630.0
3,Female,Young Adult,1,1,< 1 Year,No,24502.0
4,Male,Senior,1,0,1-2 Year,No,34115.0


In [10]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

column_transformer = ColumnTransformer(
    transformers=[
        ('ord_en', OrdinalEncoder(), ['Gender']),
        ('le_en', OrdinalEncoder(), ['Age']),
        ('ohe_en',  OneHotEncoder(), ['Vehicle_Age']),
        ('le_en_v', OrdinalEncoder(), ['Vehicle_Damage']),
        ('Std_scl',StandardScaler(), ['Annual_Premium'])
    ],remainder='passthrough')


In [11]:
x_trf = column_transformer.fit_transform(x)
one_hot_columns = column_transformer.named_transformers_['ohe_en'].get_feature_names_out(['Vehicle_Age'])

# Combine all column names
column_names = ['Gender', 'Age', 'Driving_License',	'Previously_Insured']+ list(one_hot_columns) + ['Vehicle_Damage','Annual_Premium']

# Convert the transformed data to a DataFrame
x_trf= pd.DataFrame(x_trf, columns=column_names)

x_trf.head()

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage,Annual_Premium
0,1.0,3.0,1.0,0.0,0.0,1.0,2.105145,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,1.0,1.728962,1.0,0.0
2,0.0,3.0,0.0,1.0,0.0,0.0,0.460756,1.0,1.0
3,0.0,0.0,1.0,0.0,0.0,1.0,-1.691389,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.090529,1.0,1.0


In [12]:
x_trf.describe()

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage,Annual_Premium
count,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0
mean,0.541351,1.660848,0.5200159,0.4384384,0.04154571,0.5026798,9.996564000000001e-17,0.998022,0.4629966
std,0.4982872,1.334525,0.4995992,0.4961957,0.1995487,0.4999928,1.0,0.0444312,0.4986289
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.691389,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,-0.3150684,1.0,0.0
50%,1.0,2.0,1.0,0.0,0.0,1.0,0.08281074,1.0,0.0
75%,1.0,3.0,1.0,1.0,0.0,1.0,0.5463245,1.0,1.0
max,1.0,3.0,1.0,1.0,1.0,1.0,30.97609,1.0,1.0


In [13]:
test= column_transformer.fit_transform(test)
one_hot_columns = column_transformer.named_transformers_['ohe_en'].get_feature_names_out(['Vehicle_Age'])

# Combine all column names
column_names = ['Gender', 'Age', 'Driving_License',	'Previously_Insured']+ list(one_hot_columns) + ['Vehicle_Damage','Annual_Premium']

# Convert the transformed data to a DataFrame
test= pd.DataFrame(test, columns=column_names)

test.head()

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage,Annual_Premium
0,0.0,3.0,0.0,1.0,0.0,0.0,-1.692555,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.426701,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,-1.692555,1.0,0.0
3,0.0,3.0,0.0,1.0,0.0,0.0,-0.362616,1.0,1.0
4,1.0,2.0,1.0,0.0,0.0,0.0,0.221908,1.0,0.0


In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_trf,y,test_size=0.25,random_state=42)
y_train.shape

(8628598,)

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
dt = RandomForestClassifier(n_estimators=100,n_jobs=-1,verbose=1)
dt.fit(x_train,y_train)

y_pred = dt.predict(x_test)
print('accuracy ', accuracy_score(y_test,y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.4min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    4.6s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   16.5s finished


accuracy  0.8732942771712676


In [18]:
y_pred = dt.predict(test)
test = pd.read_csv('test.csv')
submission_df = pd.DataFrame({
    'id': test['id'],
    'Response': y_pred  
})
submission_df.head()

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   14.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   48.8s finished


Unnamed: 0,id,Response
0,11504798,0
1,11504799,0
2,11504800,0
3,11504801,0
4,11504802,0


In [19]:
submission_df.to_csv('Random Forest_output.csv', index=False)