In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e12/sample_submission.csv
/kaggle/input/playground-series-s4e12/train.csv
/kaggle/input/playground-series-s4e12/test.csv


In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/sklearn deprecation and copy warnings.
warnings.filterwarnings("ignore")

In [3]:
# Locations of the competition CSVs (train, test, sample submission).
train_file, test_file, subm_file = (
    "/kaggle/input/playground-series-s4e12/train.csv",
    "/kaggle/input/playground-series-s4e12/test.csv",
    "/kaggle/input/playground-series-s4e12/sample_submission.csv",
)

In [4]:
# Load the three CSVs into DataFrames, in the order train / test / sample submission.
train_data, test_data, sub_data = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file, subm_file)
)

In [5]:
# Preview the first rows of the sample submission file.
sub_data.head()

Unnamed: 0,id,Premium Amount
0,1200000,1102.545
1,1200001,1102.545
2,1200002,1102.545
3,1200003,1102.545
4,1200004,1102.545


In [6]:
# (rows, columns) of the training frame.
train_data.shape


(1200000, 21)

In [7]:
# Column dtypes and non-null counts for the raw training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [8]:
# Percentage of missing values per column, worst first.
train_data.isna().mean().mul(100).sort_values(ascending=False)

Previous Claims         30.335750
Occupation              29.839583
Credit Score            11.490167
Number of Dependents     9.139333
Customer Feedback        6.485333
Health Score             6.173000
Annual Income            3.745750
Age                      1.558750
Marital Status           1.544083
Vehicle Age              0.000500
Insurance Duration       0.000083
id                       0.000000
Property Type            0.000000
Exercise Frequency       0.000000
Smoking Status           0.000000
Policy Type              0.000000
Policy Start Date        0.000000
Location                 0.000000
Education Level          0.000000
Gender                   0.000000
Premium Amount           0.000000
dtype: float64

In [9]:
# segregate columns

# Parse the date column first so it is not picked up as an object column below.
date_cols = ['Policy Start Date']
for date_col in date_cols:
    train_data[date_col] = train_data[date_col].astype('datetime64[ns]')

# Split the remaining columns by dtype: strings vs. int64/float64 numerics.
obj_cols = list(train_data.select_dtypes(include="object").columns)
num_cols = list(train_data.select_dtypes(include=['int64','float64']).columns)

num_cols, obj_cols , date_cols

(['id',
  'Age',
  'Annual Income',
  'Number of Dependents',
  'Health Score',
  'Previous Claims',
  'Vehicle Age',
  'Credit Score',
  'Insurance Duration',
  'Premium Amount'],
 ['Gender',
  'Marital Status',
  'Education Level',
  'Occupation',
  'Location',
  'Policy Type',
  'Customer Feedback',
  'Smoking Status',
  'Exercise Frequency',
  'Property Type'],
 ['Policy Start Date'])

In [10]:
# Render floats with two decimals in all DataFrame/Series output.
pd.options.display.float_format = '{:.2f}'.format

In [11]:
# fillna with median values for numerical columns

def dataTreatmentNumericalCols(inp_dataframe, cols=None):
    """Return a copy of ``inp_dataframe`` with numeric columns coerced and median-imputed.

    Parameters
    ----------
    inp_dataframe : pandas.DataFrame
        Frame to treat; it is not modified.
    cols : list of str, optional
        Columns to treat. Defaults to the notebook-level ``num_cols`` list.

    Returns
    -------
    pandas.DataFrame
        Copy in which every treated column has been passed through
        ``pd.to_numeric(..., errors='coerce')`` (unparseable entries become
        NaN) and its NaNs replaced by that column's median.

    Raises
    ------
    KeyError
        If a requested column is not present in the frame.
    """
    target_cols = num_cols if cols is None else cols
    data = inp_dataframe.copy()
    for col in target_cols:
        data[col] = pd.to_numeric(data[col], errors='coerce')
        # median() has to be *called*: handing the bound method itself to
        # fillna does not impute a number.
        col_median = data[col].median()
        data[col] = data[col].fillna(col_median)
    return data

def dataTreatmentCategoryCols(inp_dataframe, cols=None):
    """Return a copy of ``inp_dataframe`` with categorical gaps filled by the mode.

    Parameters
    ----------
    inp_dataframe : pandas.DataFrame
        Frame to treat; it is not modified.
    cols : list of str, optional
        Columns to treat. Defaults to the notebook-level ``obj_cols`` list.

    Returns
    -------
    pandas.DataFrame
        Copy in which missing values of each treated column are replaced by
        that column's most frequent value (the first one when several tie).
        A column with no non-missing values is left unchanged.

    Raises
    ------
    KeyError
        If a requested column is not present in the frame.
    """
    target_cols = obj_cols if cols is None else cols
    data = inp_dataframe.copy()
    for col in target_cols:
        modes = data[col].mode()
        if modes.empty:
            # Entirely-missing column: there is no mode to impute with, and
            # indexing the empty result would raise.
            continue
        data[col] = data[col].fillna(modes.iloc[0])
    return data

def dataTreatmentGetDummies(inp_dataframe, cols=None):
    """Return a new frame with categorical columns replaced by dummy indicators.

    Parameters
    ----------
    inp_dataframe : pandas.DataFrame
        Frame to encode; it is not modified.
    cols : list of str, optional
        Columns to one-hot encode. Defaults to the notebook-level
        ``obj_cols`` list.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the integer dummy columns of each
        encoded column (named ``<col>_<value>``), with the first level of
        every encoded column dropped.

    Raises
    ------
    KeyError
        If a requested column is not present in the frame.
    """
    target_cols = list(obj_cols if cols is None else cols)
    encoded = [
        pd.get_dummies(inp_dataframe[col], drop_first=True, dtype="int", prefix=col)
        for col in target_cols
    ]
    remaining = inp_dataframe.drop(columns=target_cols)
    # One concat at the end instead of re-concatenating the whole frame
    # (and dropping in place) once per encoded column.
    return pd.concat([remaining, *encoded], axis=1)


In [12]:
# filling missing values with median values for numerical columns

# Same treatment for every numeric column: replace NaNs with the column median.
for col in [
    'Age',
    'Annual Income',
    'Number of Dependents',
    'Health Score',
    'Previous Claims',
    'Vehicle Age',
    'Credit Score',
    'Insurance Duration',
    'Premium Amount',
]:
    col_median = train_data[col].median()
    train_data[col] = train_data[col].fillna(col_median)

In [13]:
# Summary statistics for the numeric columns after median imputation.
train_data[num_cols].describe()

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount
count,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0
mean,599999.5,41.14,32414.31,2.01,25.55,1.0,9.57,593.16,5.02,1102.54
std,346410.31,13.43,31615.6,1.35,11.82,0.82,5.78,141.1,2.59,865.0
min,0.0,18.0,1.0,0.0,2.01,0.0,0.0,300.0,1.0,20.0
25%,299999.75,30.0,8646.0,1.0,16.55,0.0,5.0,484.0,3.0,514.0
50%,599999.5,41.0,23911.0,2.0,24.58,1.0,10.0,595.0,5.0,872.0
75%,899999.25,53.0,43936.0,3.0,33.77,1.0,15.0,706.0,7.0,1509.0
max,1199999.0,64.0,149997.0,4.0,58.98,9.0,19.0,849.0,9.0,4999.0


In [14]:
# Number of distinct values per categorical column, smallest first.
train_data[obj_cols].nunique().sort_values(ascending=True)

Gender                2
Smoking Status        2
Marital Status        3
Occupation            3
Location              3
Policy Type           3
Customer Feedback     3
Property Type         3
Education Level       4
Exercise Frequency    4
dtype: int64

In [15]:
# use label encoder to label categorical columns

from sklearn.preprocessing import LabelEncoder

# Fit one encoder per column and keep it, so each learned value -> integer
# mapping stays available afterwards (e.g. for inverse_transform or for
# encoding another frame the same way). A single shared encoder is refit on
# every iteration and only remembers the last column.
label_encoders = {}
for col in obj_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le


In [16]:
# Re-check dtypes and non-null counts after imputation and label encoding.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   id                    1200000 non-null  int64         
 1   Age                   1200000 non-null  float64       
 2   Gender                1200000 non-null  int64         
 3   Annual Income         1200000 non-null  float64       
 4   Marital Status        1200000 non-null  int64         
 5   Number of Dependents  1200000 non-null  float64       
 6   Education Level       1200000 non-null  int64         
 7   Occupation            1200000 non-null  int64         
 8   Health Score          1200000 non-null  float64       
 9   Location              1200000 non-null  int64         
 10  Policy Type           1200000 non-null  int64         
 11  Previous Claims       1200000 non-null  float64       
 12  Vehicle Age           1200000 non-null  fl

In [17]:
# Summary statistics for every column now that the categoricals are integer-coded.
train_data.describe()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
count,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,...,1200000.0,1200000.0,1200000.0,1200000.0,1200000,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0
mean,599999.5,41.14,0.5,32414.31,1.03,2.01,1.51,1.59,25.55,1.0,...,1.0,9.57,593.16,5.02,2022-02-13 05:06:30.972380416,1.13,0.5,1.51,1.0,1102.54
min,0.0,18.0,0.0,1.0,0.0,0.0,0.0,0.0,2.01,0.0,...,0.0,0.0,300.0,1.0,2019-08-17 15:21:39.080371,0.0,0.0,0.0,0.0,20.0
25%,299999.75,30.0,0.0,8646.0,0.0,1.0,0.0,1.0,16.55,0.0,...,0.0,5.0,484.0,3.0,2020-11-20 15:21:39.121168896,0.0,0.0,1.0,0.0,514.0
50%,599999.5,41.0,1.0,23911.0,1.0,2.0,2.0,2.0,24.58,1.0,...,1.0,10.0,595.0,5.0,2022-02-14 15:21:39.151731968,1.0,1.0,2.0,1.0,872.0
75%,899999.25,53.0,1.0,43936.0,2.0,3.0,3.0,3.0,33.77,2.0,...,1.0,15.0,706.0,7.0,2023-05-06 15:21:39.182597120,2.0,1.0,3.0,2.0,1509.0
max,1199999.0,64.0,1.0,149997.0,3.0,4.0,3.0,3.0,58.98,2.0,...,9.0,19.0,849.0,9.0,2024-08-15 15:21:39.287115,3.0,1.0,3.0,2.0,4999.0
std,346410.31,13.43,0.5,31615.6,0.85,1.35,1.12,1.14,11.82,0.82,...,0.82,5.78,141.1,2.59,,0.93,0.5,1.12,0.82,865.0


In [18]:
# Policy age in whole days, measured back from a fixed reference date.
reference_date = pd.to_datetime('31-08-2024', format='%d-%m-%Y')
elapsed = reference_date - train_data['Policy Start Date']
train_data['Policy Age'] = elapsed.dt.days

In [19]:
# The raw start date is superseded by the derived 'Policy Age' column.
train_data = train_data.drop(columns='Policy Start Date')

In [20]:
# Summary statistics after replacing 'Policy Start Date' with the derived 'Policy Age'.
train_data.describe()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Policy Age
count,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,...,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0,1200000.0
mean,599999.5,41.14,0.5,32414.31,1.03,2.01,1.51,1.59,25.55,1.0,...,1.0,9.57,593.16,5.02,1.13,0.5,1.51,1.0,1102.54,929.43
std,346410.31,13.43,0.5,31615.6,0.85,1.35,1.12,1.14,11.82,0.82,...,0.82,5.78,141.1,2.59,0.93,0.5,1.12,0.82,865.0,521.44
min,0.0,18.0,0.0,1.0,0.0,0.0,0.0,0.0,2.01,0.0,...,0.0,0.0,300.0,1.0,0.0,0.0,0.0,0.0,20.0,15.0
25%,299999.75,30.0,0.0,8646.0,0.0,1.0,0.0,1.0,16.55,0.0,...,0.0,5.0,484.0,3.0,0.0,0.0,1.0,0.0,514.0,482.0
50%,599999.5,41.0,1.0,23911.0,1.0,2.0,2.0,2.0,24.58,1.0,...,1.0,10.0,595.0,5.0,1.0,1.0,2.0,1.0,872.0,928.0
75%,899999.25,53.0,1.0,43936.0,2.0,3.0,3.0,3.0,33.77,2.0,...,1.0,15.0,706.0,7.0,2.0,1.0,3.0,2.0,1509.0,1379.0
max,1199999.0,64.0,1.0,149997.0,3.0,4.0,3.0,3.0,58.98,2.0,...,9.0,19.0,849.0,9.0,3.0,1.0,3.0,2.0,4999.0,1840.0


In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
from sklearn.model_selection import RandomizedSearchCV


In [22]:
# Modelling frame: a separate copy of train_data without the row identifier.
df = train_data.drop(columns='id')

In [23]:
# Hold out 25% of the rows, then peel the target column off each part;
# what remains in df_train / df_test is the feature matrix.
target_col = 'Premium Amount'
df_train, df_test = train_test_split(df, test_size=0.25, random_state=56)
y_train, y_test = df_train.pop(target_col), df_test.pop(target_col)
X_train, X_test = df_train, df_test

In [24]:
# Learn the scaling statistics from the training features only, then apply
# the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [25]:
import tensorflow
from tensorflow import keras

# Seed Python, NumPy and the backend so the randomly initialised weights
# (and therefore the reported metrics) are repeatable on Restart & Run All.
keras.utils.set_random_seed(56)

# Small feed-forward regressor: one 2-unit sigmoid hidden layer feeding a
# single linear output unit.
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[-1],)),
    keras.layers.Dense(2, activation="sigmoid"),
    keras.layers.Dense(1, activation="linear"),
])
model.summary()

In [26]:
# Plain SGD with default settings, minimising mean squared error.
sgd_optimizer = keras.optimizers.SGD()
model.compile(loss="mean_squared_error", optimizer=sgd_optimizer)

In [27]:
# Train for 10 passes over the scaled training features in mini-batches of 32.
model.fit(x=X_train, y=y_train.to_numpy(), batch_size=32, epochs=10)

Epoch 1/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 1ms/step - loss: 757052.4375
Epoch 2/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 1ms/step - loss: 748949.8750
Epoch 3/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - loss: 749611.4375
Epoch 4/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - loss: 749866.1875
Epoch 5/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 1ms/step - loss: 750823.7500
Epoch 6/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 1ms/step - loss: 750194.3125
Epoch 7/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - loss: 750522.1250
Epoch 8/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - loss: 746341.8125
Epoch 9/10
[1m28125/28125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 1ms/step - loss: 748357.3750
E

<keras.src.callbacks.history.History at 0x7faa51dec8e0>

In [28]:
# The network emits one column per sample; keep that column as a flat vector.
holdout_output = model.predict(X_test)
y_pred = holdout_output[:, 0]

[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step


In [29]:
# Mean squared error of the network's predictions on the held-out 25% split.
mean_squared_error(y_test, y_pred)

744388.0069842698

In [30]:
# Coefficient of determination (R^2) on the same held-out split.
r2_score(y_test, y_pred)

0.0015381471331926688

In [31]:
# Keep an untouched copy of the raw test frame: its 'id' column is read when
# building results_df, after 'id' has been dropped from test_data.
test_df = test_data.copy()

In [32]:
# treat the test data

# Impute numeric gaps with the *training* medians so the test features get the
# same preprocessing the model was fitted with. train_data has already been
# median-filled, which leaves each column's median unchanged, so it can be
# read directly from there.
numeric_feature_cols = [
    'Age',
    'Annual Income',
    'Number of Dependents',
    'Health Score',
    'Previous Claims',
    'Vehicle Age',
    'Credit Score',
    'Insurance Duration',
]
for col in numeric_feature_cols:
    test_data[col] = test_data[col].fillna(train_data[col].median())

# NOTE(review): the encoder is refit on the test values, so the integer codes
# match the training codes only when each column has the same set of distinct
# values in both files — confirm, or reuse encoders fitted on the training data.
le = LabelEncoder()
for col in obj_cols:
    test_data[col] = le.fit_transform(test_data[col])



In [33]:
# Derive 'Policy Age' (whole days before the fixed reference date) for the
# test rows, then discard the raw date and the identifier column.
policy_start = test_data['Policy Start Date'].astype('datetime64[ns]')
reference_date = pd.to_datetime('31-08-2024', format='%d-%m-%Y')
test_data['Policy Age'] = (reference_date - policy_start).dt.days
test_data = test_data.drop(columns=['Policy Start Date', 'id'])

In [34]:
# Apply the StandardScaler fitted on X_train to the prepared test features.
scaled_pred_data = scaler.transform(test_data)

In [35]:
# Predict on the standardized features: the network was trained on
# scaler-transformed inputs, so the raw test_data frame is on the wrong scale.
pred_results = model.predict(scaled_pred_data)[:,0]

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 1ms/step


In [36]:
# Pair each original test id with its predicted premium.
results_df = test_df[['id']].assign(**{"Premium Amount": pred_results})

In [37]:
# Write the id / Premium Amount pairs to a CSV in the working directory,
# leaving out the DataFrame index.
results_df.to_csv("insurance_premiums_sub.csv", index=False)