# Import, Load and check

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
df = pd.read_csv('playground-series-s4e12/train.csv')
print(df.shape)
df.head()

(1200000, 21)


Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [4]:
df = df.drop(columns=['id', 'Policy Start Date'])

# Check NULL

In [5]:
df.isnull().any()

Age                      True
Gender                  False
Annual Income            True
Marital Status           True
Number of Dependents     True
Education Level         False
Occupation               True
Health Score             True
Location                False
Policy Type             False
Previous Claims          True
Vehicle Age              True
Credit Score             True
Insurance Duration       True
Customer Feedback        True
Smoking Status          False
Exercise Frequency      False
Property Type           False
Premium Amount          False
dtype: bool

In [6]:
null_counts = df.isnull().sum()
print(null_counts.sort_values(ascending=False))

Previous Claims         364029
Occupation              358075
Credit Score            137882
Number of Dependents    109672
Customer Feedback        77824
Health Score             74076
Annual Income            44949
Age                      18705
Marital Status           18529
Vehicle Age                  6
Insurance Duration           1
Property Type                0
Exercise Frequency           0
Smoking Status               0
Policy Type                  0
Gender                       0
Location                     0
Education Level              0
Premium Amount               0
dtype: int64


# Preprocessing

In [7]:
class AgeBinner(BaseEstimator, TransformerMixin):
    def __init__(self, bins, labels):
        self.bins = bins
        self.labels = labels
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['Age'] = pd.cut(X['Age'], bins=self.bins, labels=self.labels, include_lowest=True)
        return X

In [8]:
age_bins = [0, 18, 35, 50, 100]
age_labels = ['Child', 'Young Adult', 'Middle Aged', 'Senior']

In [9]:
age_binner = ColumnTransformer(
    transformers=[
        ('age_binner', AgeBinner(bins=age_bins, labels=age_labels), ['Age']),
    ]
)

In [10]:
numerical_columns = df.select_dtypes(include=['number']).columns.drop('Premium Amount')
print(numerical_columns)

Index(['Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration'],
      dtype='object')


In [11]:
non_numerical_columns = df.select_dtypes(exclude=['number']).columns
print(non_numerical_columns)

Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
       'Policy Type', 'Customer Feedback', 'Smoking Status',
       'Exercise Frequency', 'Property Type'],
      dtype='object')


In [12]:
numeric_transformer = SimpleImputer(strategy='mean')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, non_numerical_columns)
    ]
    )

In [14]:
pipeline = Pipeline(
    steps=[
           ('preprocessor', preprocessor)
        ])

In [15]:
y_train = df['Premium Amount']
X_train = df.drop(columns=['Premium Amount'])

In [16]:
X_train

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,Good,Yes,Weekly,House
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Poor,Yes,Daily,Apartment
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Poor,Yes,Weekly,House
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,36.0,Female,27316.0,Married,0.0,Master's,Unemployed,13.772907,Urban,Premium,,5.0,372.0,3.0,Poor,No,Daily,Apartment
1199996,54.0,Male,35786.0,Divorced,,Master's,Self-Employed,11.483482,Rural,Comprehensive,,10.0,597.0,4.0,Poor,No,Weekly,Apartment
1199997,19.0,Male,51884.0,Divorced,0.0,Master's,,14.724469,Suburban,Basic,0.0,19.0,,6.0,Good,No,Monthly,Condo
1199998,55.0,Male,,Single,1.0,PhD,,18.547381,Suburban,Premium,1.0,7.0,407.0,4.0,Poor,No,Daily,Apartment


In [17]:
pipeline.fit(X_train)

In [18]:
X_train_transformed = pipeline.transform(X_train)

In [19]:
X_train_transformed

array([[1.90000000e+01, 1.00490000e+04, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [3.90000000e+01, 3.16780000e+04, 3.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [2.30000000e+01, 2.56020000e+04, 3.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       ...,
       [1.90000000e+01, 5.18840000e+04, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [5.50000000e+01, 3.27452178e+04, 1.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.10000000e+01, 3.27452178e+04, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [20]:
onehot_columns = pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(non_numerical_columns)
all_columns = numerical_columns.tolist() + list(onehot_columns)


In [21]:
X_train_transformed = pd.DataFrame(X_train_transformed, columns=all_columns)

In [22]:
X_train_transformed.isnull().any()

Age                            False
Annual Income                  False
Number of Dependents           False
Health Score                   False
Previous Claims                False
Vehicle Age                    False
Credit Score                   False
Insurance Duration             False
Gender_Female                  False
Gender_Male                    False
Marital Status_Divorced        False
Marital Status_Married         False
Marital Status_Single          False
Education Level_Bachelor's     False
Education Level_High School    False
Education Level_Master's       False
Education Level_PhD            False
Occupation_Employed            False
Occupation_Self-Employed       False
Occupation_Unemployed          False
Location_Rural                 False
Location_Suburban              False
Location_Urban                 False
Policy Type_Basic              False
Policy Type_Comprehensive      False
Policy Type_Premium            False
Customer Feedback_Average      False
C

In [23]:
X_train_transformed['Age'] = age_binner.fit_transform(X_train_transformed[['Age']])

In [24]:
X_train_transformed

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Gender_Female,Gender_Male,...,Customer Feedback_Poor,Smoking Status_No,Smoking Status_Yes,Exercise Frequency_Daily,Exercise Frequency_Monthly,Exercise Frequency_Rarely,Exercise Frequency_Weekly,Property Type_Apartment,Property Type_Condo,Property Type_House
0,Young Adult,10049.000000,1.000000,22.598761,2.000000,17.0,372.00000,5.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,Middle Aged,31678.000000,3.000000,15.569731,1.000000,12.0,694.00000,2.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,Young Adult,25602.000000,3.000000,47.177549,1.000000,14.0,592.92435,3.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,Young Adult,141855.000000,2.000000,10.938144,1.000000,0.0,367.00000,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Young Adult,39651.000000,1.000000,20.376094,0.000000,8.0,598.00000,4.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,Middle Aged,27316.000000,0.000000,13.772907,1.002689,5.0,372.00000,3.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1199996,Senior,35786.000000,2.009934,11.483482,1.002689,10.0,597.00000,4.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1199997,Young Adult,51884.000000,0.000000,14.724469,0.000000,19.0,592.92435,6.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1199998,Senior,32745.217777,1.000000,18.547381,1.000000,7.0,407.00000,4.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
age_encoder = OneHotEncoder()

# Fit and transform the Age column
age_encoded = age_encoder.fit_transform(X_train_transformed[['Age']])

# Convert the encoded Age column to a DataFrame
age_encoded_df = pd.DataFrame(age_encoded.toarray(), columns=age_encoder.get_feature_names_out(['Age']))

# Concatenate the encoded Age columns with the original DataFrame
X_train_encoded = pd.concat([X_train_transformed.reset_index(drop=True), age_encoded_df.reset_index(drop=True)], axis=1).drop(columns=['Age'])

# Display the first few rows to verify the changes
print(X_train_encoded.head())

   Annual Income  Number of Dependents  Health Score  Previous Claims  \
0        10049.0                   1.0     22.598761              2.0   
1        31678.0                   3.0     15.569731              1.0   
2        25602.0                   3.0     47.177549              1.0   
3       141855.0                   2.0     10.938144              1.0   
4        39651.0                   1.0     20.376094              0.0   

   Vehicle Age  Credit Score  Insurance Duration  Gender_Female  Gender_Male  \
0         17.0     372.00000                 5.0            1.0          0.0   
1         12.0     694.00000                 2.0            1.0          0.0   
2         14.0     592.92435                 3.0            0.0          1.0   
3          0.0     367.00000                 1.0            0.0          1.0   
4          8.0     598.00000                 4.0            0.0          1.0   

   Marital Status_Divorced  ...  Exercise Frequency_Monthly  \
0                

In [26]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

numerical_columns = X_train_encoded.select_dtypes(include=['number']).columns
# Apply the scaler to the numerical columns in the training and validation sets
X_train_encoded[numerical_columns] = scaler.fit_transform(X_train_encoded[numerical_columns])

# Display the first few rows to verify the changes
print(X_train_encoded.head())

   Annual Income  Number of Dependents  Health Score  Previous Claims  \
0       0.066988                  0.25      0.361397         0.222222   
1       0.211186                  0.75      0.238002         0.111111   
2       0.170678                  0.75      0.792879         0.111111   
3       0.945719                  0.50      0.156695         0.111111   
4       0.264340                  0.25      0.322378         0.000000   

   Vehicle Age  Credit Score  Insurance Duration  Gender_Female  Gender_Male  \
0     0.894737      0.131148               0.500            1.0          0.0   
1     0.631579      0.717668               0.125            1.0          0.0   
2     0.736842      0.533560               0.250            0.0          1.0   
3     0.000000      0.122040               0.000            0.0          1.0   
4     0.421053      0.542805               0.375            0.0          1.0   

   Marital Status_Divorced  ...  Exercise Frequency_Monthly  \
0                

# Train validation split

In [27]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_encoded, y_train, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)

X_train shape: (960000, 41)
X_val shape: (240000, 41)
y_train shape: (960000,)
y_val shape: (240000,)


# Build the model

In [54]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbose=1)
rf_model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.2min finished


In [52]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
linear_model = LinearRegression()

# Fit the model to the training data
linear_model.fit(X_train, y_train)

In [51]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Define the degree of the polynomial
degree = 2

# Create a pipeline that combines polynomial features with linear regression
polynomial_model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# Fit the model to the training data
polynomial_model.fit(X_train, y_train)

In [35]:
from sklearn.tree import DecisionTreeRegressor

decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)

In [36]:
from sklearn.svm import LinearSVR

svr_model = LinearSVR(random_state=42)
svr_model.fit(X_train, y_train)

In [31]:
from sklearn.ensemble import HistGradientBoostingRegressor

gradient_boosting_model = HistGradientBoostingRegressor(random_state=42)
gradient_boosting_model.fit(X_train, y_train)

# Evaluate the model

In [32]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_log_error

In [38]:
# Make predictions on the validation set
y_pred = svr_model.predict(X_val)

# Calculate metrics
mae = mean_absolute_error(y_val, y_pred)
rmsle = root_mean_squared_log_error(y_val, y_pred)

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("root_mean_squared_log_error:", rmsle)

Mean Absolute Error (MAE): 637.9441112188375
root_mean_squared_log_error: 1.1010435072334779


In [37]:
y_pred = polynomial_model.predict(X_val)

# Calculate metrics
mae = mean_absolute_error(y_val, y_pred)
rmsle = root_mean_squared_log_error(y_val, y_pred)

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("root_mean_squared_log_error:", rmsle)

Mean Absolute Error (MAE): 665.1751338541667
root_mean_squared_log_error: 1.1657172801217484


In [49]:
# Make predictions on the validation set
y_pred_linear = gradient_boosting_model.predict(X_val)

# Calculate metrics
mae_linear = mean_absolute_error(y_val, y_pred_linear)
rmsle_linear = root_mean_squared_log_error(y_val, y_pred_linear)

# Print the metrics
print("Linear Regression Mean Absolute Error (MAE):", mae_linear)
print("Linear Regression Root Mean Squared Log Error (RMSLE):", rmsle_linear)

Linear Regression Mean Absolute Error (MAE): 644.969429921919
Linear Regression Root Mean Squared Log Error (RMSLE): 1.1482031205613867


In [None]:
# Make predictions on the validation set
y_pred_ensemble = ensemble_model.predict(X_val)

# Calculate metrics
mae_ensemble = mean_absolute_error(y_val, y_pred_ensemble)
rmsle_ensemble = root_mean_squared_log_error(y_val, y_pred_ensemble)

# Print the metrics
print("Ensemble Mean Absolute Error (MAE):", mae_ensemble)
print("Ensemble Root Mean Squared Log Error (RMSLE):", rmsle_ensemble)

# Prep the test data

In [39]:
df_test = pd.read_csv('playground-series-s4e12/test.csv')

In [40]:
df_test.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [41]:
X_test_transformed = pipeline.transform(df_test)

In [42]:
X_test_transformed = pd.DataFrame(X_test_transformed, columns=all_columns)

In [43]:
X_test_transformed['Age'] = age_binner.transform(X_test_transformed[['Age']])

In [44]:
age_encoded = age_encoder.transform(X_test_transformed[['Age']])

# Convert the encoded Age column to a DataFrame
age_encoded_df = pd.DataFrame(age_encoded.toarray(), columns=age_encoder.get_feature_names_out(['Age']))

# Concatenate the encoded Age columns with the original DataFrame
X_test_encoded = pd.concat([X_test_transformed.reset_index(drop=True), age_encoded_df.reset_index(drop=True)], axis=1).drop(columns=['Age'])

# Display the first few rows to verify the changes
print(X_test_encoded.head())

   Annual Income  Number of Dependents  Health Score  Previous Claims  \
0         2310.0                   4.0      7.657981         1.002689   
1       126031.0                   2.0     13.381379         1.002689   
2        17092.0                   0.0     24.354527         1.002689   
3        30424.0                   3.0      5.136225         1.000000   
4        10863.0                   2.0     11.844155         1.002689   

   Vehicle Age  Credit Score  Insurance Duration  Gender_Female  Gender_Male  \
0         19.0     592.92435                 1.0            1.0          0.0   
1         14.0     372.00000                 8.0            1.0          0.0   
2         16.0     819.00000                 9.0            1.0          0.0   
3          3.0     770.00000                 5.0            1.0          0.0   
4         14.0     755.00000                 7.0            0.0          1.0   

   Marital Status_Divorced  ...  Exercise Frequency_Monthly  \
0                

# Select a model and predict

In [48]:
y_pred_test = gradient_boosting_model.predict(X_test_encoded)

In [50]:
y_pred_test.min()

1018.0517365773499

In [51]:
y_pred_test.max()

2033.482880726243

In [52]:
df_submission = pd.DataFrame({'id': df_test['id'], 'Premium Amount': y_pred_test})
df_submission.to_csv('submission.csv', index=False)

In [55]:
# !kaggle competitions submit -c playground-series-s4e12 -f submission.csv -m "first submission"

'kaggle' is not recognized as an internal or external command,
operable program or batch file.
