In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [2]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree capped at ``max_leaf_nodes``.

    A ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X`` / ``train_y``; its predictions for ``val_X`` are compared
    against ``val_y`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Score one preprocessing approach so it can be compared with others.

    Fits a 100-tree ``RandomForestRegressor`` (``random_state=0``) on the
    training split and returns the mean absolute error of its predictions
    on the validation split.
    """
    forest = RandomForestRegressor(random_state=0, n_estimators=100)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

def get_score(n_estimators):
    """Return the average MAE over 3 CV folds of random forest model.

    Only the non-numeric columns of ``X_train`` are ordinal-encoded; numeric
    columns are passed through unchanged. Encoding the numeric columns as
    well would map every value that was not seen in a training fold to -1
    in the validation fold, discarding the measurement.

    Reads the notebook-level ``X_train`` and ``y_train``, so the train/validation
    split cell must have been run before this function is called.

    Keyword argument:
    n_estimators -- the number of trees in the forest
    """
    # Everything that is not a numeric dtype is treated as categorical.
    categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('categorical',
             OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
             categorical_cols),
        ],
        # Keep the numeric features as they are instead of dropping them.
        remainder='passthrough',
    )
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=n_estimators, random_state=0))
    ])
    # scikit-learn reports negated MAE; flip the sign back to a positive error.
    scores = -1 * cross_val_score(my_pipeline, X_train, y_train,
                                  cv=3,
                                  scoring='neg_mean_absolute_error')
    return scores.mean()


In [3]:
# Load the competition's training and test sets, indexed by their 'id' column
X_full = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv', index_col='id')
X_test_full = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv', index_col='id')

# Discard rows that have no target, then split the target off from the predictors
X_full = X_full.dropna(axis=0, subset=['Rings'])
y = X_full['Rings']
X_full = X_full.drop(columns=['Rings'])

X_test_full

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
90615,M,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005
90616,M,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750
90617,M,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405
90618,M,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350
90619,I,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050
...,...,...,...,...,...,...,...,...
151021,I,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500
151022,F,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050
151023,I,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650
151024,F,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350


In [4]:
# Hold out 20% of the labelled rows as a validation set; the fixed seed keeps
# the split identical from run to run
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full, y,
    train_size=0.8, test_size=0.2,
    random_state=0,
)

In [5]:
# Preview the first five rows of the training split
X_train.head(n=5)

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
38059,I,0.58,0.4,0.145,0.8915,0.3915,0.182,0.2265
26114,M,0.525,0.405,0.115,0.7095,0.3155,0.1455,0.175
19998,F,0.625,0.475,0.16,1.0185,0.4775,0.2255,0.2635
18162,F,0.575,0.45,0.175,1.0225,0.3675,0.2155,0.34
56716,F,0.57,0.46,0.165,0.827,0.2985,0.226,0.235


In [6]:
# Dimensions of the training split as (num_rows, num_columns)
print(X_train.shape)

# Per-column count of missing entries in the training split
missing_val_count_by_column = X_train.isna().sum()
print(missing_val_count_by_column)

(72492, 8)
Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Whole weight.1    0
Whole weight.2    0
Shell weight      0
dtype: int64


In [7]:
# XGBoost regressor used as the final model; only random_state is set (to 0),
# every other hyperparameter is left at the library default.
model_XGB = XGBRegressor(random_state=0)


In [8]:
# Create the pipeline.
# Ordinal-encode only the non-numeric columns and pass the numeric measurements
# through untouched. Encoding every column would turn each numeric test value
# that never occurred in the training data into -1 before it reaches the model.
categorical_cols = X_full.select_dtypes(exclude='number').columns.tolist()

final_preprocessor = ColumnTransformer(
    transformers=[
        ('categorical',
         OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
         categorical_cols),
    ],
    # Keep the numeric features as they are instead of dropping them.
    remainder='passthrough',
)

final_pipeline = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    ('model', model_XGB)
])


<h1>Train a model for the competition</h1>

In [9]:
# Display the test-set features that the final model will predict on
X_test_full

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
90615,M,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005
90616,M,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750
90617,M,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405
90618,M,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350
90619,I,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050
...,...,...,...,...,...,...,...,...
151021,I,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500
151022,F,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050
151023,I,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650
151024,F,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350


In [10]:
# 1. Fit the pipeline on every labelled row (training and validation rows together)
final_pipeline.fit(X_full, y)

# 2. Predict the target for the test set
test_preds = final_pipeline.predict(X_test_full)

# 3. Assemble the submission table: one row per test id with its predicted Rings
output = pd.DataFrame({
    'id': X_test_full.index,
    'Rings': test_preds,
})

# 4. Write the submission file without the DataFrame's own index column
output.to_csv('submission.csv', index=False)

print("Predictions saved to 'submission.csv'")

Predictions saved to 'submission.csv'
