<a href="https://www.kaggle.com/code/hikageshinomori/regression-with-abalone-datas?scriptVersionId=173446970" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from datetime import datetime

# Load the data
train_data = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')

# Label Encoding
label_encoder = LabelEncoder()

# Apply label encoding to all columns
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        train_data[column] = label_encoder.fit_transform(train_data[column])
        test_data[column] = label_encoder.transform(test_data[column])

# Split features and target
X = train_data.drop(['id', 'Rings'], axis=1)
y = train_data['Rings']

# Feature Engineering
X['Volume'] = X['Length'] * X['Diameter'] * X['Height']
test_data['Volume'] = test_data['Length'] * test_data['Diameter'] * test_data['Height']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data.drop(['id'], axis=1))

# Splitting data for cross-validation manually
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the TensorFlow model
model = Sequential([
    Conv1D(128, kernel_size=5, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    
    Dense(1024, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    
    Dense(1, activation='linear')
])

# Compile the model
optimizer = Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])

# Define callbacks
callbacks = [
    EarlyStopping(patience=30),
    ReduceLROnPlateau(factor=0.5, patience=10)
]



2024-04-23 02:49:35.112109: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 02:49:35.112220: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 02:49:35.277200: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  super().__init__(


In [2]:
# Train the model
log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history = model.fit(X_train.reshape(-1, X_train.shape[1], 1), y_train, 
                    validation_data=(X_val.reshape(-1, X_val.shape[1], 1), y_val), 
                    epochs=150, batch_size=128, callbacks=[callbacks, tensorboard_callback], verbose=1)


Epoch 1/150
[1m 13/567[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 4ms/step - loss: 102.4167 - mae: 9.5873     

I0000 00:00:1713840599.803578      87 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1713840599.823834      87 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 78.8801 - mae: 8.2594

W0000 00:00:1713840609.655929      88 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1713840610.250131      90 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - loss: 78.8357 - mae: 8.2561 - val_loss: 6.8834 - val_mae: 1.7992 - learning_rate: 5.0000e-04
Epoch 2/150
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 12.1421 - mae: 2.5902 - val_loss: 4.2383 - val_mae: 1.3709 - learning_rate: 5.0000e-04
Epoch 3/150
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 8.5972 - mae: 2.1373 - val_loss: 3.8379 - val_mae: 1.2922 - learning_rate: 5.0000e-04
Epoch 4/150
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 7.2816 - mae: 1.9626 - val_loss: 3.8397 - val_mae: 1.2847 - learning_rate: 5.0000e-04
Epoch 5/150
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 6.3390 - mae: 1.8200 - val_loss: 3.9276 - val_mae: 1.2819 - learning_rate: 5.0000e-04
Epoch 6/150
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 5.8852 -

W0000 00:00:1713841030.484450      87 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m1888/1888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
TensorFlow RMSE: 1.8856626500965983


In [3]:

# Predict with the TensorFlow model
tensorflow_preds = model.predict(test_data_scaled.reshape(-1, test_data_scaled.shape[1], 1)).flatten()

# Calculate RMSE for TensorFlow
tensorflow_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val.reshape(-1, X_val.shape[1], 1)).flatten()))
print(f'TensorFlow RMSE: {tensorflow_rmse}')

# Create submission file
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Rings': tensorflow_preds
})

# Save submission file
submission_df.to_csv('submission_tensorflow_cross_val.csv', index=False)

[1m1888/1888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
TensorFlow RMSE: 1.8856626500965983


Above gave 0.15018 score

In [4]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Conv1D, MaxPooling1D, Flatten, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from datetime import datetime

# Load the data
train_data = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')

# Label Encoding
label_encoder = LabelEncoder()

# Apply label encoding to all columns
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        train_data[column] = label_encoder.fit_transform(train_data[column])
        test_data[column] = label_encoder.transform(test_data[column])

# Split features and target
X = train_data.drop(['id', 'Rings'], axis=1)
y = train_data['Rings']

# Feature Engineering
X['Volume'] = X['Length'] * X['Diameter'] * X['Height']
test_data['Volume'] = test_data['Length'] * test_data['Diameter'] * test_data['Height']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data.drop(['id'], axis=1))

# Splitting data for cross-validation manually
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the TensorFlow model with improved architecture
model = Sequential([
    Conv1D(256, kernel_size=5, activation='linear', input_shape=(X_train.shape[1], 1)),
    LeakyReLU(alpha=0.3),
    BatchNormalization(),
    
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    
    Conv1D(128, kernel_size=3, activation='linear'),
    LeakyReLU(alpha=0.3),
    BatchNormalization(),
    
    LSTM(64),
    Dropout(0.5),
    
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    
    Dense(1, activation='linear')
])

# Compile the model
optimizer = Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])

# Define callbacks
callbacks = [
    EarlyStopping(patience=30),
    ReduceLROnPlateau(factor=0.5, patience=10)
]

# Train the model
log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history = model.fit(X_train.reshape(-1, X_train.shape[1], 1), y_train, 
                    validation_data=(X_val.reshape(-1, X_val.shape[1], 1), y_val), 
                    epochs=100, batch_size=128, callbacks=[callbacks, tensorboard_callback], verbose=1)


  super().__init__(


Epoch 1/100
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - loss: 79.9704 - mae: 8.2297 - val_loss: 9.9710 - val_mae: 2.3546 - learning_rate: 5.0000e-04
Epoch 2/100
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - loss: 12.9068 - mae: 2.7252 - val_loss: 4.4652 - val_mae: 1.3801 - learning_rate: 5.0000e-04
Epoch 3/100
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - loss: 8.9253 - mae: 2.2183 - val_loss: 4.0099 - val_mae: 1.3286 - learning_rate: 5.0000e-04
Epoch 4/100
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - loss: 7.3854 - mae: 1.9844 - val_loss: 3.9510 - val_mae: 1.3110 - learning_rate: 5.0000e-04
Epoch 5/100
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - loss: 6.3670 - mae: 1.8286 - val_loss: 3.9333 - val_mae: 1.3138 - learning_rate: 5.0000e-04
Epoch 6/100
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step

KeyboardInterrupt: 

In [5]:
tensorflow_preds = model.predict(test_data_scaled.reshape(-1, test_data_scaled.shape[1], 1)).flatten()

# Calculate RMSE for TensorFlow
tensorflow_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val.reshape(-1, X_val.shape[1], 1)).flatten()))
print(f'TensorFlow RMSE: {tensorflow_rmse}')

# Create submission file
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Rings': tensorflow_preds
})

# Save submission file
submission_df.to_csv('submission_tensorflow_cross_val_improved.csv', index=False)

[1m1888/1888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step
[1m567/567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
TensorFlow RMSE: 1.8963664709892039


In [6]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from datetime import datetime

# Load the data
train_data = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')

# Label Encoding
label_encoder = LabelEncoder()

# Apply label encoding to all columns
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        train_data[column] = label_encoder.fit_transform(train_data[column])
        test_data[column] = label_encoder.transform(test_data[column])

# Split features and target
X = train_data.drop(['id', 'Rings'], axis=1)
y = train_data['Rings']

# Feature Engineering
X['Volume'] = X['Length'] * X['Diameter'] * X['Height']
test_data['Volume'] = test_data['Length'] * test_data['Diameter'] * test_data['Height']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data.drop(['id'], axis=1))

# Splitting data for custom cross-validation manually
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize CatBoost model
catboost_model = cb.CatBoostRegressor(iterations=1500,
                                      learning_rate=0.03,
                                      depth=12,
                                      loss_function='RMSE',
                                      verbose=100)

# Train CatBoost model
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

# Predict with CatBoost model
catboost_preds = catboost_model.predict(test_data_scaled)

# Calculate RMSE for CatBoost
catboost_rmse = np.sqrt(mean_squared_error(y_val, catboost_model.predict(X_val)))
print(f'CatBoost RMSE: {catboost_rmse}')

# Create submission file
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Rings': catboost_preds
})

# Save submission file
submission_df.to_csv('submission_catboost_cross_val.csv', index=False)

0:	learn: 3.1174516	test: 3.1536600	best: 3.1536600 (0)	total: 132ms	remaining: 3m 18s
100:	learn: 1.8925154	test: 1.9494062	best: 1.9494062 (100)	total: 5.55s	remaining: 1m 16s
200:	learn: 1.8220066	test: 1.9076746	best: 1.9076746 (200)	total: 11.1s	remaining: 1m 11s
300:	learn: 1.7777243	test: 1.8929567	best: 1.8929567 (300)	total: 16.6s	remaining: 1m 6s
400:	learn: 1.7390345	test: 1.8858630	best: 1.8858630 (400)	total: 22.1s	remaining: 1m
500:	learn: 1.7032320	test: 1.8809483	best: 1.8809483 (500)	total: 27.6s	remaining: 55.1s
600:	learn: 1.6688656	test: 1.8777279	best: 1.8777279 (600)	total: 33.5s	remaining: 50.1s
700:	learn: 1.6380580	test: 1.8753931	best: 1.8753931 (700)	total: 39.1s	remaining: 44.5s
800:	learn: 1.6102367	test: 1.8736635	best: 1.8736635 (800)	total: 44.6s	remaining: 38.9s
900:	learn: 1.5863519	test: 1.8726506	best: 1.8726142 (897)	total: 50.1s	remaining: 33.3s
1000:	learn: 1.5624815	test: 1.8710664	best: 1.8710664 (1000)	total: 55.6s	remaining: 27.7s
1100:	learn:

In [7]:
submission_df['Rings'] = submission_df['Rings'].round().astype(int)

# Save submission file
submission_df.to_csv('submission_catboost_cross_val_rounded.csv', index=False)

In [8]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from datetime import datetime

# Load the data
train_data = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')

# Label Encoding
label_encoder = LabelEncoder()

# Apply label encoding to all columns
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        train_data[column] = label_encoder.fit_transform(train_data[column])
        test_data[column] = label_encoder.transform(test_data[column])

# Split features and target
X = train_data.drop(['id', 'Rings'], axis=1)
y = train_data['Rings']

# Feature Engineering
X['Volume'] = X['Length'] * X['Diameter'] * X['Height']
test_data['Volume'] = test_data['Length'] * test_data['Diameter'] * test_data['Height']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data.drop(['id'], axis=1))

# Splitting data for custom cross-validation manually
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize CatBoost model with optimized hyperparameters
catboost_model = cb.CatBoostRegressor(iterations=3000,
                                      learning_rate=0.02,
                                      depth=10,
                                      loss_function='RMSE',
                                      l2_leaf_reg=3,
                                      od_type='Iter',
                                      od_wait=200,
                                      verbose=200)

# Train CatBoost model
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)

# Predict with CatBoost model
catboost_preds = catboost_model.predict(test_data_scaled)

# Calculate RMSE for CatBoost
catboost_rmse = np.sqrt(mean_squared_error(y_val, catboost_model.predict(X_val)))
print(f'CatBoost RMSE: {catboost_rmse}')

# Create submission file
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Rings': catboost_preds
})

# Save submission file
submission_df.to_csv('submission_catboost_improved.csv', index=False)


0:	learn: 3.1345438	test: 3.1707063	best: 3.1707063 (0)	total: 25.8ms	remaining: 1m 17s
200:	learn: 1.8800356	test: 1.9392677	best: 1.9392677 (200)	total: 4.79s	remaining: 1m 6s
400:	learn: 1.8222286	test: 1.9043863	best: 1.9043863 (400)	total: 9.46s	remaining: 1m 1s
600:	learn: 1.7826420	test: 1.8909686	best: 1.8909686 (600)	total: 14.3s	remaining: 57.1s
800:	learn: 1.7489537	test: 1.8820964	best: 1.8820964 (800)	total: 19.4s	remaining: 53.2s
1000:	learn: 1.7164899	test: 1.8758082	best: 1.8758082 (1000)	total: 24.2s	remaining: 48.3s
1200:	learn: 1.6899979	test: 1.8712128	best: 1.8712128 (1200)	total: 28.9s	remaining: 43.3s
1400:	learn: 1.6651400	test: 1.8674098	best: 1.8673942 (1398)	total: 33.7s	remaining: 38.5s
1600:	learn: 1.6420368	test: 1.8643037	best: 1.8643037 (1600)	total: 38.5s	remaining: 33.6s
1800:	learn: 1.6208511	test: 1.8626963	best: 1.8626963 (1800)	total: 43.3s	remaining: 28.8s
2000:	learn: 1.6016316	test: 1.8615710	best: 1.8615710 (2000)	total: 48.1s	remaining: 24s
22

above gave best score 