# 1. Introduction

  Embark on a data-driven odyssey as we unravel the mystery of abalone age prediction. Through advanced machine learning techniques, we delve into the depths of physical attributes to accurately estimate the age of these enigmatic marine creatures. Join us in this voyage of discovery and innovation as we navigate through the intricate web of data to unlock the secrets hidden within the shells of abalones.

# 2. Loading Dataset

In [1]:
# Mount Google Drive so files stored under /content/drive are readable from
# this runtime. `google.colab` is only available inside Google Colab.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
from pathlib import Path
from zipfile import ZipFile

# Archive on the mounted Drive and the local directory it is unpacked into.
file_path = "/content/drive/MyDrive/Abalone_Dataset.zip"

data_path = Path("data/")

# CSV files the rest of the notebook reads from the extracted archive.
expected_files = ["train.csv", "test.csv", "sample_submission.csv"]

# Skip only when every expected file is already present. Checking that the
# directory merely exists would also skip after an empty or interrupted
# extraction and leave the later read_csv calls failing.
if all((data_path / name).is_file() for name in expected_files):
  print("Data already extracted, skipping.")

else:
  data_path.mkdir(exist_ok=True, parents=True)

  with ZipFile(file_path, "r") as zip_file:
    zip_file.extractall(data_path)

  print("Done!!!")

Done!!!


# 3. Exploring data and EDA

In [3]:
import pandas as pd

# Read from the same `data_path` directory the archive was extracted into,
# rather than a hardcoded absolute "/content/data" path that only matches
# when the working directory happens to be /content.
train_data = pd.read_csv(data_path / "train.csv")
test_data = pd.read_csv(data_path / "test.csv")
ss = pd.read_csv(data_path / "sample_submission.csv")

In [4]:
# Quick look at the test set only: it is held-out data, so we do not explore it further.
test_data.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,90615,M,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,90616,M,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,90617,M,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,90618,M,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,90619,I,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


In [5]:
# Preview the first rows of the training set.
train_data.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [6]:
# (rows, columns) of the training set.
train_data.shape

(90615, 10)

In [7]:
# Count missing values per column.
train_data.isnull().sum()

id                0
Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Whole weight.1    0
Whole weight.2    0
Shell weight      0
Rings             0
dtype: int64

In [8]:
# Drop the row identifier; it is not used as a feature. Reassigning (instead of
# inplace=True) with errors="ignore" keeps this cell safe to re-run: a second
# run no longer raises KeyError because "id" is already gone.
train_data = train_data.drop(columns=["id"], errors="ignore")

In [9]:
# Display the frame to confirm the id column is gone.
train_data

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6
90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6


In [10]:
# Distinct categories present in the Sex column.
pd.unique(train_data.Sex)

array(['F', 'I', 'M'], dtype=object)

In [11]:
# Number of rows per Sex category.
train_data.Sex.value_counts()

Sex
I    33093
M    31027
F    26495
Name: count, dtype: int64

In [12]:
# Column dtypes and non-null counts before encoding the Sex column.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             90615 non-null  object 
 1   Length          90615 non-null  float64
 2   Diameter        90615 non-null  float64
 3   Height          90615 non-null  float64
 4   Whole weight    90615 non-null  float64
 5   Whole weight.1  90615 non-null  float64
 6   Whole weight.2  90615 non-null  float64
 7   Shell weight    90615 non-null  float64
 8   Rings           90615 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 6.2+ MB


In [13]:
# Encode the 'Sex' categories as integers so the column is numeric for the models.
# `.map(...).astype("int64")` pins the dtype explicitly instead of relying on
# `replace` silently downcasting object -> int (deprecated in recent pandas),
# and an unexpected category fails loudly because NaN cannot be cast to int.
# The numeric-dtype guard makes the cell a no-op when re-run after encoding.
if not pd.api.types.is_numeric_dtype(train_data["Sex"]):
  train_data["Sex"] = train_data["Sex"].map({'I': 2, 'M': 0, 'F': 1}).astype("int64")

In [14]:
# Re-inspect dtypes to confirm the Sex column is now numeric.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             90615 non-null  int64  
 1   Length          90615 non-null  float64
 2   Diameter        90615 non-null  float64
 3   Height          90615 non-null  float64
 4   Whole weight    90615 non-null  float64
 5   Whole weight.1  90615 non-null  float64
 6   Whole weight.2  90615 non-null  float64
 7   Shell weight    90615 non-null  float64
 8   Rings           90615 non-null  int64  
dtypes: float64(7), int64(2)
memory usage: 6.2 MB


In [15]:
# Re-check for missing values after encoding.
train_data.isnull().sum()

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Whole weight.1    0
Whole weight.2    0
Shell weight      0
Rings             0
dtype: int64

In [16]:
# Shape after preprocessing (the id column has been dropped).
train_data.shape

(90615, 9)

# 4. Splitting data into train and eval

In [17]:
# Separate the target column (Rings) from the predictor columns.
y = train_data["Rings"]
X = train_data.drop(columns=["Rings"])

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
# Row counts of each split part, in the order X_train, X_test, y_train, y_test.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(72492, 18123, 72492, 18123)

# 5. Selecting Model and evaluation metrics

Since mentioned in competition to use  Root Mean Squared Logarithmic Error.
RMSE

In [20]:
import numpy as np

from sklearn.metrics import mean_squared_log_error

def rmsle(y_true, y_pred):
  """Return the Root Mean Squared Logarithmic Error of the predictions.

  Args:
    y_true: Array-like of ground-truth target values.
    y_pred: Array-like of predicted values.

  Returns:
    The square root of `mean_squared_log_error(y_true, y_pred)`.

  Raises:
    ValueError: propagated from `mean_squared_log_error` if `y_true`
      contains negative values.
  """
  # mean_squared_log_error rejects any negative input, and unconstrained
  # regressors can emit negative predictions. Clip them to 0 so a single stray
  # prediction does not abort the whole evaluation.
  y_pred = np.clip(y_pred, 0, None)
  return np.sqrt(mean_squared_log_error(y_true, y_pred))


In [21]:
# (rows, feature columns) of the training split.
X_train.shape

(72492, 8)

In [27]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet

# Candidate regressors keyed by display name, otherwise left at their default
# hyperparameters. The tree ensembles are seeded (same seed as the train/eval
# split) so the comparison scores are reproducible across runs.
models = {
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "Xgboost": XGBRegressor(random_state=42),
    "ElasticNet": ElasticNet(),
}



In [29]:
def fit_and_pred(X_train, X_test, y_train, y_test, model):
  """Fit each candidate estimator and score it on the evaluation split.

  Args:
    X_train: Features used for fitting.
    X_test: Features used for scoring.
    y_train: Targets used for fitting.
    y_test: Targets used for scoring.
    model: Dict mapping a display name to an estimator exposing `fit` and
      `predict`. Each estimator is fitted in place.

  Returns:
    Dict of the form {name: {"Score": <rmsle on the evaluation split>}}.
  """
  def _fit_then_score(estimator):
    # Train first, then score the predictions for the evaluation features.
    estimator.fit(X_train, y_train)
    return rmsle(y_test, estimator.predict(X_test))

  return {
      label: {"Score": _fit_then_score(estimator)}
      for label, estimator in model.items()
  }


In [30]:
# Fit and score every candidate model on the train / evaluation split.
results = fit_and_pred(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, model=models)

In [31]:
# RMSLE per model on the evaluation split (lower is better).
results

{'Lasso': {'Score': 0.2835872355446238},
 'RandomForest': {'Score': 0.15582447001408908},
 'GradientBoosting': {'Score': 0.1556113639917533},
 'Xgboost': {'Score': 0.15267211089766233},
 'ElasticNet': {'Score': 0.26380845533410946}}

# 6. Hyperparameter tuning

**Note:-** I focused on hyperparameter tuning for XGBoost to maximize predictive performance within the constraints of limited compute resources, demonstrating efficient model optimization to potential employers.

In [32]:
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

# Define the XGBoost regressor
xgb = XGBRegressor()

# Define the RMSLE scoring function
def rmsle_scorer(y_true, y_pred):
    """Return the negated RMSLE, so that a larger value means a better model.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values. Negatives are clipped to 0 because
            `mean_squared_log_error` rejects negative inputs.

    Returns:
        ``-sqrt(mean_squared_log_error(y_true, y_pred))``.
    """
    msle = mean_squared_log_error(y_true, np.clip(y_pred, 0, None))
    return -np.sqrt(msle)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Define the RandomizedSearchCV object.
# A `scoring` callable is invoked as scorer(estimator, X, y), so the
# (y_true, y_pred) metric above must be wrapped with make_scorer. Passed bare,
# every scoring call fails, the fold scores become NaN, and the search cannot
# rank the candidates. rmsle_scorer already returns "higher is better", which
# matches make_scorer's default greater_is_better=True.
random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid,
                                   n_iter=10, scoring=make_scorer(rmsle_scorer), cv=5,
                                   verbose=2, random_state=42, n_jobs=-1)

# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [35]:
# Pull the winning configuration and the estimator refitted with it out of the search.
best_params = random_search.best_params_
best_estimator = random_search.best_estimator_

# Report the selected hyperparameters.
print("Best Parameters:")
print(best_params)

# Score the refitted estimator on the held-out evaluation split.
y_pred = best_estimator.predict(X_test)
rmsle_score = rmsle(y_test, y_pred)
print("RMSLE Score on Test Set:", rmsle_score)


Best Parameters:
{'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
RMSLE Score on Test Set: 0.15284182816662664


In [39]:
# Build the final model from the search result itself rather than re-typing the
# printed hyperparameters, so it cannot go stale when the search is re-run.
model = XGBRegressor(**random_search.best_params_)

model.fit(X_train, y_train)

In [40]:
# Predict on the training split to measure the fit on data the model has seen.
# Note: this rebinds `y_pred` to training-set predictions.
y_pred = model.predict(X_train)

In [41]:
# RMSLE on the training split.
rmsle(y_train, y_pred)

0.14690661413669526

In [42]:
# Predict on the held-out evaluation split.
y_test_pred = model.predict(X_test)

In [43]:
# RMSLE on the held-out evaluation split; compare with the training score to gauge overfitting.
rmsle(y_test, y_test_pred)

0.15284182816662664

# 7. Testing model on test data

In [44]:
# Look at the competition test set again before preprocessing it.
test_data.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,90615,M,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,90616,M,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,90617,M,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,90618,M,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,90619,I,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


In [46]:
# Distinct Sex categories in the test set, to check against the encoding used for training.
pd.unique(test_data.Sex)

array(['M', 'I', 'F'], dtype=object)

In [48]:
# Encode the 'Sex' categories with the same integer codes used for the training data.
# `.map(...).astype("int64")` pins the dtype explicitly instead of relying on
# `replace` silently downcasting object -> int (deprecated in recent pandas),
# and an unexpected category fails loudly because NaN cannot be cast to int.
# The numeric-dtype guard makes the cell a no-op when re-run after encoding.
if not pd.api.types.is_numeric_dtype(test_data["Sex"]):
  test_data["Sex"] = test_data["Sex"].map({'I': 2, 'M': 0, 'F': 1}).astype("int64")

In [49]:
# Inspect dtypes and non-null counts of the test set after encoding.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60411 entries, 0 to 60410
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              60411 non-null  int64  
 1   Sex             60411 non-null  int64  
 2   Length          60411 non-null  float64
 3   Diameter        60411 non-null  float64
 4   Height          60411 non-null  float64
 5   Whole weight    60411 non-null  float64
 6   Whole weight.1  60411 non-null  float64
 7   Whole weight.2  60411 non-null  float64
 8   Shell weight    60411 non-null  float64
dtypes: float64(7), int64(2)
memory usage: 4.1 MB


In [50]:
# Preview the sample submission to see the expected output columns.
ss.head()

Unnamed: 0,id,Rings
0,90615,10
1,90616,10
2,90617,10
3,90618,10
4,90619,10


In [52]:
# Keep the ids aside before the column is dropped; they are needed for the submission file.
test_data_id = test_data.id

In [53]:
# Inspect the saved ids.
test_data_id

0         90615
1         90616
2         90617
3         90618
4         90619
          ...  
60406    151021
60407    151022
60408    151023
60409    151024
60410    151025
Name: id, Length: 60411, dtype: int64

In [54]:
# Drop the id column so the test features match the columns the model was
# trained on. Reassigning (instead of inplace=True) with errors="ignore" keeps
# this cell safe to re-run: a second run no longer raises KeyError.
test_data = test_data.drop(columns=["id"], errors="ignore")

In [62]:
# Predict Rings for the competition test set with the fitted model.
test_pred = model.predict(test_data)

In [63]:
# Inspect the raw (continuous) predictions.
test_pred

array([ 9.577033,  9.723614, 10.259902, ..., 12.056011, 13.594163,
        8.550706], dtype=float32)

In [66]:
import numpy as np

# Round each continuous prediction to the nearest integer and store it as int.
test_pred_round = np.rint(test_pred).astype(int)

In [67]:
# Inspect the rounded predictions.
test_pred_round

array([10, 10, 10, ..., 12, 14,  9])

In [68]:
# Assemble the submission frame: the saved test ids paired with the rounded Rings predictions.

submission_file = pd.DataFrame({"id": test_data_id, "Rings": test_pred_round})

In [72]:
# Inspect the submission frame before writing it out.
submission_file

Unnamed: 0,id,Rings
0,90615,10
1,90616,10
2,90617,10
3,90618,10
4,90619,8
...,...,...
60406,151021,6
60407,151022,10
60408,151023,12
60409,151024,14


In [71]:
# Write the submission to CSV without the DataFrame index column.
submission_file.to_csv('submission_file.csv', index=False)