## Initialization

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

In [None]:
from google.colab import drive
from google.colab import files
# Mount Google Drive at /content/drive (Colab only); the dataset is read from
# this mount point in the next cell.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the abalone CSV on the mounted Google Drive.
abalone_csv_path = '/content/drive/My Drive/abalone.csv'
data = pd.read_csv(abalone_csv_path)

## Exploration

In [None]:
# include="all" also summarises the non-numeric Sex column (count / unique /
# top / freq) alongside the usual numeric statistics.
data.describe(include="all")

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
unique,3,,,,,,,,
top,M,,,,,,,,
freq,1528,,,,,,,,
mean,,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0


The rings in an abalone's shell are used to estimate its age: more rings mean an older abalone. The model below predicts the ring count (`Rings`), which stands in for the abalone's age.

In [None]:
# Column dtypes and non-null counts: shows which columns are non-numeric and
# whether any values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [None]:
# Number of missing values in each column.
data.isna().sum()

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64

## Preprocessing

In [None]:
# Target vector: the ring count, selected by column label.
y = data['Rings']
# learner view:
# y = what_goes_here?
y

0       15
1        7
2        9
3       10
4        7
        ..
4172    11
4173    10
4174     9
4175    10
4176    12
Name: Rings, Length: 4177, dtype: int64

In [None]:
# Every column except the target is a feature. Dropping 'Rings' by name
# (instead of slicing off the last column by position) keeps the target out of
# the features even if the CSV's column order changes, and matches the
# label-based selection used for y.
features = data.drop(columns='Rings')
# learner view:
# features = what_goes_here?
features

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550
...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960


In [None]:
# One-hot encode the only non-numeric column, Sex, into Sex_F / Sex_I / Sex_M;
# the numeric columns are carried over unchanged.
X = pd.get_dummies(features, columns=['Sex'])
# X = what_goes_here?
X

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,0,0,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,0,0,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,1,0,0
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,0,0,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,0,1,0
...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,1,0,0
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,0,0,1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,0,0,1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,1,0,0


In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3,
  random_state=3,
)
# Only the training features are turned into a plain NumPy array here; X_test
# stays a DataFrame and is converted with .values where it is used.
X_train = X_train.values
# Sanity check: features and labels must have the same number of rows per split.
print(X_train.shape[0], y_train.shape[0])
print(X_test.shape[0], y_test.shape[0])

2923 2923
1254 1254


## Training

In [None]:
# Baseline forest: only random_state is set; every other hyperparameter keeps
# its default value.
rf_model = RandomForestRegressor(random_state=1)
# Display the full hyperparameter set so the defaults are visible before tuning.
rf_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Train the baseline forest on the training split (X_train was converted to a
# NumPy array in the split cell above).
rf_model.fit(X_train, y_train)
# learner view:
# rf_model.what_goes_here?(X_train, y_train)

RandomForestRegressor(random_state=1)

In [None]:
# X_test is still a DataFrame, so pass .values to give predict the same plain
# array type the model was fitted on.
y_pred = rf_model.predict(X_test.values)
# learner view:
# y_pred = rf_model.predict(what_goes_here?)
y_pred

array([10.5 , 12.02, 11.3 , ...,  5.64, 10.43, 13.44])

## Testing

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the held-out ring counts and the forest's
# predictions for them.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"MSE: {mse}")

MSE: 4.621311084529506


Since MSE is not as easily interpreted as accuracy, let's compare the model's predictions with the actual ring counts for a few test examples.

In [None]:
from random import seed
from random import randint

seed(5)

def sample_predictions(model, n_examples=5, random_seed=5):
  """Print a few actual-vs-predicted ring counts and the test-set MSE for `model`.

  Reads the notebook-level `X_test` (DataFrame) and `y_test` (Series).

  Args:
    model: a fitted estimator whose `predict` accepts the array `X_test.values`.
    n_examples: how many randomly chosen test examples to print.
    random_seed: seed for choosing the examples. A private generator is
      created from it on every call, so the chosen rows do not depend on the
      module-level random state or on how often this function has already
      been called: every model is shown on the same test rows, and re-running
      a cell reproduces its output.
  """
  from random import Random  # local import: private RNG, global state untouched

  rng = Random(random_seed)
  # Predict the whole test set once and reuse it for both the printed examples
  # and the MSE, instead of calling predict again for every sampled row.
  model_predictions = model.predict(X_test.values)
  for _ in range(n_examples):
    # The index is a position within the test split, not a row label of the
    # original DataFrame.
    random_example_index = rng.randint(0, len(y_test) - 1)
    random_label = y_test.iloc[random_example_index]
    random_prediction = model_predictions[random_example_index]
    print(f"Example {random_example_index}: The abalone's actual ring count is {random_label}, predicted ring count is {random_prediction}.")
  print(f"MSE for this model is {mean_squared_error(y_test, model_predictions)}")

In [None]:
# Spot-check the baseline forest: five random test examples, then its test-set MSE.
sample_predictions(rf_model)

Example 523: The abalone's actual ring count is 13, predicted ring count is 14.72
Example 734: The abalone's actual ring count is 8, predicted ring count is 10.36
Example 1085: The abalone's actual ring count is 12, predicted ring count is 10.32
Example 59: The abalone's actual ring count is 14, predicted ring count is 10.2
Example 953: The abalone's actual ring count is 10, predicted ring count is 15.12
MSE for this model is 4.621311084529506


## Deployment

We've written enough 'deployment' print statements by now; let's try out some hyperparameter tuning in the "Iteration" section.

## Iteration

Let's try out a few different hyperparameter settings and see which version of the model performs best.

In [None]:
# First variation: max_features=0.3 is the only change from the baseline rf_model.
alternative_rf_model_1 = RandomForestRegressor(
  max_features=0.3,
  random_state=1,
)
# learner view:
# alternative_rf_model_1 = RandomForestRegressor(random_state=1, max_features=what_goes_here?)
alternative_rf_model_1.fit(X_train, y_train)
sample_predictions(alternative_rf_model_1)

Example 510: The abalone's actual ring count is 9, predicted ring count is 10.48
Example 106: The abalone's actual ring count is 10, predicted ring count is 11.77
Example 321: The abalone's actual ring count is 14, predicted ring count is 14.12
Example 231: The abalone's actual ring count is 10, predicted ring count is 9.89
Example 761: The abalone's actual ring count is 4, predicted ring count is 4.43
MSE for this model is 4.409356140350877


In [None]:
# Second variation: keep max_features=0.3 and grow 200 trees instead of the
# default 100 listed by get_params() earlier.
alternative_rf_model_2 = RandomForestRegressor(
  n_estimators=200,
  max_features=0.3,
  random_state=1,
)
alternative_rf_model_2.fit(X_train, y_train)
sample_predictions(alternative_rf_model_2)

Example 572: The abalone's actual ring count is 12, predicted ring count is 15.19
Example 372: The abalone's actual ring count is 16, predicted ring count is 12.95
Example 797: The abalone's actual ring count is 11, predicted ring count is 11.99
Example 326: The abalone's actual ring count is 8, predicted ring count is 9.15
Example 147: The abalone's actual ring count is 10, predicted ring count is 10.45
MSE for this model is 4.401135446570973


In [None]:
# Third variation: same max_features=0.3, now with 500 trees.
alternative_rf_model_3 = RandomForestRegressor(
  n_estimators=500,
  max_features=0.3,
  random_state=1,
)
alternative_rf_model_3.fit(X_train, y_train)
sample_predictions(alternative_rf_model_3)

Example 284: The abalone's actual ring count is 9, predicted ring count is 9.99
Example 911: The abalone's actual ring count is 10, predicted ring count is 10.868
Example 259: The abalone's actual ring count is 21, predicted ring count is 12.706
Example 270: The abalone's actual ring count is 11, predicted ring count is 10.624
Example 3: The abalone's actual ring count is 9, predicted ring count is 10.696
MSE for this model is 4.400594165869218
