<a href="https://colab.research.google.com/github/st3082group10/Abalone_Age_Prediction/blob/main/Abolone_Random_Forest_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the cleaned numeric abalone data set, drop implausible heights and
# integer-encode the categorical "Sex" column.
import io

import numpy as np
import pandas as pd
from google.colab import files
from sklearn.preprocessing import LabelEncoder

# To remove the scientific notation from numpy arrays
np.set_printoptions(suppress=True)

# Colab-only: opens a browser file picker. The cell expects the uploaded file
# to be named exactly 'abalone.csv' (that key is looked up below).
uploaded = files.upload()

aboloneAgeDataNumeric = pd.read_csv(io.BytesIO(uploaded['abalone.csv']))

# Keep only rows with 0 < Height < 1: this removes zero heights as well as
# heights of 1 or more. `.copy()` makes the filtered result an independent
# frame so the column assignment below does not write into a slice of the
# original (which would raise pandas' SettingWithCopyWarning).
aboloneAgeDataNumeric = aboloneAgeDataNumeric[
    (aboloneAgeDataNumeric['Height'] > 0) & (aboloneAgeDataNumeric['Height'] < 1)
].copy()

# Create an instance of LabelEncoder
le = LabelEncoder()

# Encode the "Sex" variable as integer codes; the code -> label mapping
# remains available afterwards through `le.classes_`.
aboloneAgeDataNumeric["Sex"] = le.fit_transform(aboloneAgeDataNumeric["Sex"])

aboloneAgeDataNumeric.head()


Saving abalone.csv to abalone.csv


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
from sklearn.model_selection import train_test_split

# Column roles: the single response column and the eight explanatory columns.
TargetVariable = ['Rings']
Predictors = [
    'Sex', 'Length', 'Diameter', 'Height',
    'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
]

# Pull plain NumPy arrays out of the frame. Because TargetVariable is a list,
# y comes out two-dimensional with a single column.
X = aboloneAgeDataNumeric[Predictors].values
y = aboloneAgeDataNumeric[TargetVariable].values

# Seed NumPy's global generator (the split itself is pinned by random_state).
np.random.seed(123)

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Quick sanity check: shapes of the training and testing arrays.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(3339, 8)
(3339, 1)
(835, 8)
(835, 1)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Flatten the targets to 1-D before fitting. `ravel()` leaves an array that is
# already 1-D unchanged, so re-running this cell is harmless.
y_train = y_train.ravel()
y_test = y_test.ravel()

# Define the hyperparameter grid to search over (3 * 3 * 4 = 36 combinations)
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_depth': [8, 10, 12],
    'min_samples_split': [4, 6, 8, 10]
}

# Create a random forest model; the fixed seed keeps every fit reproducible
rf = RandomForestRegressor(random_state=123)

# Perform a grid search with 5-fold cross-validation. The scorer is the
# *negative* MSE (scikit-learn maximises scores), hence the sign flip when
# printing. n_jobs=-1 spreads the 36 x 5 fits across all available cores; it
# only affects wall-clock time, not the selected parameters or the score.
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding cross-validated MSE
print("Best parameters:", grid_search.best_params_)
print("Validation score:", -grid_search.best_score_)


Best parameters: {'max_depth': 8, 'min_samples_split': 10, 'n_estimators': 500}
Validation score: 4.8390489475984095


In [None]:
# Unpack the winning hyperparameters from the grid search in one step.
mx_depth, min_split, n_est = (
    grid_search.best_params_[key]
    for key in ('max_depth', 'min_samples_split', 'n_estimators')
)

In [None]:
# Build a fresh forest configured with the hyperparameters chosen by the grid
# search, seeded identically to the search's base estimator.
best_rf_model = RandomForestRegressor(
    n_estimators=n_est,
    max_depth=mx_depth,
    min_samples_split=min_split,
    random_state=123,
)

# Fit on the complete training split.
best_rf_model.fit(X_train, y_train)

# In-sample predictions (training split).
y_train_pred = best_rf_model.predict(X_train)

# Out-of-sample predictions (held-out test split).
y_test_pred = best_rf_model.predict(X_test)


In [None]:
# Round the continuous training predictions to whole ring counts.
Train_Data_round = np.round(y_train_pred)

# Training predictors alongside the observed and the rounded predicted rings.
TrainingData = pd.DataFrame(X_train, columns=Predictors).assign(
    Rings=y_train,
    PredictedRings=Train_Data_round,
)

In [None]:
# Round the continuous test predictions to whole ring counts.
Test_Data_round = np.round(y_test_pred)

# Test predictors alongside the observed and the rounded predicted rings.
TestingData = pd.DataFrame(X_test, columns=Predictors).assign(
    Rings=y_test,
    PredictedRings=Test_Data_round,
)

In [None]:
# Preview the first five test rows with actual vs. predicted rings.
TestingData.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,PredictedRings
0,0.0,0.58,0.46,0.175,1.165,0.65,0.2205,0.3055,9,10.0
1,2.0,0.585,0.475,0.15,1.065,0.5315,0.199,0.2885,10,9.0
2,0.0,0.435,0.335,0.11,0.38,0.1695,0.086,0.11,9,9.0
3,2.0,0.605,0.455,0.16,1.1035,0.421,0.3015,0.325,9,13.0
4,0.0,0.35,0.275,0.065,0.205,0.0745,0.0465,0.07,10,8.0


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# All metrics below compare the observed ring counts with the *rounded*
# predictions stored in the 'PredictedRings' column.

# Mean Squared Error (MSE) on each split
train_mse = mean_squared_error(
    TrainingData['Rings'], TrainingData['PredictedRings']
)
test_mse = mean_squared_error(
    TestingData['Rings'], TestingData['PredictedRings']
)

# Root Mean Squared Error (RMSE) is the square root of the MSE
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

# Coefficient of determination (R squared) on each split
train_r2 = r2_score(
    TrainingData['Rings'], TrainingData['PredictedRings']
)
test_r2 = r2_score(
    TestingData['Rings'], TestingData['PredictedRings']
)

# Compute the MAPE for testing
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to flat float arrays and compared by position,
    so lists, tuples, NumPy arrays (including single-column 2-D arrays) and
    pandas Series are all accepted. Series are deliberately NOT aligned by
    index label; element i of ``y_true`` is always paired with element i of
    ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Observed values. These are the denominators, so a zero here yields
        an infinite or NaN result (NumPy emits a RuntimeWarning).
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # ravel() prevents an (n, 1) vs (n,) pair from silently broadcasting to
    # an (n, n) matrix and producing a meaningless average.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements "
            "(got {} and {})".format(y_true.size, y_pred.size)
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# MAPE on the test split (observed rings vs. rounded predictions)
test_mape = mape(TestingData['Rings'], TestingData['PredictedRings'])

# Report every evaluation metric to three decimal places
for template, value in (
    ('Train MSE: {:.3f}', train_mse),
    ('Test MSE: {:.3f}', test_mse),
    ('Train RMSE: {:.3f}', train_rmse),
    ('Test RMSE: {:.3f}', test_rmse),
    ('Train R^2: {:.3f}', train_r2),
    ('Test R^2: {:.3f}', test_r2),
    ('Testing MAPE: {:.3f}', test_mape),
):
    print(template.format(value))

Train MSE: 3.028
Test MSE: 4.140
Train RMSE: 1.740
Test RMSE: 2.035
Train R^2: 0.715
Test R^2: 0.559
Testing MAPE: 14.403


In [None]:
# Absolute Percent Error (APE) per row, in percent, for the training set.
# The error is measured against the rounded predictions in 'PredictedRings'.
Train_APE = 100 * (
    (TrainingData['Rings'] - TrainingData['PredictedRings']).abs()
    / TrainingData['Rings']
)

# Absolute Percent Error (APE) per row, in percent, for the testing set
Test_APE = 100 * (
    (TestingData['Rings'] - TestingData['PredictedRings']).abs()
    / TestingData['Rings']
)

# "Accuracy" here is defined as 100 minus the mean APE (i.e. 100 - MAPE);
# it is not a classification accuracy.
print("The accuracy of the RF (training set) model is:", 100 - np.mean(Train_APE))
print('The accuracy of RF (testing set) model is:', 100 - np.mean(Test_APE))

The accuracy of the RF (training test) model is: 87.9460715847427
The accuracy of RF (testing set) model is: 85.59663737403064


In [None]:
# Pearson correlation between the observed test targets and the raw
# (unrounded) model predictions: the off-diagonal entry of the 2x2 matrix.
corr_matrix = np.corrcoef(y_test, y_test_pred)
correlation = corr_matrix[0, 1]
print(correlation)

0.7601629808983599
