In [1]:
import pandas as pd

# Load the fish measurements; 'Fish.csv' is resolved relative to the working directory.
fish_data = pd.read_csv('Fish.csv')

# DataFrame.info() prints its report and returns None, so it runs as its own
# statement instead of being packed into the cell's result tuple (where it
# would only contribute a stray None).
fish_data.info()

# Show the first rows and the summary statistics through the notebook's rich
# display rather than the plain-text repr of a tuple. `display` is provided by
# the IPython/Jupyter kernel without an import.
display(fish_data.head(), fish_data.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


(  Species  Weight  Length1  Length2  Length3   Height   Width
 0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
 1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
 2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
 3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
 4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340,
             Weight     Length1     Length2     Length3      Height       Width
 count   159.000000  159.000000  159.000000  159.000000  159.000000  159.000000
 mean    398.326415   26.247170   28.415723   31.227044    8.970994    4.417486
 std     357.978317    9.996441   10.716328   11.610246    4.286208    1.685804
 min       0.000000    7.500000    8.400000    8.800000    1.728400    1.047600
 25%     120.000000   19.050000   21.000000   23.150000    5.944800    3.385650
 50%     273.000000   25.200000   27.300000   29.400000    7.786000    4.248500
 75%     650.000000   32.700000   35.500000   39.650000   12.

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Number of distinct species; reported next to the RMSE at the end of this cell.
unique_species = fish_data['Species'].nunique()

# Features are every column except the target; the target is the fish weight.
X = fish_data.drop('Weight', axis=1)
y = fish_data['Weight']

# 'Species' is the only categorical column; every other feature column is
# treated as numeric and forwarded to the model unchanged.
categorical_features = ['Species']
numerical_features = X.columns.difference(categorical_features)

# Pass the numeric columns through and one-hot encode the species.
# handle_unknown='ignore' makes a species that was absent while fitting encode
# as all zeros instead of raising at predict time (e.g. a rare species that
# lands only in the test split, or new data fed to the saved model).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bundle preprocessing and the random forest in one pipeline so the same
# encoding is applied at fit time and at predict time.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# RMSE computed as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# taking the root explicitly keeps this cell working on old and new versions.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

unique_species, rmse


(7, 67.73176537816693)

In [3]:
# `y_pred` and `rmse` were already computed when the model was evaluated in the
# training cell, so they are reused here instead of predicting and scoring the
# same test set a second time.

# Display the RMSE value
print(f"Root Mean Square Error (RMSE): {rmse}")

# Display the first 5 actual vs predicted values. `y_test` is a pandas Series
# whose index was shuffled by the split, so `.iloc` makes it explicit that the
# first five rows are taken by position, matching the order of `y_pred`.
print("\nActual vs Predicted weights for the first 5 fish in the test set:")
for actual, predicted in zip(y_test.iloc[:5], y_pred[:5]):
    print(f"Actual: {actual}, Predicted: {predicted:.2f}")


Root Mean Square Error (RMSE): 67.73176537816693

Actual vs Predicted weights for the first 5 fish in the test set:
Actual: 78.0, Predicted: 84.09
Actual: 13.4, Predicted: 11.35
Actual: 200.0, Predicted: 308.15
Actual: 270.0, Predicted: 270.21
Actual: 150.0, Predicted: 139.40


In [4]:
import pickle

# Persist the fitted pipeline (preprocessing + regressor) to disk via pickle.
# NOTE: a pickle file should only ever be loaded back from a trusted source.
with open('fish_weight_predictor.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
