In [1]:
# Import necessary libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,accuracy_score, r2_score
from sklearn.model_selection import cross_val_score

In [2]:
# Load the dataset with appropriate column names
column_names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
df = pd.read_csv('abalone.csv', names=column_names)

In [3]:
# Print number of rows and columns
print(df.shape)

(4177, 9)


In [4]:
print(df.columns)

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Rings'],
      dtype='object')


In [5]:
# Print first five rows 
print(df.head())

  Sex  Length  Diameter  Height  Whole weight  Shucked weight  Viscera weight  \
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   

   Shell weight  Rings  
0         0.150     15  
1         0.070      7  
2         0.210      9  
3         0.155     10  
4         0.055      7  


In [6]:
# Print last five rows 
print(df.tail())

     Sex  Length  Diameter  Height  Whole weight  Shucked weight  \
4172   F   0.565     0.450   0.165        0.8870          0.3700   
4173   M   0.590     0.440   0.135        0.9660          0.4390   
4174   M   0.600     0.475   0.205        1.1760          0.5255   
4175   F   0.625     0.485   0.150        1.0945          0.5310   
4176   M   0.710     0.555   0.195        1.9485          0.9455   

      Viscera weight  Shell weight  Rings  
4172          0.2390        0.2490     11  
4173          0.2145        0.2605     10  
4174          0.2875        0.3080      9  
4175          0.2610        0.2960     10  
4176          0.3765        0.4950     12  


In [7]:
# Display basic information about the dataset
print("Dataset Information:")
print(df.info())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB
None


In [8]:
# Count duplicate rows in the DataFrame
duplicate_count = df.duplicated().sum()

# Print the count of duplicate rows
print("Duplicate Rows:", duplicate_count)

Duplicate Rows: 0


In [9]:
# Check for missing values after cleaning.
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64


In [10]:
# Check for missing values
missing_values = df.isnull().sum()
if missing_values.any():
    df.dropna(inplace=True)

In [11]:
# Check for abnormal values (e.g., "3+")
abnormal_values = df[df.applymap(lambda x: isinstance(x, str) and '+' in x)].count().sum()
if abnormal_values > 0:
    df.replace(to_replace=r'(\d+)\+', value=r'\1', regex=True, inplace=True)

In [12]:
# Save the cleaned dataset to a new CSV file
df.to_csv('cleaned_abalone.csv', index=False)

# Modelling

In [13]:
# Split the dataset into features (X) and target (y)
X = df.drop('Rings', axis=1)
y = df['Rings']

In [14]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
# Define the expected feature order to ensure that during predicting, the same order is used as during fitting
feature_order = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Sex_I', 'Sex_M']

In [16]:
# One-hot encode the 'Sex' column in X_train and X_test with specific column names
encoded_X_train = pd.get_dummies(X_train, columns=['Sex'], drop_first=True)
encoded_X_test = pd.get_dummies(X_test, columns=['Sex'], drop_first=True)

In [17]:
# Reindex the columns to ensure feature order consistency
encoded_X_train = encoded_X_train.reindex(columns=feature_order, fill_value=0)
encoded_X_test = encoded_X_test.reindex(columns=feature_order, fill_value=0)

In [18]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(encoded_X_train)
X_test_scaled = scaler.transform(encoded_X_test)

# LINEAR REGRESSION

In [19]:
# Select a suitable ML model (Linear Regression)
model = LinearRegression()

In [20]:
# Train the model
model.fit(X_train_scaled, y_train)

In [21]:
# Evaluate the model on the test set
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 4.891232447128581


In [22]:
# Calculate accuracy
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5481628137889262


In [23]:
# Calculate R-squared (coefficient of determination)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (Coefficient of Determination): {r2}")

R-squared (Coefficient of Determination): 0.5481628137889262


# DEFINING NEW DATA RECORDS

In [31]:
# Create two new records with reasonable values
new_predicts = pd.DataFrame({
    'Sex': ['M', 'F'],
    'Length': [0.35, 0.65],
    'Diameter': [0.75, 0.6],
    'Height': [0.25, 0.4],
    'Whole weight': [0.4, 0.35],
    'Shucked weight': [0.15, 0.2],
    'Viscera weight': [0.08, 0.1],
    'Shell weight': [0.1, 0.12]
})

# ENCODING NEW DATA

In [37]:
# One-hot encode the 'Sex' column in new_predicts with specific column names
new_predicts_encoded = pd.get_dummies(new_predicts, columns=['Sex'], drop_first=True)

In [38]:
# Reindex the columns to ensure feature order consistency
new_predicts_encoded = new_predicts_encoded.reindex(columns=feature_order, fill_value=0)

# Predict the age of the abalones for the new records
new_predicts_scaled = scaler.transform(new_predicts_encoded)
predicted_age = model.predict(new_predicts_scaled)

# PREDICTING

In [39]:
# Print predicted age for new records
print("\nPredicted Age for New Records:")
for age in predicted_age:
    print(f"{age:.2f} years")


Predicted Age for New Records:
15.38 years
13.64 years


# K-CROSS VALIDATION.

K=10

In [40]:
# Evaluate the model using 10-fold cross-validation
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=10)
print("Cross-Validation Scores:")
print(cv_scores)
print(f"Average Cross-Validation Score: {cv_scores.mean()}")


Cross-Validation Scores:
[0.48875933 0.4856133  0.52849386 0.53669465 0.52016077 0.43305257
 0.58719168 0.59212239 0.5532167  0.45149025]
Average Cross-Validation Score: 0.5176795511571957


In [None]:
# Reverse transform the predicted clusters to human-readable form
predicted_income_readable = encoders[col].inverse_transform(predicted_income)
print("\nPredicted Clusters for New Records (Human-Readable):")
print(predicted_income_readable)