In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,accuracy_score, r2_score
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset with appropriate column names
column_names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
df = pd.read_csv('abalone.csv', names=column_names)

In [3]:
print(df.columns)

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Rings'],
      dtype='object')


In [4]:
print(df.head())

  Sex  Length  Diameter  Height  Whole weight  Shucked weight  Viscera weight  \
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   

   Shell weight  Rings  
0         0.150     15  
1         0.070      7  
2         0.210      9  
3         0.155     10  
4         0.055      7  


In [5]:
print(df.tail())

     Sex  Length  Diameter  Height  Whole weight  Shucked weight  \
4172   F   0.565     0.450   0.165        0.8870          0.3700   
4173   M   0.590     0.440   0.135        0.9660          0.4390   
4174   M   0.600     0.475   0.205        1.1760          0.5255   
4175   F   0.625     0.485   0.150        1.0945          0.5310   
4176   M   0.710     0.555   0.195        1.9485          0.9455   

      Viscera weight  Shell weight  Rings  
4172          0.2390        0.2490     11  
4173          0.2145        0.2605     10  
4174          0.2875        0.3080      9  
4175          0.2610        0.2960     10  
4176          0.3765        0.4950     12  


In [6]:
# Check for missing values
missing_values = df.isnull().sum()
if missing_values.any():
    df.dropna(inplace=True)


In [7]:
# Check for abnormal values (e.g., "3+")
abnormal_values = df[df.applymap(lambda x: isinstance(x, str) and '+' in x)].count().sum()
if abnormal_values > 0:
    df.replace(to_replace=r'(\d+)\+', value=r'\1', regex=True, inplace=True)


In [8]:
# Check the columns in the DataFrame
print(df.columns)

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Rings'],
      dtype='object')


In [9]:
# Save the cleaned dataset to a new CSV file
df.to_csv('abalone_cleaned.csv', index=False)

In [68]:
# Split the dataset into features (X) and target (y)
X = df.drop('Rings', axis=1)
y = df['Rings']

In [69]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
# Define the expected feature order
expected_order = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Sex_I', 'Sex_M']

In [71]:
# One-hot encode the 'Sex' column in X_train and X_test with specific column names
X_train_encoded = pd.get_dummies(X_train, columns=['Sex'], drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=['Sex'], drop_first=True)

In [72]:
# Reindex the columns to ensure feature order consistency
X_train_encoded = X_train_encoded.reindex(columns=expected_order, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=expected_order, fill_value=0)

In [73]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [74]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [75]:
# Convert scaled arrays back to DataFrames for readability
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=expected_order)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=expected_order)

In [77]:
# Print feature names after scaling
print("\nFeature names after scaling for X_train:")
print(X_train_scaled_df.columns)


Feature names after scaling for X_train:
Index(['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Sex_I', 'Sex_M'],
      dtype='object')


In [78]:
print("\nFeature names after scaling for X_test:")
print(X_test_scaled_df.columns)


Feature names after scaling for X_test:
Index(['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Sex_I', 'Sex_M'],
      dtype='object')


In [79]:
# Select a suitable ML model (Linear Regression)
model = LinearRegression()

In [80]:
# Train the model
model.fit(X_train_scaled, y_train)

In [81]:
# Evaluate the model on the test set
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 4.891232447128581


In [82]:
# Calculate accuracy
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5481628137889262


In [83]:
# Calculate R-squared (coefficient of determination)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (Coefficient of Determination): {r2}")

R-squared (Coefficient of Determination): 0.5481628137889262


In [84]:
# Create two new records with reasonable values
new_records = pd.DataFrame({
    'Sex': ['M', 'F'],
    'Length': [0.55, 0.6],
    'Diameter': [0.45, 0.5],
    'Height': [0.15, 0.2],
    'Whole weight': [0.35, 0.4],
    'Shucked weight': [0.15, 0.2],
    'Viscera weight': [0.08, 0.1],
    'Shell weight': [0.1, 0.12]
})

In [85]:
# One-hot encode the 'Sex' column in new_records with specific column names
new_records_encoded = pd.get_dummies(new_records, columns=['Sex'], drop_first=True)

In [86]:
# Reindex the columns to ensure feature order consistency
new_records_encoded = new_records_encoded.reindex(columns=expected_order, fill_value=0)

# Predict the age of the abalones for the new records
new_records_scaled = scaler.transform(new_records_encoded)
predicted_age = model.predict(new_records_scaled)

In [87]:
# Print predicted age for new records
print("\nPredicted Age for New Records:")
for age in predicted_age:
    print(f"{age:.2f} years")


Predicted Age for New Records:
10.51 years
10.89 years


In [89]:
# Evaluate the model using 10-fold cross-validation
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=10)
print("Cross-Validation Scores:")
print(cv_scores)
print(f"Average Cross-Validation Score: {cv_scores.mean()}")


Cross-Validation Scores:
[0.48875933 0.4856133  0.52849386 0.53669465 0.52016077 0.43305257
 0.58719168 0.59212239 0.5532167  0.45149025]
Average Cross-Validation Score: 0.5176795511571957
