In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, LogisticRegression

In [2]:
# Load the abalone dataset; the relative path is resolved against the kernel's working directory.
data = pd.read_csv('abalone.csv')

In [3]:
# Display the loaded frame (pandas elides the middle rows of a long frame).
data

Unnamed: 0,sex,length,diameter,height,whole-weight,shucked-weight,viscera-weight,shell-weight,rings
0,M,91,73,19,102.8,44.9,20.2,30.0,15
1,M,70,53,18,45.1,19.9,9.7,14.0,7
2,F,106,84,27,135.4,51.3,28.3,42.0,9
3,M,88,73,25,103.2,43.1,22.8,31.0,10
4,I,66,51,16,41.0,17.9,7.9,11.0,7
...,...,...,...,...,...,...,...,...,...
4172,F,113,90,33,177.4,74.0,47.8,49.8,11
4173,M,118,88,27,193.2,87.8,42.9,52.1,10
4174,M,120,95,41,235.2,105.1,57.5,61.6,9
4175,F,125,97,30,218.9,106.2,52.2,59.2,10


In [4]:
# Summarize each column's dtype and non-null count to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   int64  
 2   diameter        4177 non-null   int64  
 3   height          4177 non-null   int64  
 4   whole-weight    4177 non-null   float64
 5   shucked-weight  4177 non-null   float64
 6   viscera-weight  4177 non-null   float64
 7   shell-weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 293.8+ KB


Preprocessing + Training Function

In [5]:
def preprocess_and_train(df, target, task, train_size=0.7, random_state=1):
    """Preprocess ``df``, fit a linear model for ``target`` and score it on held-out rows.

    Steps: copy the frame, one-hot encode ``sex`` (unless it is the target),
    split into train/test sets, standardize the features with statistics
    learned from the training rows only, fit the model, and score it on the
    test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing ``target`` and, when ``target != 'sex'``, a
        ``sex`` column. The frame is copied, never modified. All remaining
        feature columns are assumed to be numeric.
    target : str
        Name of the column to predict.
    task : {'regression', 'classification'}
        ``'regression'`` fits ``LinearRegression``; ``'classification'`` fits
        ``LogisticRegression``.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int, default 1
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    float
        ``model.score`` evaluated on the test split (R^2 for the regression
        model, mean accuracy for the classification model).

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported values.
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Validate up front. Previously an unsupported task only surfaced at fit
    # time as an UnboundLocalError on `model`, after all preprocessing had run.
    if task not in ('regression', 'classification'):
        raise ValueError(
            "task must be 'regression' or 'classification', got {!r}".format(task)
        )
    if target not in df.columns:
        raise KeyError("target column {!r} not found in df".format(target))

    df = df.copy()

    # If the sex column is not the target, one-hot encode it.
    # dtype=float keeps the feature matrix purely numeric regardless of the
    # default dummy dtype of the installed pandas version.
    if target != 'sex':
        dummies = pd.get_dummies(df['sex'], dtype=float)
        df = pd.concat([df.drop('sex', axis=1), dummies], axis=1)

    # Split target from df
    y = df[target]
    X = df.drop(target, axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    # Scale X: fit on the training rows only so no test-set statistics leak
    # into the model; keep the original row labels on the scaled frames.
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(
        scaler.transform(X_train), index=X_train.index, columns=X.columns
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), index=X_test.index, columns=X.columns
    )

    # Define model (task was validated above, so exactly one branch applies)
    model = LinearRegression() if task == 'regression' else LogisticRegression()

    # Fit model to train set
    model.fit(X_train, y_train)

    # Return the test results
    return model.score(X_test, y_test)

Predicting Sex Column

In [7]:
# Re-display the frame before modelling; the visible rows match the earlier display.
data

Unnamed: 0,sex,length,diameter,height,whole-weight,shucked-weight,viscera-weight,shell-weight,rings
0,M,91,73,19,102.8,44.9,20.2,30.0,15
1,M,70,53,18,45.1,19.9,9.7,14.0,7
2,F,106,84,27,135.4,51.3,28.3,42.0,9
3,M,88,73,25,103.2,43.1,22.8,31.0,10
4,I,66,51,16,41.0,17.9,7.9,11.0,7
...,...,...,...,...,...,...,...,...,...
4172,F,113,90,33,177.4,74.0,47.8,49.8,11
4173,M,118,88,27,193.2,87.8,42.9,52.1,10
4174,M,120,95,41,235.2,105.1,57.5,61.6,9
4175,F,125,97,30,218.9,106.2,52.2,59.2,10


In [8]:
# Classify `sex` from the remaining columns; the returned score is test-set accuracy.
results = preprocess_and_train(df=data, task='classification', target='sex')

print("Sex Classification Accuracy: {:.2f}%".format(100 * results))

Sex Classification Accuracy: 57.10%


Predicting Length Column

In [9]:
# Regress `length` on all other columns; the returned score is the test-set R^2.
results = preprocess_and_train(df=data, task='regression', target='length')

print("Length Regression R^2: {:.4f}".format(results))

Length Regression R^2: 0.9753


Predicting Diameter Column

In [10]:
# Regress `diameter` on all other columns; the returned score is the test-set R^2.
results = preprocess_and_train(df=data, task='regression', target='diameter')

print("Diameter Regression R^2: {:.4f}".format(results))

Diameter Regression R^2: 0.9758


Predicting Height Column

In [11]:
# Regress `height` on all other columns; the returned score is the test-set R^2.
results = preprocess_and_train(df=data, task='regression', target='height')

print("Height Regression R^2: {:.4f}".format(results))

Height Regression R^2: 0.8147


Predicting Whole-Weight Column

In [12]:
# Regress `whole-weight` on all other columns; the returned score is the test-set R^2.
results = preprocess_and_train(df=data, task='regression', target='whole-weight')

print("Whole-Weight Regression R^2: {:.4f}".format(results))

Whole-Weight Regression R^2: 0.9908
