<h1 style="margin-left: 20px; color:blue">1. Importing the required libraries</h1>

In [1]:
import requests
import zipfile
import io
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from category_encoders import OrdinalEncoder
import category_encoders as ce

<h1 style="margin-left: 20px; color:blue">2. Loading the data into the data frame</h1>

In [2]:
def load_abalone_data(url, timeout=30):
    """Download the Abalone zip archive and load ``abalone.data`` into a DataFrame.

    Parameters
    ----------
    url : str
        Address of a zip archive that contains a member named ``abalone.data``.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout (default 30), so a
        stalled connection fails instead of blocking the notebook forever.

    Returns
    -------
    pandas.DataFrame
        The header-less file parsed with the nine column names listed below.

    Raises
    ------
    RuntimeError
        If the response status code is anything other than 200.
        Errors raised by ``requests`` itself (connection failure, timeout)
        are not caught here and propagate to the caller.
    """
    # The data file ships without a header row, so the names are supplied here.
    column_names = [
        "Sex",  # Nominal
        "Length",  # Continuous
        "Diameter",  # Continuous
        "Height",  # Continuous
        "Whole weight",  # Continuous
        "Shucked weight",  # Continuous
        "Viscera weight",  # Continuous
        "Shell weight",  # Continuous
        "Rings"  # Integer
    ]

    # Step 1: Download the zip file from the given URL
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # RuntimeError is a subclass of Exception, so existing handlers still work.
        raise RuntimeError(f"Failed to download file from {url}")

    # Step 2: Open the archive in memory; the context managers close both the
    # archive and the extracted member once parsing is finished.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        # Step 3: Read the abalone.data file into a pandas DataFrame
        with archive.open('abalone.data') as data_file:
            df = pd.read_csv(data_file, header=None, names=column_names)

    return df

# Fetch the Abalone archive from the UCI repository and preview the first rows.
url = "https://archive.ics.uci.edu/static/public/1/abalone.zip"
df = load_abalone_data(url=url)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


<h1 style="margin-left: 20px; color:red">1. Label Encoding (Міткове кодування):</h1>

>Використовується для заміни категоріальних значень числовими мітками.

In [3]:
# Map every distinct "Sex" value to an integer label.
label_encoder = LabelEncoder()
# Build a new frame with .assign instead of adding the column in place, so the
# frame displayed by the previous cell is not silently modified.
df = df.assign(Sex_LabelEncoded=label_encoder.fit_transform(df['Sex']))
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_LabelEncoded
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,2
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,2
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,2
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,1


<h1 style="margin-left: 20px; color:red">2. One-Hot Encoding (Індикаторне кодування):</h1>

>Створюється бінарний стовпчик для кожної категорії, де 1 позначає належність до категорії.

In [4]:
# Reload the raw data so columns added by earlier cells are not carried over.
df = load_abalone_data(url)
# Replace "Sex" with one indicator column per category, prefixed with "Sex".
df = pd.get_dummies(
    df,
    prefix="Sex",
    columns=["Sex"],
)
df.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,False,False,True
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,False,False,True
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,True,False,False
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,False,False,True
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,False,True,False


<h1 style="margin-left: 20px; color:red">3. Binary Encoding (Бінарне кодування):</h1>

>Кожна категорія спочатку отримує порядковий номер, який записується у двійковій системі; кожен двійковий розряд стає окремим стовпчиком.

In [5]:
# Reload the raw data, then binary-encode the "Sex" column.
df = load_abalone_data(url)
encoder = ce.BinaryEncoder(
    cols=["Sex"],
)
df = encoder.fit_transform(df)
df.head()

Unnamed: 0,Sex_0,Sex_1,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


<h1 style="margin-left: 20px; color:red">4. Ordinal Encoding (Порядкове кодування):</h1>

>Використовується для присвоєння порядкових числових значень категоріям.

In [6]:
# Reload the raw data, then replace "Sex" with the explicit integer codes below.
df = load_abalone_data(url)
sex_codes = {"M": 1, "F": 2, "I": 3}
encoder = ce.OrdinalEncoder(
    cols=["Sex"],
    mapping=[{"col": "Sex", "mapping": sex_codes}],
)
df = encoder.fit_transform(df)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


<h1 style="margin-left: 20px; color:red">5. BaseN Encoding:</h1>

>Кодування на основі системи числення з базою N.

In [7]:
# Reload the raw data, then encode "Sex" with BaseN using base 2.
df = load_abalone_data(url)
encoder = ce.BaseNEncoder(
    base=2,
    cols=["Sex"],
)
df = encoder.fit_transform(df)
df.head()

Unnamed: 0,Sex_0,Sex_1,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [8]:
# Reload the raw data, then encode "Sex" with BaseN using base 3.
df = load_abalone_data(url)
encoder = ce.BaseNEncoder(
    base=3,
    cols=["Sex"],
)
df = encoder.fit_transform(df)
df.head()

Unnamed: 0,Sex_0,Sex_1,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


<h1 style="margin-left: 20px; color:red">6. Hashing Encoding:</h1>

>Кожна категорія перетворюється на хеш-значення фіксованої довжини.

In [9]:
# Reload the raw data, then hash "Sex" into three output columns.
df = load_abalone_data(url)
encoder = ce.HashingEncoder(
    n_components=3,
    cols=["Sex"],
)
df = encoder.fit_transform(df)
df.head()

Unnamed: 0,col_0,col_1,col_2,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,1,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


<h1 style="margin-left: 20px; color:red">7. Target Encoding (Кодування цільової змінної):</h1>

>Кожна категорія замінюється середнім значенням цільової змінної для цієї категорії.

In [10]:
# Reload the raw data, then target-encode "Sex" using "Rings" as the target.
df = load_abalone_data(url)
target = df["Rings"]
encoder = ce.TargetEncoder(
    cols=["Sex"],
)
# The full frame (including "Rings") is passed as X; only "Sex" is listed in cols.
df = encoder.fit_transform(df, target)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,10.705497,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,10.705497,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,11.129304,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,10.705497,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,7.890462,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


<h1 style="margin-left: 20px; color:red">8. Sum Encoding (Сумарне кодування):</h1>

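>Категорії кодуються контрастами відхилення (значення 1, 0, −1): кожен стовпчик порівнює категорію із загальним середнім, а одна опорна категорія отримує −1 у всіх стовпчиках.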

In [11]:
# Reload the raw data, then apply sum (deviation) contrast coding to "Sex".
df = load_abalone_data(url)
encoder = ce.SumEncoder(
    cols=["Sex"],
)
df = encoder.fit_transform(df)
df.head()



Unnamed: 0,intercept,Sex_0,Sex_1,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,1.0,0.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,1.0,0.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.0,1.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,1.0,0.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,-1.0,-1.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
