In [1]:
import pandas as pd
import seaborn as sbn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

### Load the data

In [2]:
# Relative path to the raw CSV. Forward slashes resolve on Windows, macOS and
# Linux alike, whereas a backslash-separated path only works on Windows.
dataset = '../data/fish_shellfish_dataset.csv'
# header=0: the first line of the file supplies the column names.
df = pd.read_csv(dataset, header=0)

### Inspect the data

In [3]:
# Preview the top of the frame (first 5 rows)
df.head(n=5)

Unnamed: 0,name,type,price_dk,season_availability,weight_g,length_cm,width_cm,height_cm,cost_dk,year,freight_charge
0,Aborre,fish,188.32,Spring,706.4,128.9,15.3,10.4,133.03,2021,2.6
1,Blåmusling,shellfish,202.72,Spring,2485.0,54.8,24.1,19.4,503.76,2021,1.5
2,Line_blåmusling,shellfish,73.92,Summer,3761.7,33.8,17.2,4.1,278.06,2021,2.07
3,Canadisk_hummer,shellfish,265.14,Spring,2639.3,57.7,24.8,6.0,699.78,2021,4.2
4,Dansk_10-armet_blæksprutte,fish,169.26,Spring,2812.6,95.1,35.9,29.5,476.06,2021,3.32


In [4]:
# Preview the bottom of the frame (last 5 rows)
df.tail(n=5)

Unnamed: 0,name,type,price_dk,season_availability,weight_g,length_cm,width_cm,height_cm,cost_dk,year,freight_charge
220,Sort_hummer,shellfish,101.21,Winter,3308.4,41.7,9.1,15.0,334.84,2025,2.29
221,Stenbider,fish,58.96,Autumn,3436.6,108.3,11.3,11.6,202.62,2025,4.64
222,Torsk,fish,275.41,Winter,4420.2,145.7,38.5,11.1,1217.37,2025,3.86
223,Ørred,fish,218.06,Summer,2372.6,84.9,23.3,14.3,517.37,2025,2.58
224,Østers,fish,249.65,Autumn,3629.1,143.2,2.3,29.0,906.0,2025,4.48


In [5]:
# Dimensions of the frame as a (rows, columns) tuple
df.shape

(225, 11)

In [6]:
# List every column label in the frame
df.columns

Index(['name', 'type', 'price_dk', 'season_availability', 'weight_g',
       'length_cm', 'width_cm', 'height_cm', 'cost_dk', 'year',
       'freight_charge'],
      dtype='object')

In [7]:
# Per-column summary: dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name                 225 non-null    object 
 1   type                 225 non-null    object 
 2   price_dk             225 non-null    float64
 3   season_availability  225 non-null    object 
 4   weight_g             225 non-null    float64
 5   length_cm            225 non-null    float64
 6   width_cm             225 non-null    float64
 7   height_cm            225 non-null    float64
 8   cost_dk              225 non-null    float64
 9   year                 225 non-null    int64  
 10  freight_charge       225 non-null    float64
dtypes: float64(7), int64(1), object(3)
memory usage: 19.5+ KB


In [8]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
# With default arguments only the numeric columns are included.
df.describe()

Unnamed: 0,price_dk,weight_g,length_cm,width_cm,height_cm,cost_dk,year,freight_charge
count,225.0,225.0,225.0,225.0,225.0,225.0,225.0,225.0
mean,167.486444,2530.842222,79.568444,19.980444,15.655111,432.692978,2023.0,3.273511
std,74.265985,1404.38536,41.61034,11.063868,8.240183,333.814721,1.417367,1.100139
min,32.87,103.3,10.1,2.2,1.1,7.3,2021.0,1.31
25%,101.88,1258.7,45.2,9.7,8.8,162.61,2022.0,2.35
50%,168.6,2534.3,80.5,20.0,16.0,365.69,2023.0,3.32
75%,236.46,3761.7,116.8,29.4,22.9,611.73,2024.0,4.2
max,298.84,4993.0,149.4,39.9,30.0,1322.23,2025.0,5.22


In [9]:
# Number of missing values per column
df.isna().sum()

name                   0
type                   0
price_dk               0
season_availability    0
weight_g               0
length_cm              0
width_cm               0
height_cm              0
cost_dk                0
year                   0
freight_charge         0
dtype: int64

In [10]:
# Number of distinct values per column
df.nunique()

name                    45
type                     2
price_dk               223
season_availability      5
weight_g               224
length_cm              210
width_cm               173
height_cm              162
cost_dk                224
year                     5
freight_charge         172
dtype: int64

In [11]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

In [12]:
# Data type of each column
df.dtypes

name                    object
type                    object
price_dk               float64
season_availability     object
weight_g               float64
length_cm              float64
width_cm               float64
height_cm              float64
cost_dk                float64
year                     int64
freight_charge         float64
dtype: object

### Clean the data

#### Convert data types
* name
* type
* season_availability

In [13]:
# One-hot encode the categorical columns into indicator columns.
# drop_first=True omits the first level of each column, which then acts as
# the implicit baseline category.
df = pd.get_dummies(
    data=df,
    columns=['name', 'type', 'season_availability'],
    drop_first=True,
)

#### Rename Columns
* freight_charge
* price_dk 
* cost_dk

In [14]:
# Rename the monetary columns so they share a `_kr` suffix.
# DataFrame.rename returns a NEW frame, so the result must be assigned back;
# otherwise `df` keeps the old column names in every later cell.
df = df.rename(
    columns={
        'freight_charge': 'freight_charge_kr',
        'price_dk': 'price_kr',
        'cost_dk': 'cost_kr',
    }
)
# Display the renamed frame to confirm the new column labels.
df

Unnamed: 0,price_kr,weight_g,length_cm,width_cm,height_cm,cost_kr,year,freight_charge_kr,name_Blåmusling,name_Canadisk_hummer,...,name_Sort_hummer,name_Stenbider,name_Torsk,name_Ørred,name_Østers,type_shellfish,season_availability_Autumn,season_availability_Spring,season_availability_Summer,season_availability_Winter
0,188.32,706.4,128.9,15.3,10.4,133.03,2021,2.60,False,False,...,False,False,False,False,False,False,False,True,False,False
1,202.72,2485.0,54.8,24.1,19.4,503.76,2021,1.50,True,False,...,False,False,False,False,False,True,False,True,False,False
2,73.92,3761.7,33.8,17.2,4.1,278.06,2021,2.07,False,False,...,False,False,False,False,False,True,False,False,True,False
3,265.14,2639.3,57.7,24.8,6.0,699.78,2021,4.20,False,True,...,False,False,False,False,False,True,False,True,False,False
4,169.26,2812.6,95.1,35.9,29.5,476.06,2021,3.32,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,101.21,3308.4,41.7,9.1,15.0,334.84,2025,2.29,False,False,...,True,False,False,False,False,True,False,False,False,True
221,58.96,3436.6,108.3,11.3,11.6,202.62,2025,4.64,False,False,...,False,True,False,False,False,False,True,False,False,False
222,275.41,4420.2,145.7,38.5,11.1,1217.37,2025,3.86,False,False,...,False,False,True,False,False,False,False,False,False,True
223,218.06,2372.6,84.9,23.3,14.3,517.37,2025,2.58,False,False,...,False,False,False,True,False,False,False,False,True,False


### Transform the data

### Explore the data

### Visualize key features

### Handle outliers

### Scale the data 

### Engineer features

### Save the cleaned dataset