# Fish type clustering

![fish](https://t3.ftcdn.net/jpg/04/36/80/36/360_F_436803668_6JDr0opcxWT6j6vD65BAxKRJJ6dwH8sE.jpg)

Source:
http://jse.amstat.org/jse_data_archive.html

## Import modules

In [10]:
# Perform the necessary imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Import Normalizer
from sklearn.preprocessing import Normalizer
# Import pandas
import pandas as pd

# Import PCA
from sklearn.decomposition import PCA

## Dataset

In [11]:
# Read Dataset as csv file
fish_df = pd.read_csv('fish.csv')

fish_df.columns = ['species', 'weight', 'length1', 'length2', 'length3', 'Height%','Width%']

# print Dataset
print(fish_df.head())
print(fish_df['species'].unique())

X_fish_df = fish_df.drop('species', axis=1)
species = fish_df['species']

samples = X_fish_df.to_numpy()

  species  weight  length1  length2  length3  Height%  Width%
0   Bream   290.0     24.0     26.3     31.2     40.0    13.8
1   Bream   340.0     23.9     26.5     31.1     39.8    15.1
2   Bream   363.0     26.3     29.0     33.5     38.0    13.3
3   Bream   430.0     26.5     29.0     34.0     36.6    15.1
4   Bream   450.0     26.8     29.7     34.7     39.2    14.2
['Bream' 'Roach' 'Smelt' 'Pike']


## Scaling

In [12]:
# Create scaler: scaler
scaler = StandardScaler()

# Create KMeans instance: kmeans
kmeans = KMeans(n_clusters=4)

# Create pipeline: pipeline
pipeline = make_pipeline(scaler, kmeans)

## Clustering

In [13]:
# Fit the pipeline to samples
pipeline.fit(samples)

# Calculate the cluster labels: labels
labels = pipeline.predict(samples)

# Create a DataFrame with labels and species as columns: df
df = pd.DataFrame({'labels': labels, 'species': species})

# Create crosstab: ct
ct = pd.crosstab(df['labels'], df['species'])

# Display ct
print(ct)


species  Bream  Pike  Roach  Smelt
labels                            
0            0    17      0      0
1           33     0      1      0
2            0     0      0     13
3            0     0     19      1




## Dimension reduction

In [14]:
scaler = StandardScaler()
scaled_samples = scaler.fit_transform(samples)

# Create a PCA model with 2 components: pca
pca = PCA(n_components=2)

# Fit the PCA instance to the scaled samples
pca.fit(scaled_samples)

# Transform the scaled samples: pca_features
pca_features = pca.transform(scaled_samples)

# Print the shape of pca_features
print(pca_features.shape)


(84, 2)


## Result
The fish data separates really well into 4 clusters!