In [13]:
######################################################
##Transforming features for better clustering#########
######################################################
# Feature standardization improves clustering
import warnings
warnings.filterwarnings('ignore')

# samples is an array giving measurements of fish. 
# Each row represents an individual fish. 
# The measurements, such as weight in grams, length in centimeters, 
# and the percentage ratio of height to length, have very different scales. 
# In order to cluster this data effectively, you'll need to standardize these features first. 
# You'll build a pipeline to standardize and cluster the data.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the raw table. The file has no header row; column 0 holds the species
# name and the remaining columns hold the numeric measurements.
fish_raw = pd.read_csv("fish.csv", header=None)

print(fish_raw.head(), '\n')

# The same rows as a NumPy array. It is object-dtype at this point because
# the species strings share the array with the numbers.
print(fish_raw.to_numpy()[:5], '\n')

# Drop the species column *before* building the array and convert explicitly
# to float, so downstream numeric code receives a proper float64 matrix
# instead of relying on implicit coercion of an object-dtype array.
samples = fish_raw.iloc[:, 1:].to_numpy(dtype=float)
print(samples[:5])
# samples is the 2D float array of fish measurements (one row per fish).


       0      1     2     3     4     5     6
0  Bream  242.0  23.2  25.4  30.0  38.4  13.4
1  Bream  290.0  24.0  26.3  31.2  40.0  13.8
2  Bream  340.0  23.9  26.5  31.1  39.8  15.1
3  Bream  363.0  26.3  29.0  33.5  38.0  13.3
4  Bream  430.0  26.5  29.0  34.0  36.6  15.1 

[['Bream' 242.0 23.2 25.4 30.0 38.4 13.4]
 ['Bream' 290.0 24.0 26.3 31.2 40.0 13.8]
 ['Bream' 340.0 23.9 26.5 31.1 39.8 15.1]
 ['Bream' 363.0 26.3 29.0 33.5 38.0 13.3]
 ['Bream' 430.0 26.5 29.0 34.0 36.6 15.1]] 

[[242.0 23.2 25.4 30.0 38.4 13.4]
 [290.0 24.0 26.3 31.2 40.0 13.8]
 [340.0 23.9 26.5 31.1 39.8 15.1]
 [363.0 26.3 29.0 33.5 38.0 13.3]
 [430.0 26.5 29.0 34.0 36.6 15.1]]


In [14]:
########################################################
# Scaling fish data for clustering
########################################################

# Perform the necessary imports
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Seed for the k-means initialisation. Without it the cluster numbering (and
# occasionally the assignments) change from one run to the next, so the
# cross-tabulation could not be reproduced.
SEED = 42

# Create scaler: scaler
# Standardizes each feature to zero mean / unit variance so that no single
# measurement dominates the Euclidean distances used by k-means.
scaler = StandardScaler()

# Create KMeans instance: kmeans
# n_init is given explicitly: its default changed from 10 to 'auto' in newer
# scikit-learn releases (with a FutureWarning in between), so pinning it keeps
# the number of restarts the same on every version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=SEED)

# Create pipeline: pipeline that chains scaler and kmeans
pipeline = make_pipeline(scaler, kmeans)

# Now that you've built the pipeline, 
# you'll use it to cluster the fish by their measurements.
    

In [15]:
########################################################
# Clustering the fish data
########################################################

# You'll now use your standardization and clustering pipeline 
# to cluster the fish by their measurements, 
# and then create a cross-tabulation to compare the cluster labels with the fish species.

# the species of every fish sample is given by the list species.
# Run-length description of the species column: the fish are listed grouped
# by species, so each (name, count) pair expands to `count` consecutive labels.
species = [
    name
    for name, count in (('Bream', 34), ('Roach', 20), ('Smelt', 14), ('Pike', 17))
    for _ in range(count)
]


In [16]:
# Learn the scaling parameters and the cluster centres from the measurements,
# then assign every fish to its cluster: labels
pipeline.fit(samples)
labels = pipeline.predict(samples)

# One row per fish, pairing its cluster label with its known species: df
df = pd.DataFrame(dict(labels=labels, species=species))

# Count the fish for every (cluster label, species) combination: ct
ct = pd.crosstab(index=df['labels'], columns=df['species'])

# Show the table
print(ct)

# It looks like the fish data separates really well into 4 clusters!


species  Bream  Pike  Roach  Smelt
labels                            
0           33     0      1      0
1            0     0      0     13
2            0    17      0      0
3            1     0     19      1
