In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 11428: Artificial Intelligence Course Assignment
# Iris Flower Data Visualization and Clustering Using Self-Organized Map (SOM)

The Iris Flower Dataset is used in this assignment. It contains 150 rows of data, 50 for each species of Iris flower: Iris Setosa, Iris Versicolor and Iris Virginica. The features examined were the petal and sepal dimensions (length and width).

> This is what the first five rows of the dataset look like:

In [None]:
# Load the Iris dataset from the Kaggle input directory.
train = pd.read_csv('../input/iris-flower-dataset/IRIS.csv')

# Missing values can be checked with train.isnull().sum(); describe() only
# reports the non-null count of each numeric column.

# Preview the first five rows. Leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of plain printed text.
train.head()

# Data Visualization:

**Horizontal bar plot showing the even split of the data (50 entries for each species):**

In [None]:
# Count the rows per species, then draw the counts as horizontal bars
# coloured with seaborn's "deep" palette.
species_counts = train['species'].value_counts()
bar_colors = sns.color_palette("deep")
ax = species_counts.plot(kind='barh', figsize=(6, 6), color=bar_colors)

**Encoding to convert "species" categorical data into numerical data:**

In [None]:
# Integer code for each species name. A hand-written dictionary is enough here
# because the number of categories is small.
iris_categories = {
    'Iris-setosa': 1,
    'Iris-versicolor': 2,
    'Iris-virginica': 3,
}

# Encode the species column. Values that are not keys of the dictionary are
# passed through unchanged, so re-running this cell on an already encoded
# column keeps the codes. A plain .map(iris_categories) would replace every
# value with NaN on the second run, because the integer codes are not keys.
train['species'] = train['species'].map(lambda label: iris_categories.get(label, label))

# Alternative for datasets with many categories: scikit-learn's OrdinalEncoder,
# which would encode the three species as 0.0, 1.0 and 2.0:
#   from sklearn.preprocessing import OrdinalEncoder
#   train[['species']] = OrdinalEncoder().fit_transform(train[['species']])

**A count plot for species after encoding:**

In [None]:
# Bar chart of the number of rows per encoded species value.
ax = sns.countplot(data=train, x="species")

**A pie plot to show the color representation of each species used in the preceding plots:**

In [None]:
# Pie chart of the species counts.
species_counts = train['species'].value_counts()
ax = species_counts.plot(kind='pie', legend=True, figsize=(6, 6))

# Label the wedges with species names. The names are looked up from the index
# of the plotted counts (code -> name), so each label follows its own wedge
# rather than relying on the dictionary's key order matching the plot order.
code_to_name = {code: name for name, code in iris_categories.items()}
ax.legend([code_to_name.get(code, code) for code in species_counts.index])

**To extract important information from our data, and understand the correlations, the following scatter plots are implemented:**


1. The data position with regard to petal length & petal width
2. The data position with regard to sepal length & sepal width
3. The data position with regard to petal width & sepal width
4. The data position with regard to petal length & sepal length


**1. The data position with regard to petal length & petal width**

In [None]:
# pyplot is imported here for the plotting cells of the notebook.
import matplotlib.pyplot as plt

# Petal width against petal length, coloured by the encoded species value.
ax = train.plot(kind="scatter", x="petal_width", y="petal_length", alpha=0.4,
                figsize=(10, 7), c="species",
                cmap=sns.color_palette("flare", as_cmap=True), colorbar=True)

# A bare plt.legend() has no labelled artists to show for this plot, so the
# legend entries are built from the scatter's own colour mapping instead
# (one entry per species code).
handles, labels = ax.collections[0].legend_elements()
ax.legend(handles, labels, title="species")

Scatter plot 1 shows that:


>     Species 1 has the smallest petal size (length and width).
> 
>     Species 2 has a large petal size.
> 
>     Species 3 has the largest petal size.
> 
>     Species 3 has the most variations in petal size.

**Conclusion: the correlation between petal size and species is strong.**

**2. The data position with regard to sepal length & sepal width**

In [None]:
# Sepal width against sepal length, coloured by the encoded species value.
ax = train.plot(kind="scatter", x="sepal_width", y="sepal_length", alpha=0.4,
                figsize=(10, 7), c="species",
                cmap=sns.color_palette("flare", as_cmap=True), colorbar=True)

# A bare plt.legend() has no labelled artists to show for this plot, so the
# legend entries are built from the scatter's own colour mapping instead
# (one entry per species code).
handles, labels = ax.collections[0].legend_elements()
ax.legend(handles, labels, title="species")

Scatter plot 2 shows that:

>     Species 1 has a generally small sepal size (length and width).
>     
>     Species 2 and 3 have similar and variant sepal sizes, sometimes smaller than in species 1.
> 

**Conclusion: the correlation between sepal size and species is weak.**


**3. The data position with regard to petal width & sepal width**

In [None]:
# Petal width against sepal width, coloured by the encoded species value.
ax = train.plot(kind="scatter", x="petal_width", y="sepal_width", alpha=0.4,
                figsize=(10, 7), c="species",
                cmap=sns.color_palette("flare", as_cmap=True), colorbar=True)

# A bare plt.legend() has no labelled artists to show for this plot, so the
# legend entries are built from the scatter's own colour mapping instead
# (one entry per species code).
handles, labels = ax.collections[0].legend_elements()
ax.legend(handles, labels, title="species")

Scatter plot 3 shows that:

>     Species 1 has a small petal width and large sepal width.
>     
>     Species 2 and 3 have similar and variant sepal sizes, but species 3 has a larger petal size.
> 

**Conclusion: the weak correlation in sepal size can be strengthened with information about petal size.**


**4. The data position with regard to petal length & sepal length**

In [None]:
# Petal length against sepal length, coloured by the encoded species value.
# (The stray trailing comma after the plot call, which turned the statement
# into a one-element tuple, has been removed.)
ax = train.plot(kind="scatter", x="petal_length", y="sepal_length", alpha=0.4,
                figsize=(10, 7), c="species",
                cmap=sns.color_palette("flare", as_cmap=True), colorbar=True)

# A bare plt.legend() has no labelled artists to show for this plot, so the
# legend entries are built from the scatter's own colour mapping instead
# (one entry per species code).
handles, labels = ax.collections[0].legend_elements()
ax.legend(handles, labels, title="species")

Scatter plot 4 shows that:

    Species 1 has a small sepal length and small petal length.
    
    Species 2 and 3 have similar and variant sepal lengths, but species 3 has a larger petal length.


**Conclusion: Scatter plots 3 and 4 show that petal size brings clearer information.**

# Correlation Matrix

In order to test the conclusions of the scatter plot,
a correlation matrix shows that -indeed- petal dimensions have stronger correlations than sepal dimensions.


In [None]:
# Pairwise correlation of all columns; show how each one correlates with the
# encoded species, strongest positive correlation first.
corr_matrix = train.corr()
species_corr = corr_matrix["species"]
species_corr.sort_values(ascending=False)

In order to make the sepal dimension data more useful, the following two relationships yielded better correlations:
    
>     1. Ratio of petal width to sepal width.
>     2. Ratio of petal length to sepal length. 
     
Therefore we can drop sepal_width and sepal_length and replace them with the two ratios above.


In [None]:
# Add the two petal-to-sepal ratio features (width ratio first, then length ratio).
ratio_features = {
    'petal_width:sepal_width': train['petal_width'] / train['sepal_width'],
    'petal_length:sepal_length': train['petal_length'] / train['sepal_length'],
}
for column_name, ratio in ratio_features.items():
    train[column_name] = ratio

**Correlation matrix after data manipulation:**

In [None]:
# Remove both raw sepal measurements in a single call.
train = train.drop(columns=["sepal_width", "sepal_length"])

# Recompute the correlations on the reduced frame and list them against the
# encoded species, strongest positive correlation first.
corr_matrix = train.corr()
species_corr = corr_matrix["species"]
species_corr.sort_values(ascending=False)

In [None]:
# Keep the species labels in their own variable and leave only the feature
# columns in `train`.
species, train = train['species'], train.drop(columns=["species"])

# Data Visualization After Manipulation

Histograms show distinctions that might help us categorize species more efficiently.
A histogram tested for sepal dimensions only did not show these distinctions, therefore we are moving in the right direction.

In [None]:
# One histogram per remaining feature column, 25 bins each, on a 10x10 inch figure.
train.hist(figsize=(10, 10), bins=25)

# Training for Unsupervised Learning with Self-Organizing Maps

In [None]:
# The features are standardised before they are fed to the Self-Organizing Map
# (SOM), an important preprocessing step for the algorithm.

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the feature matrix and transform it in one step.
scaler = StandardScaler()
trainSt = scaler.fit_transform(train.values)

In [None]:
# A python library for SOM 

!pip install SimpSOM

import SimpSOM as sps

# Building a 50x50 network and initialising its weights with PCA

net = sps.somNet(50, 50, trainSt, PBC=True, PCI=True)

# Training it with 0.1 learning rate for 1500 epochs 

net.train(0.1, 1500)

# Saving weights to a file 

net.save('iris_weights')

# Displaying the map of the weights differences between nodes, to identify cluster centers 

net.nodes_graph(colnum=0)
net.diff_graph(show=True,printout=True)

# Clustering of Iris Flower Data

In [None]:
# Map the first 150 standardised samples onto the trained 2D network.
samples_to_project = trainSt[:150]
projData = net.project(samples_to_project)

In [None]:
# Saving a version of the nodes difference map

from PIL import Image, ImageChops

def autocrop(fileName):
    """Crop the image stored at ``fileName`` down to its non-background content.

    A fixed pre-crop with box (left=0, upper=100, right=2900, lower=image
    height) is applied first. The colour of the top-left pixel of the result is
    then treated as the background, and the image is trimmed to the bounding
    box of everything that differs from it.

    Returns the trimmed PIL image. When no bounding box is found (nothing
    differs enough from the background), the pre-cropped image is returned
    as is rather than None, so callers can always call ``.save`` on the result.
    """
    im = Image.open(fileName)
    im = im.crop((0, 100, 2900, im.size[1]))
    # Solid image filled with the colour of the top-left pixel (the assumed background).
    bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
    diff = ImageChops.difference(im, bg)
    # (diff + diff) / 2.0 - 100: lowers the difference by 100 so that small
    # deviations from the background clip to zero and do not count as content.
    diff = ImageChops.add(diff, diff, 2.0, -100)
    bbox = diff.getbbox()
    if bbox:
        return im.crop(bbox)
    return im
    
# Crop the saved nodes-difference map and write the result next to it.
source_png = 'nodesDifference.png'
cropped = autocrop(source_png)
cropped.save('cropped.png')

In [None]:
# Preparing a plotly graph for styled clustering of Iris data:

%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

trace0 = go.Scatter(
    x = [x for x,y in projData],
    y = [y for x,y in projData],
#    name = labels,
    hovertext = [str(n) for n in species],
    text = [str(n) for n in species],
    mode = 'markers',
    marker = dict(
        size = 8,
        color = species,
        colorscale ='Jet',
        showscale = False,
        opacity = 1,
    ),
    showlegend = False
)

data = [trace0]

layout = go.Layout(
    images= [dict(
                  source= "cropped.png",
                  xref= "x",
                  yref= "y",
                  x= -0.5,
                  y= 39.5*2/np.sqrt(3)*3/4,
                  sizex= 40.5,
                  sizey= 40*2/np.sqrt(3)*3/4,
                  sizing= "stretch",
                  opacity= 0.5,
                  layer= "below")],
    width = 800,
    height = 800,
    hovermode= 'closest',
    xaxis= dict(
        range=[-1,41],
        zeroline=False,
        showgrid=False,
        ticks='',
        showticklabels=False
    ),
    yaxis=dict(
        range=[-1,41],
        zeroline=False,
        showgrid=False,
        ticks='',
        showticklabels=False
    ),
    showlegend= True
)


fig = dict(data=data, layout=layout)
py.iplot(fig, filename='styled-scatter')

# Conclusion

We can see from the scatter plot that the three different species have clustered into groups. 

Blue being Iris Setosa, 

Green being Iris Versicolor,

and Red being Iris Virginica.