# Exploratory Data Analysis on IRIS Dataset

### This notebook explores the Iris Dataset using seaborn, matplotlib and plotly.

![](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Machine+Learning+R/iris-machinelearning.png)

**The Iris Dataset contains four features (length and width of sepals and petals) for 50 samples of each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor), 150 samples in total.**


### **Attribute Information:**

1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:
 * Iris Setosa
 * Iris Versicolor
 * Iris Virginica 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objects as plotly
import plotly.express as px
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 

In [None]:
# Global seaborn look: white background with grid lines and the tab10 colour cycle.
sns.set_style(style="whitegrid")
sns.set_palette(palette="tab10")

In [None]:
# Default figure size (in inches) and base font size for all matplotlib plots below.
plt.rcParams.update({"figure.figsize": (10, 8), "font.size": 17})

In [None]:
# Load the Iris CSV from the Kaggle input directory into a DataFrame.
iris = pd.read_csv(filepath_or_buffer="/kaggle/input/iris-flower-dataset/IRIS.csv")

In [None]:
# Report the dimensions of the dataset.
n_rows, n_cols = iris.shape
print('The dataset has', n_rows, 'rows and', n_cols, 'columns.')

In [None]:
# List the column names of the dataset.
print('Column Names:', list(iris.columns))

In [None]:
# Basic information about the dataset, as printed by DataFrame.info().
iris.info()

In [None]:
# Data type of each column.
iris.dtypes

In [None]:
# Preview the first five rows.
iris.head(n=5)

In [None]:
# Summary statistics, transposed so that each row is one column of the dataset.
iris.describe().transpose()

In [None]:
# Compact summary table: mean, quartiles and range (max - min) of each column.
summary = iris.describe().T
outfields = ['mean', '25%', '50%', '75%', 'range']
stats = (
    summary
    .assign(range=summary['max'] - summary['min'])  # spread of the values in each column
    [outfields]
    .rename(columns={'50%': 'median'})  # the 50% quantile is the median
)
stats

In [None]:
# Drop the 'Iris-' prefix from the species labels.
# Equivalent alternative: iris['species'].apply(lambda x: x.replace('Iris-', ''))
iris['species'] = iris['species'].str.replace('Iris-', '')
iris.head()

In [None]:
# Number of rows for each species label.
iris['species'].value_counts()

In [None]:
# Mean of every measurement within each species.
iris.groupby(by='species').mean()

In [None]:
# Median of every measurement within each species.
iris.groupby(by='species').median()

In [None]:
# Mean, median and sum of every measurement per species, in a single table.
# Only the last expression of a cell is displayed, so the aggregations are
# combined into one call instead of two statements (the first of which was
# computed and silently discarded).
# String names are used because passing NumPy callables such as np.mean to
# groupby().agg() is deprecated in recent pandas releases.
iris.groupby('species').agg(['mean', 'median', 'sum'])

In [None]:
# Bar chart of the number of rows per species.
sns.countplot(x='species', data=iris, edgecolor='black')

In [None]:
# Length against width for sepals and petals, drawn as markers on shared axes.
for part in ('sepal', 'petal'):
    plt.plot(iris[f'{part}_length'], iris[f'{part}_width'], ls='', marker='o', label=part)
plt.legend()


In [None]:
# Histogram of petal length using 32 bins.
ax = plt.axes()
ax.hist(iris['petal_length'], bins=32, edgecolor='black')
ax.set(title='Distribution of petal length')

In [None]:
# Semi-transparent histograms of the columns, overlaid on one set of axes.
ax = iris.plot(kind='hist', bins=25, alpha=0.5)
ax.set(xlabel='Size (cm)')

In [None]:
# One histogram per column, arranged in a grid of subplots.
axLst = iris.hist(bins=25, grid=False, edgecolor='black')

# Label only the outer edge of the grid. The grid is indexed directly because
# the Axes.is_last_row() / Axes.is_first_col() helpers were removed in
# Matplotlib 3.6 and raise AttributeError on current releases.
for bottom_ax in axLst[-1, :]:
    bottom_ax.set_xlabel('Size (cm)')
for left_ax in axLst[:, 0]:
    left_ax.set_ylabel('Frequency')

In [None]:
# Horizontal bar chart of the sepal width of the first ten samples.
fig, ax = plt.subplots()
positions = np.arange(10)
ax.barh(positions, iris.sepal_width.iloc[:10], edgecolor='black')

# barh centres each bar on its y position, so the ticks go on those same
# positions; the previous 0.4 offset left the labels off-centre.
ax.set_yticks(positions)
ax.set_yticklabels(positions + 1)  # label the samples 1..10

# The y axis enumerates samples, it is not a count.
ax.set(xlabel='Width', ylabel='Sample number', title='Bar plot (Sepal Width)')

In [None]:
# Per-species means again, as a reference for the line plot in the next cell.
iris.groupby(by='species').mean()

In [None]:
# Line plot of the per-species mean of each measurement.
species_means = iris.groupby('species').mean()
species_means.plot(title='Mean Plot', color=['red', 'green', 'blue', 'black'])

# PairPlot

In [None]:
# Lower-triangle pair plot of all measurements, coloured by species.
sns.set(font_scale=1.3)
sns.pairplot(data=iris, hue='species', corner=True)

In [None]:
# Pair grid on a dark theme: scatter plots above the diagonal, KDE plots
# below it and per-species KDE curves on the diagonal.
sns.set_theme(style="dark")
g = sns.PairGrid(data=iris, hue='species', diag_sharey=False)
g.map_upper(sns.scatterplot, s=15)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=2)

From the above pairplots we can conclude that the features of setosa are pretty different from the other two species.

In [None]:
# Hexbin joint plot of sepal length (x) against sepal width (y).
sns.set_style()
sns.jointplot(data=iris, x='sepal_length', y='sepal_width', kind='hex')

In [None]:
# Per-species histograms, one row of facets per sepal measurement.
sns.set_style('darkgrid')
for column, colour in (('sepal_width', 'green'), ('sepal_length', 'blue')):
    plot = sns.FacetGrid(iris, col='species', margin_titles=True)
    plot.map(plt.hist, column, color=colour)
# Leave the last grid as the cell's value, as the original cell did.
plot

# Boxplot

In [None]:
# Box plots of the columns, grouped by species (pandas plotting).
iris.boxplot(by='species')

In [None]:
# 2x2 grid of box plots, one measurement per panel, split by species.
fig, axes = plt.subplots(2, 2, figsize=(17, 11))
features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
# axes.flat walks the grid row by row, matching the order of `features`.
for panel, feature in zip(axes.flat, features):
    last_panel = sns.boxplot(ax=panel, data=iris, x='species', y=feature)
# Leave the last Axes as the cell's value, as the original cell did.
last_panel

In [None]:
# Reshape to long form (species, measurement, size) so that all measurements
# can be drawn in a single grouped plot.
iris_long = (
    iris.set_index('species')
    .stack()
    .rename('size')
    .reset_index()
    .rename(columns={'level_1': 'measurement'})
)

In [None]:
# Grouped box plot: one group per measurement, one box per species.
sns.boxplot(x='measurement', y='size', hue='species', data=iris_long)

# Correlation Plot

In [None]:
# Pairwise correlations of the numeric measurements, reshaped to long form
# (one row per pair of columns) so they can be drawn as a bubble matrix.
# The string 'species' column is excluded explicitly: DataFrame.corr() no
# longer drops non-numeric columns silently and raises on them in pandas >= 2.0.
corr_mat = (
    iris.select_dtypes(include='number')
    .corr()
    .stack()
    .reset_index(name='correlation')
)

# Bubble colour and size both encode the correlation value.
g = sns.relplot(
    data=corr_mat,
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="vlag", hue_norm=(-1, 1), height=8, sizes=(35, 160)
)
g.set(xlabel="", ylabel="", aspect="equal", title='Correlation Plot')

In [None]:
# Annotated heatmap of the correlations between the numeric measurements.
# select_dtypes drops the string 'species' column, which DataFrame.corr()
# rejects in pandas >= 2.0.
sns.heatmap(iris.select_dtypes(include='number').corr(), annot=True)

# Violin Plot

In [None]:
# 2x2 grid of violin plots, one measurement per panel, split by species.
fig, axes = plt.subplots(2, 2, figsize=(17, 11))
features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
# axes.flat walks the grid row by row, matching the order of `features`.
for panel, feature in zip(axes.flat, features):
    last_panel = sns.violinplot(ax=panel, data=iris, x='species', y=feature)
# Leave the last Axes as the cell's value, as the original cell did.
last_panel

In [None]:
# Grouped violin plot: one group per measurement, one violin per species.
sns.violinplot(x='measurement', y='size', hue='species', data=iris_long)

# Plotly Express

* ### Scatter Plots

In [None]:
# Interactive scatter of sepal width (x) against sepal length (y), coloured by species.
px.scatter(data_frame=iris, x='sepal_width', y='sepal_length', color='species')

In [None]:
# Interactive scatter of petal width (x) against petal length (y), coloured by species.
px.scatter(data_frame=iris, x='petal_width', y='petal_length', color='species')

* ### 3D Scatter Plot

In [None]:
# Interactive 3D scatter of sepal length, sepal width and petal width, coloured by species.
fig = px.scatter_3d(
    data_frame=iris,
    x='sepal_length',
    y='sepal_width',
    z='petal_width',
    color='species',
)
fig.show()

* ### Parallel Coordinates

In [None]:
# The parallel-coordinates plot is coloured by a numeric column, so the species
# labels are encoded as integers. The encoding is applied to a copy:
# overwriting iris['species'] in place would turn the column into NaN if this
# cell were run a second time, and would leave every other cell that expects
# the string labels with numeric codes instead.
species_codes = {'setosa': 1, 'virginica': 2, 'versicolor': 3}
iris_encoded = iris.assign(species=iris['species'].map(species_codes))
px.parallel_coordinates(iris_encoded, color="species")

### From the above plot we can conclude that petals can be used to distinguish the species.