In [None]:
import numpy as np
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Keep cell output readable by hiding FutureWarning messages raised by the
# libraries used in this notebook.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Global seaborn look: "white" style, with matplotlib's one-letter colour
# codes ("b", "g", "r", ...) remapped to the seaborn palette.
sns.set(color_codes=True, style="white")

In [None]:
import pandas as pd

# Read the Iris measurements from a CSV file resolved relative to the
# notebook's working directory; the later cells all work from this DataFrame.
dataset = pd.read_csv(filepath_or_buffer='Iris.csv')

In [None]:
# Show the loaded DataFrame as this cell's output.
dataset

In [None]:
# Preview the first 60 rows of the DataFrame.
dataset.head(n=60)

In [None]:
# Preview rows at positions 60 through 89 (the stop position is exclusive).
dataset.iloc[60:90]

In [None]:
# Preview the last 60 rows of the DataFrame.
dataset.tail(n=60)

In [None]:
# Split the DataFrame into one sub-frame per species using positional row
# slices.
# NOTE(review): assumes Iris.csv holds 150 rows ordered as 50 setosa, then
# 50 versicolor, then 50 virginica -- confirm against the file.
# iloc slices are half-open [start, stop), so consecutive groups must share
# their boundary. Starting the later slices at 51 and 101 silently dropped
# rows 50 and 100, leaving two species with 49 rows each.
iris_setosa_data = dataset.iloc[0:50, :]
iris_versicolor_data = dataset.iloc[50:100, :]
iris_virginica_data = dataset.iloc[100:150, :]

<h1> Mean, Variance, Standard Deviation </h1>

<h2> Mean </h2>

The mean is a measure of <b>CENTRAL TENDENCY</b>: the <b> AVERAGE VALUE </b> of a set of given observations. The mathematical definition of the mean is:

$$\bar{X} = \frac{\sum_{i=1}^{n} x_{i}}{n}$$

In [None]:
print("MEANS")
# One line per species: the average petal length followed by the species tag.
for species_tag, species_frame in (
    ("--setosa", iris_setosa_data),
    ("--versicolor", iris_versicolor_data),
    ("--virginica", iris_virginica_data),
):
    print(np.mean(species_frame['PetalLengthCm']), species_tag)

<h3> Observations </h3>
<p><b> MEAN </b> </p>
<p> The mean alone already supports a first pass of EDA. For example, just by looking at the mean petal length of each species, we can tell that <b>Iris Setosa</b> has a much smaller petal length on average when compared to <b> Iris Versicolor</b> and <b>Iris Virginica</b>. </p>

In [None]:
# Mean with an outlier: append a single extreme value (50) to the setosa
# petal lengths and recompute the mean.
setosa_plus_outlier = np.append(iris_setosa_data['PetalLengthCm'], 50)
print(np.mean(setosa_plus_outlier))


<p> Even though all 50 genuine values say that the petal length of setosa flowers is around 1.464 cm, a single wrong data point is enough to shift the mean wildly. </p>

<p> Such errors can happen because of human mistakes, data corruption, or other reasons. Data points like this are called <b>"OUTLIERS"</b>. </p>

<h2>Variance</h2>

<p>Variance represents the spread of the given observations. It is the average squared distance of the observations from the mean.</p>
<p> The formula for the variance of a population is given by : </p>

$$s^{2} = \frac{SS}{N} = \frac{\sum (x_{i} - \bar{x})^{2}}{N}$$

<p> Where </p>
<ul>
    <li>$SS$ is the sum of squared deviations from the mean (the "sum of squared errors")</li>
    <li>$N$ is the number of observations in the group</li>
    <li>$x_i$ is the $i^{th}$ observation in the group</li>
    <li>$\bar{x}$ is the mean of the group </li>
</ul>
    
    
    
        

In [None]:
print("VARIANCE")
# One line per species: the petal-length variance followed by the species tag.
for species_tag, species_frame in (
    ("--setosa", iris_setosa_data),
    ("--versicolor", iris_versicolor_data),
    ("--virginica", iris_virginica_data),
):
    print(np.var(species_frame['PetalLengthCm']), species_tag)

In [None]:
# Variance with an outlier: append a single extreme value (50) to the setosa
# petal lengths and recompute the variance.
setosa_plus_outlier = np.append(iris_setosa_data['PetalLengthCm'], 50)
print(np.var(setosa_plus_outlier))

<h2> Standard Deviation </h2>

In [None]:
# Overlay the petal-length distribution of every species (one colour per
# species) as a density-normalised histogram with a KDE curve on top.
# sns.distplot is deprecated and scheduled for removal; seaborn's documented
# axes-level replacement is sns.histplot, where kde=True adds the density
# curve and stat="density" normalises the bars the way distplot did.
sns.FacetGrid(dataset, hue="Species", height=5)\
.map(sns.histplot, "PetalLengthCm", kde=True, stat="density")\
.add_legend()

print("In the following graph, the standard deviation shows the spread of the graph")

<p> Standard deviation is the square root of the variance. It represents the typical (root-mean-square) deviation of the observations from the mean value, expressed in the same units as the data. </p>

$$s = \sqrt{s^{2}} = \sqrt{\frac{SS}{N}} = \sqrt{\frac{\sum (x_{i} - \bar{x})^{2}}{N}}$$

<p> Where </p>
<ul>
    <li>$s^2$ is the variance</li>
    <li>$SS$ is the sum of squared deviations from the mean (the "sum of squared errors")</li>
    <li>$N$ is the number of observations in the group</li>
    <li>$x_i$ is the $i^{th}$ observation in the group</li>
    <li>$\bar{x}$ is the mean of the group </li>
</ul>
    

In [None]:
print("STANDARD DEVIATION")
# One line per species: the petal-length standard deviation followed by the
# species tag.
for species_tag, species_frame in (
    ("--setosa", iris_setosa_data),
    ("--versicolor", iris_versicolor_data),
    ("--virginica", iris_virginica_data),
):
    print(np.std(species_frame['PetalLengthCm']), species_tag)

In [None]:
# Standard deviation with an outlier: append a single extreme value (50) to
# the setosa petal lengths and recompute.
setosa_plus_outlier = np.append(iris_setosa_data['PetalLengthCm'], 50)
print(np.std(setosa_plus_outlier))

<h1> Median, Percentiles, Quantiles, IQR</h1>

<h3> Median </h3>
<p> Median is the statistical measure that determines the middle value of a dataset listed in ascending order. This measure divides the lower half from the upper half of the dataset. </p>

In [None]:
print("MEDIANS")
# One line per species: the median petal length followed by the species tag.
for species_tag, species_frame in (
    ("--setosa", iris_setosa_data),
    ("--versicolor", iris_versicolor_data),
    ("--virginica", iris_virginica_data),
):
    print(np.median(species_frame['PetalLengthCm']), species_tag)

<h2> Box-Plots</h2>

In [None]:
# One box per species, summarising that species' petal-length distribution.
sns.boxplot(data=dataset, x='Species', y='PetalLengthCm')
plt.show()

<h2> Multivariate Probability Density</h2>

In [None]:
# Joint view of petal length against petal width for the setosa rows only,
# with each variable's own distribution drawn on the margins.
sns.jointplot(data=iris_setosa_data, x="PetalLengthCm", y="PetalWidthCm")
plt.show()

<h2> Multivariate Probability Density using Contour Plots </h2>

In [None]:
# Filled KDE contours of petal length against petal width for the setosa rows.
# `shade` is deprecated in seaborn's KDE plots; `fill` is the supported
# spelling of the same option.
sns.jointplot(
    data=iris_setosa_data,
    x="PetalLengthCm",
    y="PetalWidthCm",
    kind="kde",
    cmap="Blues",
    fill=True,
)
plt.show()

In [None]:
# Multivariate density plots for the entire dataset, coloured by species.
# First as a plain scatter-style joint plot ...
sns.jointplot(data=dataset, x="PetalLengthCm", y="PetalWidthCm", hue="Species")
print("without contours")
plt.show()
# ... then as filled KDE contours. `shade` is deprecated in seaborn's KDE
# plots; `fill` is the supported spelling of the same option.
sns.jointplot(data=dataset, x="PetalLengthCm", y="PetalWidthCm", hue="Species", kind='kde', fill=True)
print("with contours")
plt.show()

<h3> Observations </h3>
<ul>
    <li>This gives us a clear idea about the joint distribution of multiple variables and the marginal distributions of individual variables. These plots can be used to get information about how two features depend on each other throughout the space. </li>
    <li> This gives a more precise idea about the density of the features when compared to a scatter plot. This is really useful when there are many overlapping points. </li>
</ul>

<h2> Correlation </h2>

In [None]:
# For iris setosa
# Scatter plots of petal length vs. width and sepal length vs. width.
# x / y are passed by keyword: recent seaborn releases accept only `data`
# positionally and raise TypeError for positional x / y vectors.
sns.scatterplot(x=iris_setosa_data['PetalLengthCm'], y=iris_setosa_data['PetalWidthCm'])
plt.show()
sns.scatterplot(x=iris_setosa_data['SepalLengthCm'], y=iris_setosa_data['SepalWidthCm'])
plt.show()

# Correlation heatmap of the measurements. 'Id' is dropped because it is a
# row identifier; non-numeric columns (the Species label) are excluded
# explicitly because pandas >= 2.0 no longer skips them inside corr().
no_id = iris_setosa_data.drop(['Id'], axis=1)
sns.heatmap(no_id.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# For iris versicolor
# Scatter plots of petal length vs. width and sepal length vs. width.
# x / y are passed by keyword: recent seaborn releases accept only `data`
# positionally and raise TypeError for positional x / y vectors.
sns.scatterplot(x=iris_versicolor_data['PetalLengthCm'], y=iris_versicolor_data['PetalWidthCm'])
plt.show()
sns.scatterplot(x=iris_versicolor_data['SepalLengthCm'], y=iris_versicolor_data['SepalWidthCm'])
plt.show()

# Correlation heatmap of the measurements. 'Id' is dropped because it is a
# row identifier; non-numeric columns (the Species label) are excluded
# explicitly because pandas >= 2.0 no longer skips them inside corr().
no_id = iris_versicolor_data.drop(['Id'], axis=1)
sns.heatmap(no_id.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# For iris virginica
# The cell was labelled "virginica" but only ever plotted the whole dataset.
# These are the same three views as the setosa and versicolor cells.
# x / y are passed by keyword: recent seaborn releases accept only `data`
# positionally and raise TypeError for positional x / y vectors.
sns.scatterplot(x=iris_virginica_data['PetalLengthCm'], y=iris_virginica_data['PetalWidthCm'])
plt.show()
sns.scatterplot(x=iris_virginica_data['SepalLengthCm'], y=iris_virginica_data['SepalWidthCm'])
plt.show()
# 'Id' is a row identifier; non-numeric columns (the Species label) are
# excluded explicitly because pandas >= 2.0 no longer skips them in corr().
no_id = iris_virginica_data.drop(['Id'], axis=1)
sns.heatmap(no_id.select_dtypes(include='number').corr(), annot=True)
plt.show()

# For the entire dataset (all species together) -- the views this cell
# originally produced, kept with their original axis choices.
sns.scatterplot(x=dataset['PetalLengthCm'], y=dataset['PetalWidthCm'])
plt.show()
sns.scatterplot(x=dataset['SepalWidthCm'], y=dataset['PetalLengthCm'])
plt.show()
no_id = dataset.drop(['Id'], axis=1)
sns.heatmap(no_id.select_dtypes(include='number').corr(), annot=True)
plt.show()

# Using Decision Tree Classifier

In [None]:
# Import Library for splitting data and accuracy
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Extracting Attributes / Features
# Every column except the last one (the label), minus 'Id'. 'Id' is a row
# identifier, not a measurement.
# NOTE(review): the rows appear to be grouped by species, in which case 'Id'
# alone would predict the label and inflate both accuracy figures -- confirm.
X = dataset.iloc[:, :-1].drop(columns=['Id'])

# Extracting Target / Class Labels (the last column)
y = dataset.iloc[:, -1]

# Creating Train and Test datasets: 75% train / 25% test, with a fixed seed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50, test_size=0.25)

# Creating Decision Tree Classifier
# random_state is fixed because the tree builder breaks ties between equally
# good splits at random; without it the reported scores can vary per run.
clf = DecisionTreeClassifier(random_state=50)
clf.fit(X_train, y_train)

# Predict Accuracy Score on the data the tree was fitted on and on the
# held-out test rows.
y_pred = clf.predict(X_test)
print("Train data accuracy:",accuracy_score(y_true = y_train, y_pred=clf.predict(X_train)))
print("Test data accuracy:",accuracy_score(y_true = y_test, y_pred=y_pred))