In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file under the Kaggle input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Context**

The ***Iris flower data*** set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper *The use of multiple measurements in taxonomic problems*. It is sometimes called Anderson's Iris data set because Edgar Anderson collected the data to quantify the morphologic variation of Iris flowers of three related species. The data set consists of 50 samples from each of three species of Iris (*Iris setosa*, *Iris virginica*, and *Iris versicolor*). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters.

**Content**

The dataset contains 150 records with 5 attributes: petal length, petal width, sepal length, sepal width, and class (species).

In [None]:
import pandas as pd

# Read the Iris CSV from the Kaggle input directory into a DataFrame.
iris_csv_path = "../input/iris-flower-dataset/IRIS.csv"
data = pd.read_csv(iris_csv_path)

In [None]:
# Show the loaded DataFrame as this cell's output.
data

**Data Dictionary:** <hr>
1) row ID. <br>
2) sepal length & width = length and width of the sepal, in cm. <br>
3) petal length & width = length and width of the petal, also in cm.  <br>
4) species: species of the flower. <br>

In [None]:
# Explore the data first: preview the first 10 rows.

data.head(10)

In [None]:
# Print a summary of the frame: columns, non-null counts and datatypes.
data.info()

All columns are numeric except for `species`.

In [None]:
# Count the empty (null) cells in each column.

data.isnull().sum()

There are no null cells in any column.

# **Feature Engineering**


We can categorize the species column into numbers and use that instead, so that every column has a numeric datatype (the mapped species column becomes `int64`).

In [None]:
# Encode the species labels as integers so that every column is numeric.
# With every label mapped the resulting column is int64 (not float64).
species_mapping = {"Iris-virginica": 1, "Iris-setosa": 2, "Iris-versicolor": 3}

# Only map when the column still holds the original labels. Re-running this
# cell on an already-encoded column would otherwise push the integer codes
# through the label dict and silently turn the whole column into NaN.
if not data['species'].isin(list(species_mapping.values())).all():
    encoded_species = data['species'].map(species_mapping)

    # .map() yields NaN for any label missing from the dict; fail loudly
    # instead of handing NaN targets to the models further down.
    unmapped_labels = data.loc[encoded_species.isna(), 'species'].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Unmapped species labels: {list(unmapped_labels)}")

    data['species'] = encoded_species.astype('int64')


In [None]:
# Preview the first rows to inspect the species column after mapping.
data.head()

successfully categorized the "species" column.

In [None]:
# Check the column datatypes again now that species has been mapped.
data.info()


We can see here that, after mapping the species column, its datatype changed from `object` to `int64`.

# **Modeling**

In [None]:
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np

**importing K-fold**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Cross-validation splitter: 10 folds, rows shuffled with a fixed seed so the
# same folds are produced on every run.
k_fold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

In [None]:
# k-nearest neighbours (k = 13), evaluated by cross-validated accuracy.
clf = KNeighborsClassifier(n_neighbors=13)
scoring = 'accuracy'

# Split the label column off the feature frame exactly once. After the first
# run `data` no longer has a 'species' column, so an unguarded re-run of this
# cell would raise KeyError; the guard makes the cell safe to execute again.
if 'species' in data.columns:
    target = data['species']
    data = data.drop(columns='species')

# One accuracy value per fold of `k_fold`.
score = cross_val_score(clf, data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# kNN: mean cross-validated accuracy as a percentage (2 decimals)
round(score.mean() * 100, 2)

In [None]:
# Decision tree, evaluated by cross-validated accuracy.
# random_state is fixed because the tree permutes features randomly when
# choosing splits; without a seed the scores change from run to run.
clf = DecisionTreeClassifier(random_state=0)
scoring = 'accuracy'

# One accuracy value per fold of `k_fold`.
score = cross_val_score(clf, data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# Decision tree: mean cross-validated accuracy as a percentage (2 decimals)
round(score.mean() * 100, 2)

In [None]:
# Random forest with 13 trees, evaluated by cross-validated accuracy.
# random_state is fixed because bootstrapping and feature sub-sampling are
# random; without a seed the scores change from run to run.
clf = RandomForestClassifier(n_estimators=13, random_state=0)
scoring = 'accuracy'

# One accuracy value per fold of `k_fold`.
score = cross_val_score(clf, data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# Random forest: mean cross-validated accuracy as a percentage (2 decimals)
round(score.mean() * 100, 2)

In [None]:
# Gaussian Naive Bayes: per-fold accuracy from cross-validation.
scoring = 'accuracy'
clf = GaussianNB()
score = cross_val_score(
    estimator=clf,
    X=data,
    y=target,
    scoring=scoring,
    cv=k_fold,
    n_jobs=1,
)
print(score)

In [None]:
# Naive Bayes: mean cross-validated accuracy as a percentage (2 decimals)
round(score.mean() * 100, 2)

In [None]:
# Support vector classifier (default settings): per-fold accuracy from
# cross-validation.
scoring = 'accuracy'
clf = SVC()
cv_options = dict(cv=k_fold, n_jobs=1, scoring=scoring)
score = cross_val_score(clf, data, target, **cv_options)
print(score)

In [None]:
# SVM: mean cross-validated accuracy as a percentage (2 decimals)
round(score.mean() * 100, 2)

# Clustering 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Preview the first rows of `data` before plotting.
data.head()

In [None]:
# Pairwise scatter plots and per-variable distributions for the two sepal
# measurements, to eyeball how sepal length and width relate.
sepal_columns = ['sepal_length', 'sepal_width']
sns.pairplot(data[sepal_columns])

In [None]:
#perform K-Mean Clustering 
import sklearn.cluster as cluster

In [None]:
# K-means with two clusters on two variables (sepal width and petal width).
# random_state is fixed because centroid initialisation is random; without a
# seed the cluster labels and centres can differ from run to run.
# n_init is set explicitly because its default differs between scikit-learn
# releases.
kmeans = cluster.KMeans(n_clusters=2, init="k-means++", n_init=10, random_state=0)
kmeans = kmeans.fit(data[['sepal_width', 'petal_width']])

In [None]:
# Coordinates of the fitted cluster centres.
kmeans.cluster_centers_

# Attaching cluster labels to the data

In [None]:
# Store each row's cluster label from the fitted model in a Clusters column.
data = data.assign(Clusters=kmeans.labels_)

In [None]:
# Preview the first rows of `data`.
data.head()

In [None]:
# Number of rows assigned to each cluster.
data['Clusters'].value_counts()

In [None]:
# Scatter plot of sepal width against petal width, coloured by cluster label.
sns.scatterplot(
    data=data,
    x="sepal_width",
    y="petal_width",
    hue="Clusters",
)