In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Read the iris CSV into a DataFrame; the path is relative to the
# notebook's working directory.
dataset = pd.read_csv(
    filepath_or_buffer='../input/iris-flower-dataset/IRIS.csv',
)

In [None]:
# View the data. It is useful to see every summary together for comparison.
print(dataset.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly instead of being wrapped in print() (which
# would emit an extra "None" line).
dataset.info()
print(dataset.describe())
print(dataset.shape)
# Number of rows per species.
print(dataset.groupby('species').size())

In [None]:
# Strip plot of every feature column, coloured by species.
# The feature columns are selected by name (everything except 'species')
# rather than by position, so the plot does not depend on column order.
fig = px.strip(
    dataset,
    x=[column for column in dataset.columns if column != 'species'],
    # The figure is a strip plot, so the title says so.
    title='Iris dataset strip plot',
    color=dataset.species,
)
fig.show()

In [None]:
# Width-vs-length scatter plots, one for sepals and one for petals,
# with points coloured by species.
for x_col, y_col in (
    ("sepal_width", "sepal_length"),
    ("petal_width", "petal_length"),
):
    fig = px.scatter(dataset, x=x_col, y=y_col, color="species")
    fig.show()

In [None]:
# Feature matrix: every column except the label.
X = dataset.drop(columns='species')
# Target vector: the species label.
Y = dataset['species']

In [None]:
# Hold out 25% of the rows as a validation set; the fixed random_state
# makes the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=99,
)

In [None]:
# Candidate classifiers to compare. The short label is used both in the
# printed summary and as the column name in Score_df.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# A single splitter is shared by all models. With shuffle=True and an
# integer random_state the folds are reproducible, so every model is scored
# on the same 10 stratified train/test partitions of the training data.
kfold = StratifiedKFold(n_splits=10, random_state=99, shuffle=True)

# Per-fold accuracies, keyed by model label.
cv_scores = {}
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    cv_scores[name] = cv_results
    print('%s: mean - %f std - %f' % (name, cv_results.mean(), cv_results.std()))

# One column per model, one row per cross-validation fold.
Score_df = pd.DataFrame(cv_scores)

In [None]:
# Box plot of the per-fold accuracies, one box per model column in Score_df.
fig = px.box(
    data_frame=Score_df,
    title='Cross Validation Results',
)
fig.show()

KNN, SVM and LDA have the best cross-validation scores, so these three models are evaluated on the held-out validation set below.

In [None]:
# Fit an SVC on the training split (fit() returns the estimator itself)
# and predict the validation split.
model = SVC(gamma='auto').fit(X_train, Y_train)
predictions = model.predict(X_val)

# Accuracy, confusion matrix and per-class report, one after another.
print(
    accuracy_score(Y_val, predictions),
    confusion_matrix(Y_val, predictions),
    classification_report(Y_val, predictions),
    sep='\n',
)

In [None]:
# Fit a k-nearest-neighbours classifier with default settings on the
# training split and predict the validation split.
model2 = KNeighborsClassifier()
model2.fit(X_train, Y_train)
predictions2 = model2.predict(X_val)

# Print accuracy, confusion matrix and classification report in that order.
print(
    *(
        metric(Y_val, predictions2)
        for metric in (accuracy_score, confusion_matrix, classification_report)
    ),
    sep='\n',
)

In [None]:
# Fit linear discriminant analysis on the training split and predict the
# validation split.
model3 = LinearDiscriminantAnalysis().fit(X_train, Y_train)
predictions3 = model3.predict(X_val)

# Validation metrics: overall accuracy, confusion matrix, per-class report.
print(accuracy_score(y_true=Y_val, y_pred=predictions3))
print(confusion_matrix(y_true=Y_val, y_pred=predictions3))
print(classification_report(y_true=Y_val, y_pred=predictions3))