## Iris Flower Data Dictionary:
* The data set consists of 50 samples from each of three species of Iris (Iris Setosa, Iris virginica, and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters.
* The dataset contains 150 records with 5 attributes: Petal Length, Petal Width, Sepal Length, Sepal Width, and Class (Species).

In [None]:
# Necessary imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import plot_roc_curve
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide matplotlib defaults: larger axis/tick/legend fonts and a wide default figure
plt.rcParams.update({
    'axes.labelsize': 15.,
    'xtick.labelsize': 15.,
    'ytick.labelsize': 15.,
    'figure.figsize': [15., 8.],
    'legend.fontsize': 13.,
})

In [None]:
# Import DataSet: read the Iris CSV into the DataFrame `df` that the later cells work on.
# NOTE(review): the relative "../input/..." path presumably targets a Kaggle-style layout — adjust when running elsewhere.
df = pd.read_csv("../input/iris-flower-dataset/IRIS.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected
df.head()

## EDA: Univariate, Bivariate and Multivariate Analysis

In [None]:
# Size of the dataset: `size` is the total number of elements (rows x columns),
# not the number of records; `df.shape` would give the (rows, columns) breakdown.
df.size

In [None]:
# List the column labels (the feature names plus the `species` target used below)
df.columns

In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Count missing values per column
df.isna().sum()

### There are no missing values in the dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Number of samples per species, to check the class balance of the target
df['species'].value_counts()

### The target classes are equally distributed

In [None]:
# One count plot per measurement, stacked vertically, with bars split by species
plt.figure(figsize=(15, 20))
sns.set_theme(style='darkgrid')
for panel, column in enumerate(["sepal_length", "sepal_width", "petal_length", "petal_width"], start=1):
    plt.subplot(4, 1, panel)
    sns.countplot(data=df, x=column, hue="species", palette="magma")

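### Comments (species with the tallest bar, i.e. the most frequent single value, for each feature):
* Highest sepal_length count : `Iris_setosa`
* Highest sepal_width count  : `Iris_virginica`
* Highest petal_length count : `Iris_setosa`
* Highest petal_width count  : `Iris_setosa`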

In [None]:
# 2x2 grid of histograms (with KDE overlay), one per measurement, all species pooled
plt.figure(figsize=(20, 15))
sns.set_theme(style='darkgrid')
for panel, column in enumerate(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], start=1):
    plt.subplot(2, 2, panel)
    sns.histplot(x=df[column], kde=True)

### > sepal_length: The distribution is unimodal, the range is roughly 4.5 to 8.0, the mean is 5.84 and the median is 5.8

### > sepal_width: The distribution is unimodal, the range is roughly 1.9 to 4.5, the mean is 3.05 and the median is 3.0

### > petal_length: The distribution is bimodal, the range is roughly 1.0 to 7.0, the mean is 3.75 and the median is 4.35

### > petal_width: The distribution is bimodal, the range is roughly 0 to 2.5, the mean is 1.20 and the median is 1.3

In [None]:
# Scatter-plot matrix of every pair of features, coloured by species
sns.pairplot(data=df, hue="species");

We can observe that the setosa species has distinct characteristics: it has the highest sepal width but the lowest sepal length, and its petal length and petal width are the lowest of the three species.
Versicolor is of average size in sepal length/width and petal length/width compared to the other two categories, whereas virginica has high sepal length but comparatively low sepal width, along with high petal length and width.

## Correlation between Features of Flower Categories

In [None]:
# Pairwise Pearson correlation between the numeric measurement columns.
# `df` still contains the string-valued `species` column; DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 (older versions silently dropped it), so restrict
# the frame to numeric columns explicitly. The result matches what older pandas returned.
corr_matrix = df.select_dtypes(include="number").corr()
corr_matrix

In [None]:
# Annotated heatmap of the feature correlation matrix `corr_matrix`.
# NOTE(review): 'Accent' is a qualitative (categorical) colormap; a diverging map such as
# 'coolwarm' would probably show correlation strength more clearly — confirm before changing.
sns.heatmap(corr_matrix, annot=True, cmap='Accent');

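### As we can see, petal_length is highly correlated with petal_width. (Species is categorical, so it is not part of this matrix; the pair plot above shows how well petal_length separates the species.)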

In [None]:
# 2x2 grid of per-feature histograms (with KDE overlay), split by species
plt.figure(figsize=(20, 15))
sns.set_theme(style="darkgrid")
for panel, column in enumerate(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], start=1):
    plt.subplot(2, 2, panel)
    sns.histplot(data=df, x=column, hue='species', kde=True)

In [None]:
# 2x2 grid of box plots: distribution of each measurement within each species
plt.figure(figsize=(20, 15))
sns.set_theme(style="darkgrid")
for panel, column in enumerate(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(data=df, x='species', y=column)

### As we can see, there are some outliers in virginica for sepal_length and sepal_width, in setosa and versicolor for petal_length, and in setosa for petal_width.

## Modeling

In [None]:
# Feature matrix: every column except the target label
x = df.drop(columns="species")
# Target vector: the species label of each sample
y = df.loc[:, "species"]

In [None]:
# Split into training (70%) and test (30%) sets.
# The seed is passed to the splitter directly, so the partition no longer depends on the
# state of NumPy's global RNG (which any re-ordered or re-run cell can disturb).
# It should reproduce the same partition the global seed produced before.
np.random.seed(123)  # kept for any later code that still relies on the global seed
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=123)

In [None]:
# Peek at the first rows of the training features
x_train.head()

In [None]:
# Peek at the matching training labels
y_train.head()

In [None]:
# Candidate classifiers keyed by display name; each is constructed with library defaults
models = dict([
    ("LogisticRegression", LogisticRegression()),
    ("RandomForest Classifier", RandomForestClassifier()),
    ("KNN", KNeighborsClassifier()),
])

def fit_and_score(models, x_train, x_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        The estimators are fitted in place.
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels passed to ``score``.

    Returns
    -------
    dict
        ``{name: score}`` in the same order as ``models``.

    Notes
    -----
    Re-seeds NumPy's global random generator with 123 before fitting.
    """
    np.random.seed(123)

    def _train_then_evaluate(estimator):
        # Fit first, then score; the return value of fit() is deliberately ignored.
        estimator.fit(x_train, y_train)
        return estimator.score(x_test, y_test)

    return {label: _train_then_evaluate(estimator) for label, estimator in models.items()}

In [None]:
# Train each baseline model and collect its accuracy on the test split
scores = fit_and_score(models, x_train, x_test, y_train, y_test)
scores

## Comparing Model Results (prior to hyperparameter tuning)

In [None]:
# Single-row frame (index "accuracy") with one column per model; transposed for
# plotting so that the model names run along the x-axis
compare_model = pd.DataFrame(scores, index=["accuracy"])
compare_model.T.plot.line(figsize=(10, 6));

### KNN gives better results than the other two models

## Hyperparameter Tuning

### Logistic Regression 

In [None]:
# Search space for LogisticRegression, split into one sub-grid per solver family so that
# only solver/penalty pairs the solver actually supports are tried:
#   * lbfgs / newton-cg / sag : 'l2' or no penalty
#   * liblinear               : 'l1' or 'l2'
#   * saga                    : 'l1', 'l2' or no penalty
# The previous single grid crossed every penalty with every solver, so roughly half of
# the 1600 candidates (e.g. lbfgs + 'l1', or 'elasticnet' without an `l1_ratio`) failed
# on every fold. Every pair that could fit successfully is still covered here.
# NOTE: candidate order differs from the flat grid, so ties in CV score may resolve to a
# different (equally good) `best_params_`.
# NOTE(review): newer scikit-learn spells "no penalty" as None instead of 'none' — confirm
# against the installed version.
log_C_values = np.logspace(-4, 4, 20)
log_max_iters = [100, 1000, 2500, 5000]

log_grid = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'],
     'penalty': ['l2', 'none'],
     'C': log_C_values,
     'max_iter': log_max_iters},
    {'solver': ['liblinear'],
     'penalty': ['l1', 'l2'],
     'C': log_C_values,
     'max_iter': log_max_iters},
    {'solver': ['saga'],
     'penalty': ['l1', 'l2', 'none'],
     'C': log_C_values,
     'max_iter': log_max_iters},
]

In [None]:
# Exhaustive 5-fold cross-validated search over `log_grid`.
# `n_jobs=-1` now sits on GridSearchCV (as in the random-forest search) so the candidate
# fits run in parallel; on the estimator it did not parallelise the search at all.
# Because parallel workers do not inherit the global NumPy seed, the stochastic solvers
# (sag/saga) get their seed through `random_state` instead.
np.random.seed(123)
model_log = GridSearchCV(LogisticRegression(random_state=123),
                        param_grid=log_grid,
                        cv=5,
                        verbose=True,
                        n_jobs=-1)
model_log.fit(x_train,y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score
model_log.best_params_

In [None]:
# Mean cross-validated score of that best combination
model_log.best_score_

### RandomForestClassifier

In [None]:
# Search space for RandomForestClassifier.
# * n_estimators: 10 evenly spaced values from 10 to 80, truncated to integers.
# * max_features: only 'sqrt'. For classifiers 'auto' was an alias of 'sqrt', so listing
#   both fitted every candidate twice; 'auto' has also been removed from newer
#   scikit-learn releases, where those candidates would simply fail.
rf_grid = {'n_estimators': [int(n) for n in np.linspace(start=10, stop=80, num=10)],
           'max_features': ['sqrt'],
           'max_depth': [2, 4],
           "min_samples_split": [2, 5],
           "min_samples_leaf": [1, 2],
           'bootstrap': [True, False]}

In [None]:
# Exhaustive 5-fold cross-validated search over `rf_grid`, fitted in parallel.
# With n_jobs=-1 the candidate forests are fitted in worker processes, which the global
# np.random.seed() call does not reach, so the forest is seeded through `random_state`
# to make the search reproducible from run to run.
np.random.seed(123)
model_rf = GridSearchCV(RandomForestClassifier(random_state=123),
                       param_grid=rf_grid,
                       cv=5,
                       verbose=True,
                       n_jobs=-1)
model_rf.fit(x_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score
model_rf.best_params_

In [None]:
# Mean cross-validated score of that best combination
model_rf.best_score_

### KNN

In [None]:
# Search space for KNeighborsClassifier: neighbourhood size, vote weighting and distance metric
knn_grid = dict(
    n_neighbors=[3, 5, 11, 19],
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

In [None]:
np.random.seed(123)

# Exhaustive 5-fold cross-validated search over `knn_grid`
knn_search_settings = dict(param_grid=knn_grid, cv=5, verbose=True)
model_knn = GridSearchCV(KNeighborsClassifier(n_jobs=-1), **knn_search_settings)
model_knn.fit(x_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score
model_knn.best_params_

In [None]:
# Mean cross-validated score of that best combination
model_knn.best_score_

## Conclusion:

### KNeighborsClassifier performs best after hyperparameter tuning.

In [None]:
# Predict species for the held-out test features with the tuned KNN search object
y_preds = model_knn.predict(x_test)

In [None]:
# Show the predicted labels
y_preds

In [None]:
# Confusion matrix of true (rows) vs. predicted (columns) labels on the test set
confusion_matrix(y_test,y_preds)

In [None]:
# Heatmap of the test-set confusion matrix.
# The class order is fixed explicitly (sorted labels, matching confusion_matrix's default)
# and reused for the tick labels, so each row/column is named after its species rather
# than an anonymous 0/1/2 index. Axis titles say which axis is truth and which prediction.
class_labels = sorted(set(y_test) | set(y_preds))
plt.figure(figsize=(10,6))
sns.heatmap(confusion_matrix(y_test, y_preds, labels=class_labels),
            annot=True,  # Annotate the boxes
            fmt="d",     # counts are integers; avoid scientific notation
            cbar=False,
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel("Predicted species")
plt.ylabel("True species");

In [None]:
# Per-class precision, recall, F1-score and support on the test set
print(classification_report(y_test, y_preds))