In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Location of the Iris CSV inside the Kaggle input directory.
file = '/kaggle/input/iris-flower-dataset/IRIS.csv'
# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

# EXPLORATORY DATA ANALYSIS

In [None]:
df.shape  # (number of rows, number of columns) of the dataset

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Number of rows for each distinct value of the target column.
species_counts = df['species'].value_counts()
species_counts

In [None]:
# Bar chart of the number of rows per species.
df['species'].value_counts().plot(kind='bar')

In [None]:
# Count plot of the target classes.
# Pass the column by keyword: in seaborn >= 0.12 the only positional argument
# is `data`, so a positionally passed Series is no longer interpreted as `x`.
sns.countplot(data=df, x='species')

In [None]:
# Histogram of the sepal length values.
sns.histplot(data=df, x='sepal_length')

In [None]:
# Density histogram with a KDE curve for sepal length.
# `sns.distplot` is deprecated (and removed in recent seaborn releases);
# `histplot(..., kde=True, stat='density')` is the documented replacement.
sns.histplot(data=df, x='sepal_length', kde=True, stat='density')

In [None]:
# Bar plot of sepal length grouped by species.
sns.barplot(data=df, x='species', y='sepal_length')

In [None]:
# Density histogram with a KDE curve for sepal width.
# `sns.distplot` is deprecated (and removed in recent seaborn releases);
# `histplot(..., kde=True, stat='density')` is the documented replacement.
sns.histplot(data=df, x='sepal_width', kde=True, stat='density')

In [None]:
# Bar plot of sepal width grouped by species.
sns.barplot(data=df, x='species', y='sepal_width')

In [None]:
# Density histogram with a KDE curve for petal length.
# `sns.distplot` is deprecated (and removed in recent seaborn releases);
# `histplot(..., kde=True, stat='density')` is the documented replacement.
sns.histplot(data=df, x='petal_length', kde=True, stat='density')

In [None]:
# Bar plot of petal length grouped by species.
sns.barplot(data=df, x='species', y='petal_length')

In [None]:
# Density histogram with a KDE curve for petal width.
# `sns.distplot` is deprecated (and removed in recent seaborn releases);
# `histplot(..., kde=True, stat='density')` is the documented replacement.
sns.histplot(data=df, x='petal_width', kde=True, stat='density')

In [None]:
# Bar plot of petal width grouped by species.
sns.barplot(data=df, x='species', y='petal_width')

In [None]:
# Observation: petal length and petal width are not normalised, so we will handle that in a later stage,
# together with encoding the target variable.

In [None]:
# Draw box plots of the DataFrame's columns on one large set of axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, width=0.5, fliersize=6, ax=ax)


# Feature Engineering

In [None]:
# Encode the species labels as integers for modelling.
# The guard keeps this cell idempotent: calling .map() a second time on the
# already encoded column would match none of the keys and turn every value
# into NaN.
species_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
if not pd.api.types.is_numeric_dtype(df['species']):
    df['species'] = df['species'].map(species_codes)


In [None]:
# Re-check the class counts after encoding the target column.
df.loc[:, 'species'].value_counts()

In [None]:
# Pairwise Pearson correlation between the columns.
df.corr(method='pearson')

In [None]:
# Separate the target column from the feature columns.
y = df['species']
X = df.drop(columns='species')

In [None]:
# Let's see how every feature is distributed across the target classes.
# The grid keeps the original 3x3 layout but grows extra rows when there are
# more than nine features, so no column is silently left out.
n_cols = 3
n_rows = max(3, -(-len(X.columns) // n_cols))  # ceiling division
plt.figure(figsize=(20,25), facecolor='white')

for plotnumber, column in enumerate(X.columns, start=1):
    ax = plt.subplot(n_rows, n_cols, plotnumber)
    # seaborn >= 0.12 only accepts `data` positionally, so x and y must be
    # passed by keyword (the positional form raises a TypeError).
    sns.stripplot(x=y, y=X[column], ax=ax)
plt.tight_layout()

In [None]:
from sklearn.preprocessing import StandardScaler 
# Standardise the feature matrix: learn the scaling from X, then apply it.
scalar = StandardScaler()
scalar.fit(X)
X_scaled = scalar.transform(X)

In [None]:
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

# Feature Selection

In [None]:
# Variance inflation factor of every column of the scaled feature matrix,
# listed next to the corresponding feature name.
vif_scores = [
    variance_inflation_factor(X_scaled, col_idx)
    for col_idx in range(X_scaled.shape[1])
]
vif = pd.DataFrame({"vif": vif_scores, "Features": X.columns})

# let's check the values
vif


A VIF above 5 is a common sign of multicollinearity. Not every VIF value in the table above is below that threshold, so we should not fit the model on all of the features yet. We first deal with the collinear features and re-check the VIF; after that we can split the data into training and test sets.

The features sepal_length, petal_length and petal_width show the highest multicollinearity, so we drop one of them (petal_length) and recompute the VIF on the remaining features.

In [None]:
# Keep only the features retained after the VIF check (petal_length is left out).
kept_features = ['sepal_length', 'sepal_width', 'petal_width']
X_new = df.loc[:, kept_features]

In [None]:
# Refit the existing scaler on the reduced feature set, then transform it.
scalar.fit(X_new)
X_scaled_new = scalar.transform(X_new)

In [None]:
# Recompute the variance inflation factors for the reduced, rescaled feature set.
vif_scores_new = [
    variance_inflation_factor(X_scaled_new, col_idx)
    for col_idx in range(X_scaled_new.shape[1])
]
vif = pd.DataFrame({"vif": vif_scores_new, "Features": X_new.columns})

# let's check the values
vif


All the VIF values are less than 5 and are very low. That means no multicollinearity. Now, we can go ahead with fitting our data to the model. Before that, let's split our data in test and training set.

In [None]:
# Split first, then fit the scaler on the training rows only, so that no
# statistics (mean / standard deviation) from the held-out test rows leak into
# the features the models are trained on.
# Same test_size and random_state as before, so the same rows land in each split.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X_new, y, test_size=0.25, random_state=355
)
split_scaler = StandardScaler()
x_train = split_scaler.fit_transform(x_train_raw)
x_test = split_scaler.transform(x_test_raw)  # reuse the training statistics

In [None]:
# Fit a logistic regression classifier with default settings on the training split.
log_reg = LogisticRegression()
log_reg.fit(X=x_train, y=y_train)

In [None]:
# Predict the class of every row in the test split.
y_pred = log_reg.predict(X=x_test)

In [None]:
# Fraction of test rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

In [None]:
# Confusion matrix of the logistic regression predictions on the test split.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_mat

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Baseline decision tree with default hyperparameters.
# A fixed random_state makes the fitted tree reproducible across runs; the
# grid search below uses this estimator, so its candidates are seeded too.
clf = DecisionTreeClassifier(random_state=355)
clf.fit(x_train,y_train)

In [None]:
# Accuracy of the baseline tree on the test split.
clf.score(X=x_test, y=y_test)

Let's now try to tune some hyperparameters using the GridSearchCV algorithm.

GridSearchCV is a method used to tune our hyperparameters. We can pass different values of each hyperparameter as a parameter grid for the search. It exhaustively generates every combination of the parameter values passed. Using the cross-validation score, grid search returns the combination of hyperparameters for which the model performs best.

In [None]:
# Hyperparameter values to be combined exhaustively by the grid search.
grid_param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 32),
    min_samples_leaf=range(1, 10),
    min_samples_split=range(2, 10),
    splitter=['best', 'random'],
)

In [None]:
# 5-fold cross-validated search over grid_param, using all available CPU cores.
grid_search = GridSearchCV(
    clf,
    grid_param,
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the search on the training split.
grid_search.fit(X=x_train, y=y_train)

In [None]:
# Parameter combination that achieved the best cross-validation score.
best_parameters = grid_search.best_params_
print(f"{best_parameters}")

In [None]:
# Mean cross-validated score obtained with the best parameter combination.
grid_search.best_score_

In [None]:
# Now refit the decision tree with the tuned parameters

In [None]:
# Refit with the parameters actually selected by the grid search, instead of
# values copied by hand from an earlier run (which go stale whenever the search
# result changes). random_state pins any randomness in the tree so the score
# below is reproducible.
clf = DecisionTreeClassifier(**grid_search.best_params_, random_state=355)
clf.fit(x_train,y_train)

In [None]:
# Caveat: the tuned tree may still have low bias but high variance, i.e. it
# may not generalise as well to unseen data.
clf.score(X=x_test, y=y_test)

Great! Our test score has improved after using GridSearchCV.

Note: we must understand that passing all the hyperparameters to GridSearchCV doesn't guarantee the best result. We still have to experiment with the parameter values by trial and error to get the best score.

