**IMPORTING THE LIBRARIES FOR THE DATA ANALYSIS**

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import os
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the iris CSV file into a DataFrame.
dataset_read = pd.read_csv(filepath_or_buffer="../input/iris-flower/iris.csv")

In [3]:
# Echo the loaded DataFrame as the cell output for a first look at the data.
dataset_read

**displaying first 10 rows of the dataset**

In [4]:
# head(10) returns the first 10 rows.
dataset_read.head(10)

**displaying last 10 rows of the dataset**

In [5]:
# tail() defaults to 5 rows; ask for 10 explicitly to show the last 10 rows.
dataset_read.tail(10)

**returning description of the data in the DataFrame.**

In [6]:
# describe() summarises the DataFrame's columns with descriptive statistics.
dataset_read.describe()

**listing the column names of the dataset**

In [7]:
# Column labels of the raw DataFrame.
dataset_read.columns

**We can see there are 5 columns; next, let's look at their data types.**

In [8]:
# Data type of each column.
dataset_read.dtypes

**checking the number of unique values in each column**

In [9]:
# nunique is a method: call it to get the number of distinct values per column
# (without the parentheses the cell only echoes the bound method object).
dataset_read.nunique()

**Renaming the columns for better visualization**

In [10]:
# Swap the verbose CSV headers for short snake_case names. rename() returns a
# new DataFrame, so dataset_read keeps its original column labels.
dataset_updated = dataset_read.rename(
    columns={
        'sepal length in cm': 'sepal_length',
        'sepal width in cm': 'sepal_width',
        'petal length in cm': 'petal_length',
        'petal width in cm': 'petal_width',
        'class': 'species',
    }
)

In [11]:
# Show the first rows with rich display to confirm the new column names
# (a bare expression renders as a table; print() only emits plain text).
dataset_updated.head()

**After getting all the necessary information, we can now start plotting our graphs.**

# Visualization is the key to change the world

In [12]:
# Line plot of petal length against sepal length.
sns.lineplot(data=dataset_updated, x="sepal_length", y="petal_length")


*Here is the comparison between sepal length and petal length*

In [13]:
# Scatter of petal length against sepal length, one point per flower.
sns.scatterplot(data=dataset_updated, x="sepal_length", y="petal_length")
plt.title("sepal length vs petal length")



In [14]:
# Draw both width columns against the row index; each line is labelled with
# its column name so the legend is built from the plotted data.
for width_column in ("sepal_width", "petal_width"):
    plt.plot(dataset_updated[width_column], label=width_column)
plt.legend()


In [15]:
# Set the default figure size BEFORE plotting: rcParams["figure.figsize"] only
# applies to figures created after the assignment.
plt.rcParams["figure.figsize"]=(18,5)

# Plot the four measurement columns against the row index. Each line carries
# its own column name as label, so every legend entry matches the line it
# describes (the fourth entry is petal_length).
for measurement in ("sepal_width", "petal_width", "sepal_length", "petal_length"):
    plt.plot(dataset_updated[measurement], label=measurement)
plt.legend()


# **The figure size is set through plt.rcParams["figure.figsize"], with width=18 and height=5**

In [16]:
# Regression fit of sepal width on sepal length, one colour per species.
# The legend is left to seaborn so its entries are the species (the hue
# levels) rather than hand-written labels that name the axes.
sns.lmplot(x="sepal_length", y="sepal_width", data=dataset_updated, hue='species')

# LETS START ANALYSING THE SPECIES

# A Multivariate Analysis

In [17]:
# Grid of pairwise relationships between the columns, coloured by species.
sns.pairplot(dataset_updated, hue = "species",palette='CMRmap_r')


**After graphing the features in a pair plot,** it is clear that the relationship between pairs of features of **iris-setosa (yellow)** is distinctly different from those of the other two species.

There is some overlap in the pairwise relationships of the other two species, **iris-versicolor (orange)** and **iris-virginica (blue)**.

# Histograms of sepal width, sepal length, petal width and petal length

In [18]:
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

# (column, bar colour) for each panel, left to right; the panel title is the
# column name itself.
histogram_specs = [
    ("sepal_length", "r"),
    ("petal_length", "b"),
    ("sepal_width", "g"),
    ("petal_width", "m"),
]
for panel, (column, colour) in zip(axes, histogram_specs):
    dataset_updated[column].hist(ax=panel, color=colour).set_title(column)

 

**Bar plots comparing each measurement across the species**

In [19]:
fig, axes = plt.subplots(2, 2, figsize=(16, 5))
plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=1)

# (feature, palette, grid row, grid column): sepal features fill the left
# column of the grid, petal features the right column.
panel_specs = [
    ("sepal_length", 'cool', 0, 0),
    ("sepal_width", 'cool', 1, 0),
    ("petal_length", 'CMRmap_r', 0, 1),
    ("petal_width", 'CMRmap_r', 1, 1),
]
for feature, palette_name, grid_row, grid_col in panel_specs:
    bar_axes = sns.barplot(
        x=dataset_updated["species"],
        y=dataset_updated[feature],
        palette=palette_name,
        ax=axes[grid_row][grid_col],
    )
    bar_axes.set_title('species vs ' + feature)


 

# Conclusions from the bivariate analysis

* **sepal length plot 1**

**Iris-setosa** has the shortest sepal length and **Iris-virginica** has the longest.

* **petal length plot 2**

**Iris-setosa** has the shortest petal length and **Iris-virginica** has the longest.

* **sepal width plot 3**

**Iris-setosa** has a larger sepal width than **Iris-virginica**.

* **petal width plot 4**

**Iris-setosa** has the smallest petal width and **Iris-virginica** has the largest.


In [20]:
# Share of rows belonging to each species, drawn as a slightly exploded pie.
species_counts = dataset_updated['species'].value_counts()
pie_axes = species_counts.plot.pie(
    explode=[0.04, 0.04, 0.04],
    shadow=True,
    autopct='%1.2f%%',
    colors=["lightcoral", "lightpink", "lightblue"],
)
pie_axes.set_title("iris species classifications")

**`corr()` computes the pairwise correlation between the numeric columns, skipping missing values; non-numeric columns such as the species label are not part of the correlation matrix.**

In [21]:
# Correlate only the numeric measurement columns. The species column still
# holds strings at this point, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 (older versions silently dropped such columns).
correlation=dataset_updated.select_dtypes(include="number").corr()

In [22]:

# Set the figure size first: rcParams["figure.figsize"] only affects figures
# created after the assignment, so it must precede the heatmap call.
plt.rcParams["figure.figsize"]=(5,5)
sns.heatmap(correlation,annot=True,cmap='CMRmap_r')

**CLEANING THE DUPLICATED DATA**

In [23]:
 # Rows flagged by duplicated(), i.e. rows that repeat an earlier row.
 dataset_updated[dataset_updated.duplicated()]

**Getting the count of duplicate rows**

In [24]:
 # How many rows are flagged as duplicates (True) versus not (False).
 dataset_updated.duplicated().value_counts()

In [25]:
# Discard repeated rows, keeping the first occurrence of each, then report
# the remaining (rows, columns).
dataset_updated = dataset_updated.drop_duplicates()
dataset_updated.shape

In [26]:
 # True if any row is still flagged as a duplicate.
 dataset_updated.duplicated().any()

**HERE WE CAN SEE THAT DUPLICATED DATA HAS BEEN REMOVED**

# MODEL IMPLEMENTATION

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import PolynomialFeatures 
from sklearn import svm
from sklearn.metrics import classification_report

# **First we need to label-encode the species column**
>  from sklearn import preprocessing

1.  iris-setosa
1.  iris-versicolor
1.  iris-virginica

**We need to convert these non-numeric values into numeric values, which the machine can understand and which are used for training the model.**

In [28]:
from sklearn import preprocessing
# Replace the species strings with integer class codes, overwriting the column.
label_encoder =preprocessing.LabelEncoder()
dataset_updated['species']= label_encoder.fit_transform(dataset_updated['species'])
  
# Distinct encoded values now present in the column.
dataset_updated['species'].unique()

**Now the species values have been converted into integer codes:**
1.  iris-setosa      **==0**
1.  iris-versicolor  **==1**
1.  iris-virginica   **==2**

In [29]:
# First five encoded species labels.
dataset_updated["species"].head()

**Splitting the data**

In [30]:
from sklearn.model_selection  import train_test_split

In [31]:
# Independent variables (feature matrix): every column except the target.
x = dataset_updated.drop(columns='species')

In [32]:
# dependent variable (target): the encoded species label
y = dataset_updated['species']

In [33]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_sepal_train, x_sepal_test, y_species_train, y_species_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [34]:
# Shapes of the four pieces produced by the split (train/test features, train/test targets).
x_sepal_train.shape, x_sepal_test.shape, y_species_train.shape, y_species_test.shape

In [35]:
# Training features.
x_sepal_train

In [36]:
# Test features.
x_sepal_test

In [37]:
# Training targets (encoded species).
y_species_train

In [38]:
# Test targets (encoded species).
y_species_test

# **Supervised linear regression algorithms**

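**Simple linear regression is a basic and commonly used type of predictive analysis.** Note that it predicts a continuous value, so applying it to the encoded species labels treats the class codes as numbers, and the `score` it reports is the coefficient of determination (R²), not a classification accuracy.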



In [39]:
from sklearn.linear_model import LinearRegression

In [40]:
# Linear regression estimator with default settings.
model=LinearRegression()

In [41]:
#feed data into model
# Fit the regression on the training split, then predict the (continuous)
# target values for the test rows.
model.fit(x_sepal_train,y_species_train)
y_species_predicted=model.predict(x_sepal_test)


In [42]:
from sklearn.metrics import accuracy_score

In [43]:
# LinearRegression.score returns the coefficient of determination (R^2), not
# a classification accuracy, so the printed value is labelled as R^2.
# It is scaled by 100 and rounded to two decimals.
sc_lr = round(model.score(x_sepal_test, y_species_test) * 100 , 2)

print("R^2 score (x100): ", str(sc_lr))

In [44]:
# Fitted intercept and per-feature coefficients of the linear model.
print(model.intercept_)
print(model.coef_)


In [45]:
 # Reproduce the linear model by hand: each feature is weighted by its
 # coefficient, the weighted features are summed per row, and the intercept
 # is added once (y = X . coef + intercept).
 y_pred = x_sepal_test.dot(model.coef_) + model.intercept_
 print(y_pred)

In [46]:
# Predictions of the fitted linear model for the test rows.
y_pred = model.predict(x_sepal_test)
print(y_pred)

In [47]:
# Side-by-side table of the true labels and the regression outputs. It gets
# its own name so the cleaned feature table held in dataset_updated is not
# overwritten (cells that read dataset_updated would otherwise fail on re-run).
prediction_comparison = pd.DataFrame({'Actual': y_species_test, 'Predicted': y_pred})
prediction_comparison

In [48]:

# The size must be set before the figure is created for it to apply to this plot.
plt.rcParams["figure.figsize"]=(4,8)
plt.plot(y_pred)
# x is the position of each test row, y is the regression output for that row.
plt.xlabel("test sample index")
plt.ylabel("predicted value (species code scale)")


In [49]:
# Histogram of the predicted values.
plt.hist(y_pred)

**2. Logistic regression**

In [50]:
from sklearn import linear_model
# Logistic-regression classifier; max_iter=130 caps the number of solver iterations.
logistic_model=linear_model.LogisticRegression(max_iter=130)
# Train on the training split.
logistic_model.fit(x_sepal_train,y_species_train)

In [51]:

# Predicted species codes for the test rows.
y_species_logistic_predicted=logistic_model.predict(x_sepal_test)

# NOTE(review): the original note here read "species iris senota" (presumably "setosa"); intent unclear - confirm or remove

In [52]:
# Classifier score on the test split, scaled to percent and rounded to two decimals.
sc_logr = round(logistic_model.score(x_sepal_test, y_species_test) * 100,2)
print("Accuracy: ", str(sc_logr) ,  " %")

In [53]:
from sklearn.metrics import confusion_matrix
# Compare the true test labels with the logistic-regression predictions
# (rows = actual class, columns = predicted class). Passing the true labels
# for both arguments would always give a perfect diagonal.
cm=confusion_matrix(y_species_test,y_species_logistic_predicted)
cm

In [54]:
# Set the figure size first: rcParams["figure.figsize"] only applies to
# figures created after the assignment.
plt.rcParams["figure.figsize"]=(10,2)
sns.heatmap(cm,annot=True,cmap='BuPu_r')
plt.xlabel('predicted value')
plt.ylabel('species')

# 3. KNN MODEL

In [55]:
# Fit a k-nearest-neighbours classifier on the training split.
# Libraries used by this cell and the KNN evaluation cells.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

# Instantiate the classifier with k = 3 neighbours.
knn_model = KNeighborsClassifier(n_neighbors=3)

# Train on the training features and labels.
knn_model.fit(x_sepal_train, y_species_train)

# Predict the species codes of the test rows.
y_knn_pred = knn_model.predict(x_sepal_test)

In [56]:
# Text classification report for the KNN predictions on the test split.
from sklearn.metrics import classification_report
print(classification_report(y_species_test, y_knn_pred))


In [57]:
# Confusion matrix of the true test labels against the KNN predictions.
knn_cm=confusion_matrix(y_species_test, y_knn_pred)
knn_cm

In [58]:
# KNN accuracy on the test split, scaled to percent for reporting.
accuracy = accuracy_score(y_species_test, y_knn_pred)*100
print('Accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')

In [59]:
# Set the figure size first: rcParams["figure.figsize"] only applies to
# figures created after the assignment.
plt.rcParams["figure.figsize"]=(10,2)
sns.heatmap(knn_cm,annot=True,cmap='BuPu_r')
plt.xlabel('predicted value')
plt.ylabel('species')

In [60]:
# Collect each model's test-set score (in percent). sc_lr comes from
# LinearRegression.score, i.e. it is an R^2 value, while the other two are
# classification accuracies; the bar label says so explicitly.
scores_plt = [sc_lr,sc_logr,accuracy]
algorithms = ["Linear Regression (R^2)","Logistic Regression","KNN"]
sns.set(rc={'figure.figsize':(11,6)})

# seaborn needs x/y as keyword arguments: positional data arguments were
# deprecated in 0.11 and are rejected from 0.12 on.
score_axes = sns.barplot(x=algorithms, y=scores_plt)
score_axes.set_xlabel("Algorithms")
score_axes.set_ylabel("Score (%)")

# Evaluation

**knn model**

In [61]:
# Three hand-made flowers to classify. They are wrapped in a DataFrame that
# reuses the training column names because the model was fitted on a
# DataFrame; a bare array makes scikit-learn warn about missing feature names.
X_new = pd.DataFrame(
    [[3, 2, 1, 0.2], [4.9, 2.2, 3.8, 1.1], [5.3, 2.5, 4.6, 1.9]],
    columns=x_sepal_train.columns,
)
# Prediction of the species from the input vectors
prediction = knn_model.predict(X_new)
print("Prediction of Species: {}".format(prediction))

**logistic model**

In [62]:
# The same three hand-made flowers, classified by the logistic model. They
# are wrapped in a DataFrame that reuses the training column names because
# the model was fitted on a DataFrame; a bare array makes scikit-learn warn
# about missing feature names.
X_new = pd.DataFrame(
    [[3, 2, 1, 0.2], [4.9, 2.2, 3.8, 1.1], [5.3, 2.5, 4.6, 1.9]],
    columns=x_sepal_train.columns,
)
# Prediction of the species from the input vectors
prediction = logistic_model.predict(X_new)
print("Prediction of Species: {}".format(prediction))

In [63]:
# A single flower to classify. It is wrapped in a one-row DataFrame that
# reuses the training column names because the model was fitted on a
# DataFrame; a bare array makes scikit-learn warn about missing feature names.
X_new = pd.DataFrame([[4.6, 3.4, 1.4, 0.3]], columns=x_sepal_train.columns)
# Prediction of the species from the input vector
prediction = logistic_model.predict(X_new)
print("Prediction of Species: {}".format(prediction))

# iris-setosa ==0
# iris-versicolor ==1
# iris-virginica ==2