In [None]:
#importing the pandas library to read data from the file system
import pandas as pd

### Data Set Information:

This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.

Predicted attribute: class of iris plant.

This is an exceedingly simple domain.

This data differs from the data presented in Fisher's article (identified by Steve Chadwick, spchadwick '@' espeedaz.net ). The 35th sample should be: 4.9,3.1,1.5,0.2,"Iris-setosa" where the error is in the fourth feature. The 38th sample should be: 4.9,3.6,1.4,0.1,"Iris-setosa" where the errors are in the second and third features.

#### Attribute Information:

1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:
* -- Iris Setosa
* -- Iris Versicolour
* -- Iris Virginica


In [None]:
# Load the Iris CSV from the relative input directory into the `data` DataFrame.
iris_csv_path = '../input/iris-flower-dataset/IRIS.csv'
data = pd.read_csv(iris_csv_path)

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# Show the dtype pandas assigned to each column when reading the CSV.
data.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

#### Standardize a dataset along any axis.

Center to the mean and component wise scale to unit variance.

In [None]:
from sklearn.preprocessing import scale

In [None]:
# Standardize the four measurement columns (zero mean, unit variance per column).
# NOTE: the scaling statistics are computed from every row, including the rows
# that are later held out as the test set.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
features = scale(data[feature_columns])

In [None]:
# Sanity check on the scaled features: each column's mean should be ~0 and its std ~1.
pd.DataFrame(features).describe()

In [None]:
# Use the species column as the prediction target.
target = data.loc[:, 'species']

In [None]:
# Count how many rows carry each species label (checks class balance).
target.value_counts()

In [None]:
#label encoder from sklearn for label encoding
from sklearn.preprocessing import LabelEncoder

In [None]:
# Convert the text species labels into integer class codes.
# The fitted encoder is kept in a variable so `species_encoder.classes_`
# can map the integer codes back to the original label strings.
species_encoder = LabelEncoder()
target = pd.Series(species_encoder.fit_transform(target))

In [None]:
# Re-check the class counts after encoding; they should match the counts of the text labels.
target.value_counts()

In [None]:
#importing train_test_split from sklearn's model_selection package to split the data into train and test sets
from sklearn.model_selection import train_test_split

In [None]:
# Split the data into train and test sets, holding out 15% of the rows for testing.
# random_state fixes the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same metrics); stratify=target keeps the three
# classes in the same proportions in both partitions, which matters with such
# a small test set.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.15,
    random_state=42,
    stratify=target,
)

In [None]:
# Print the row count of each split, one per line, in the order:
# x_train, y_train, x_test, y_test.
for split_part in (x_train, y_train, x_test, y_test):
    print(len(split_part))

In [None]:
#import naive bayes classifier from sklearn package
from sklearn.naive_bayes import GaussianNB

In [None]:
# Create a Gaussian Naive Bayes classifier object with default parameters.
bc=GaussianNB()

In [None]:
# Train the classifier on the training features and labels.
bc.fit(x_train,y_train)

In [None]:
# Predict class codes for the held-out test features and store them in `pred`.
pred=bc.predict(x_test)

In [None]:
#importing metrics
from sklearn.metrics import precision_score,recall_score,f1_score,classification_report,accuracy_score

In [None]:
# scikit-learn metric functions take the ground truth first: (y_true, y_pred).
# Micro averaging pools true/false positives and false negatives over all
# classes before computing each score.
print('accuracy:', accuracy_score(y_test, pred))
print('precision:', precision_score(y_test, pred, average='micro'))
print('recall:', recall_score(y_test, pred, average='micro'))
print('f1_score:', f1_score(y_test, pred, average='micro'))

* **binary**: Only report results for the class specified by pos_label. This is applicable only if targets (y_{true,pred}) are binary.
* **micro**: Calculate metrics globally by counting the total true positives, false negatives and false positives.
* **macro**: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
* **weighted**: Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.


In [None]:
# Per-class precision, recall, F1 and support. The ground truth must be the
# first argument: with the arguments swapped, precision and recall trade places
# and the support column counts predictions instead of true labels.
print(classification_report(y_test, pred))