Setting Up: Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

Importing the dataset

In [None]:
# Location of the Iris CSV, relative to the notebook's working directory
IRIS_CSV_PATH = "../input/iris-flower-dataset/IRIS.csv"
dataset = pd.read_csv(IRIS_CSV_PATH)

Look at the Data 

In [None]:
# Preview the first five rows to sanity-check the load
dataset.head(n=5)

In [None]:
# Print column dtypes, non-null counts and memory usage
dataset.info()

Checking for missing values

In [None]:
# Number of missing values per column.
# isnull() yields a boolean mask; sum() counts the True entries. (count() would
# just return the number of rows for every column, since the mask has no NaNs.)
dataset.isnull().sum()

Now look at the outliers and some more summary statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

In [None]:
# Features: every column except the target label
X = dataset.drop(columns=['species'])
# Target: the species label to be predicted
Y = dataset['species']

Checking the shape

In [None]:
# (rows, columns) of the feature frame and (rows,) of the target series
(X.shape, Y.shape)

Visualizing 
1. Correlations among data

In [None]:
# Correlate only the numeric feature columns: the 'species' label column is
# presumably non-numeric (it is used as the class label below), and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
# annot=True writes each correlation coefficient onto its square.
sns.heatmap(dataset.select_dtypes(include="number").corr(), annot=True);

In [None]:
sns.set_style("whitegrid")
# Pairwise scatter plots of all features, coloured by species.
# `height` replaces the `size` keyword, which was renamed in seaborn 0.9 and is deprecated.
sns.pairplot(dataset, hue="species", markers='+', height=4)
plt.show()

Modeling

The dataset consists of 3 unique species of iris flowers

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=5
)
# Report the shape of each piece: train features/labels, then test features/labels
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

Standardize

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform to
# the test split so that no test-set statistics leak into training.
scaler = StandardScaler()
# The scaler returns plain arrays; rebuild DataFrames with the original column
# names AND the original row index, so the features stay label-aligned with
# y_train / y_test (which keep the index produced by train_test_split).
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

Baseline Prediction

In [None]:
# Baseline classifier: logistic regression with default settings,
# trained on the standardized training split (fit() returns the estimator itself)
model = LogisticRegression().fit(X_train, y_train)
# Mean accuracy on the held-out test split
model.score(X_test, y_test)

The test score is already very high, but we can use the cross-validated score to confirm the model's strength.


In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split; one accuracy value per fold
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
# Average accuracy across the folds
print(scores.mean())

Coefficients


In [None]:
# One row of fitted coefficients per model output, one column per feature
feature_names = X_train.columns
df_coef = pd.DataFrame(data=model.coef_, columns=feature_names)
df_coef

Prediction

In [None]:
# Predicted species for every row of the test split
predictions = model.predict(X_test)
# Side-by-side table of true vs. predicted labels, renumbered from 0
compare_df = pd.DataFrame(
    {'actual': y_test, 'predicted': predictions}
).reset_index(drop=True)
compare_df

Apply K-nearest neighbour

In [None]:
# k-NN with k=3; Minkowski metric with p=2 is the Euclidean distance
classifier = KNeighborsClassifier(
    n_neighbors=3,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

In [None]:
# k-NN predictions for the test split
y_pred = classifier.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the logistic regression predictions
# (rows: actual class, columns: predicted class)
logreg_cm = confusion_matrix(y_test, predictions)
pd.DataFrame(logreg_cm)

Confusion Matrix

In [None]:
# Confusion matrix for the k-NN predictions
# (rows: actual class, columns: predicted class)
knn_cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(knn_cm)

Classification Scores

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic regression predictions
logreg_report = classification_report(y_test, predictions)
print(logreg_report)

Predicting Probability

In [None]:
# Class-membership probabilities for every test row (one column per class)
probs = model.predict_proba(X_test)
# Put the probabilities into a DataFrame for easier viewing. Reuse `probs`
# instead of calling predict_proba a second time, and derive the column names
# from the array width so the cell does not hard-code the number of classes.
Y_pp = pd.DataFrame(
    probs,
    columns=[f"class_{i}_pp" for i in range(probs.shape[1])],
)
Y_pp.head()