### Iris Model Development

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the Iris dataset from a CSV file (path is relative to the notebook's
# working directory) into a pandas DataFrame
data = pd.read_csv("data/Iris_data.csv")

# Preview the first rows to sanity-check the columns and values
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Check the shape of the dataset as a (rows, columns) tuple
data.shape

(150, 5)

In [4]:
# Check the data type of each column (the listing also shows the column names)
data.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [5]:
# Count the number of observations per species to check the class balance
data['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [6]:
# Separate the target labels (y) from the feature columns (X)
y = data['Species']
X = data.drop(columns=['Species'])

In [13]:
# Split the data for training and testing at a ratio of 80/20.
# random_state pins the shuffle so the split (and therefore the reported
# accuracy) is reproducible on Restart & Run All; stratify=y keeps the class
# proportions of y the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train a logistic regression model.
# max_iter is raised above the library default so the solver has room to
# converge on these unscaled features instead of stopping at the iteration limit.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Run prediction and print accuracy score (fraction of test rows predicted correctly)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

1.0


In [14]:
# Run a test prediction on one hand-made sample.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# that carries the same column names (and order) as X; passing a bare NumPy
# array makes recent scikit-learn versions warn about missing feature names.
sample = pd.DataFrame([[2, 3, 4, 5]], columns=X.columns)
model.predict(sample)

array(['Iris-virginica'], dtype=object)

In [15]:
# Save the model (serialize)
import pickle

# The context manager closes the file handle as soon as the dump finishes, so
# the pickle is fully flushed to disk before any later cell reads it back.
with open("iris_model_jan_2020_v1.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [16]:
# Reload the model (deserialize).
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# pickle files that come from a trusted source.
# The context manager closes the file handle once the model has been read.
with open("iris_model_jan_2020_v1.pkl", "rb") as model_file:
    model_pk = pickle.load(model_file)

# Rerun predictions with the reloaded model.
# The sample is wrapped in a DataFrame with X's column names so the input
# matches the feature names the model was fitted with.
sample = pd.DataFrame([[2, 3, 4, 5]], columns=X.columns)
model_pk.predict(sample)

array(['Iris-virginica'], dtype=object)