<a href="https://colab.research.google.com/github/sean-condie/from_colaboratory/blob/main/Iris_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A predictive model of flower type using the classic "iris" dataset (adapted from Google lessons).

In [None]:
!pip install -q sklearn #not included by default

In [None]:
%tensorflow_version 2.x

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf


In [None]:
#load the Titanic passenger data: one CSV for training, one for evaluation
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')

#split the label ('survived') off each frame; pop() removes the column from the frame in place
y_train = dftrain.pop('survived')
y_eval = dfeval.pop('survived')

#columns treated as categories vs. columns treated as plain numbers
CATAGORICAL_DATA = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
NUMERICAL_DATA = ['age', 'fare']

#categorical columns: the vocabulary is the set of unique values found in the training frame
categorical_columns = [
  tf.feature_column.categorical_column_with_vocabulary_list(name, dftrain[name].unique())
  for name in CATAGORICAL_DATA
]
#numeric columns are passed through as float32
numeric_columns = [
  tf.feature_column.numeric_column(name, dtype=tf.float32)
  for name in NUMERICAL_DATA
]
#keep the original ordering: categorical columns first, numeric columns after
feature_columns = categorical_columns + numeric_columns

print(feature_columns)

[VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, def

Batches: A subset of the entire dataset

Epochs: Number of times the model sees the entire dataset during training (so each batch is seen once per epoch)

Input Function: Meant to set up the data as a tf.data.Dataset object, the main object we will work with.

*recall that "label" is the value that we are predicting/training for


In [None]:
#what do we get when a DataFrame is passed to dict()?
df_test = dict(dftrain)
#the dict is keyed by column name; each value is that column, which can then be indexed by row label
sex_column = df_test['sex']
print(sex_column[50]) #the 'sex' value stored under row label 50, as an example

male


In [None]:
#factory for the input functions the estimator consumes
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  """Return a zero-argument function that builds the tf.data.Dataset for an estimator.

  Args:
    data_df: feature table; it is converted with dict() before being sliced.
    label_df: the labels paired with data_df.
    num_epochs: how many times the batched dataset is repeated.
    shuffle: when true, the examples are shuffled (buffer of 1000) before batching.
    batch_size: number of examples per batch.

  Returns:
    A callable taking no arguments that yields the (features, label) dataset.
  """
  def input_function(): #the closure handed back to the caller
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    #shuffling is optional so evaluation can keep a fixed order
    dataset = dataset.shuffle(1000) if shuffle else dataset
    #batch first, then repeat the batched stream num_epochs times
    return dataset.batch(batch_size).repeat(num_epochs)
  return input_function

#training feed: factory defaults (10 epochs, shuffled, batches of 32)
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
#evaluation feed: a single, unshuffled pass over the evaluation data
eval_input_fn = make_input_fn(data_df=dfeval, label_df=y_eval, shuffle=False, num_epochs=1)

We are now ready to make the model using a linear model (`LinearClassifier`, i.e. logistic regression rather than plain linear regression)

In [None]:
#build a linear classifier over the feature columns defined earlier
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

#fit on the training feed, then score on the evaluation feed
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)

#wipe the training log from the cell output so only the accuracy remains visible
clear_output()
print(result['accuracy'])

0.7386364


We can look at the result which is a dict of result values with key names:

In [None]:
#show every metric in the dict returned by linear_est.evaluate(), not just the accuracy
print(result)

{'accuracy': 0.7386364, 'accuracy_baseline': 0.625, 'auc': 0.8345577, 'auc_precision_recall': 0.78871053, 'average_loss': 0.47621182, 'label/mean': 0.375, 'loss': 0.4686921, 'precision': 0.64705884, 'prediction/mean': 0.3962939, 'recall': 0.6666667, 'global_step': 200}



END OF LINEAR REGRESSION

---

START OF CLASSIFICATION



In [None]:
#download (and locally cache) the iris training and test CSV files
train_path = tf.keras.utils.get_file("iris_training.csv", "https://storage.googleapis.com/download.tensorflow.org/data/iris_training.csv")
test_path = tf.keras.utils.get_file("iris_test.csv", "https://storage.googleapis.com/download.tensorflow.org/data/iris_test.csv")

#peek at the raw file as pandas parses it by default
#NOTE: head must be *called*; printing `train.head` without parentheses shows the bound method's repr instead of the first rows
print(pd.read_csv(train_path).head())

#the first line of the file is metadata rather than usable column names, so we take the names from the documentation
COLUMN_NAMES = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']
SPECIES = ['Setosa', 'Versicolor', 'Virginica']

#load the data for real: header=0 discards the file's first line and names= applies our own column names
train = pd.read_csv(train_path, names=COLUMN_NAMES, header=0)
test = pd.read_csv(test_path, names=COLUMN_NAMES, header=0)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/iris_training.csv
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/iris_test.csv
<bound method NDFrame.head of      120    4  setosa  versicolor  virginica
0    6.4  2.8     5.6         2.2          2
1    5.0  2.3     3.3         1.0          1
2    4.9  2.5     4.5         1.7          2
3    4.9  3.1     1.5         0.1          0
4    5.7  3.8     1.7         0.3          0
..   ...  ...     ...         ...        ...
115  5.5  2.6     4.4         1.2          1
116  5.7  3.0     4.2         1.2          1
117  4.4  2.9     1.4         0.2          0
118  4.8  3.0     1.4         0.1          0
119  5.5  2.4     3.7         1.0          1

[120 rows x 5 columns]>


We will be using species as the label.

In [None]:
#separate the label column from the features; pop() removes it from each frame in place
LABEL_COLUMN = 'Species'
train_y = train.pop(LABEL_COLUMN)
test_y = test.pop(LABEL_COLUMN)


Create the input function to generate a Dataset object.

In [None]:
#input function shared by training and evaluation
def input_fn(features,  labels, training=True, batch_size=256):
  """Build a batched tf.data.Dataset of (features, labels) pairs.

  Args:
    features: feature table; it is converted with dict() before being sliced.
    labels: the labels paired with features.
    training: when true, the data is shuffled (buffer of 1000) and repeated indefinitely.
    batch_size: number of examples per batch.

  Returns:
    The batched dataset.
  """
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

  #evaluation: a single ordered pass
  if not training:
    return dataset.batch(batch_size)

  #training: shuffle and repeat without end; the caller bounds training with `steps`
  return dataset.shuffle(1000).repeat().batch(batch_size)

Set up the feature columns

In [None]:
#every remaining column of the training frame becomes a numeric feature column
feature_columns = [
  tf.feature_column.numeric_column(key=column_name)
  for column_name in train.keys()
]

We can now build the classifier model using a Deep Neural Network type algorithm.

In [None]:
#build a deep neural network classifier
classifier = tf.estimator.DNNClassifier(
  feature_columns=feature_columns,
  hidden_units=[30, 10], #two hidden layers with 30 and 10 nodes, respectively
  n_classes=3, #the output layer chooses between three classes
)

#train for a fixed number of steps; the training dataset itself repeats indefinitely
classifier.train(input_fn=lambda: input_fn(train, train_y, training=True), steps=5000)

#score on the test set with a single, unshuffled pass
eval_results = classifier.evaluate(input_fn=lambda: input_fn(test, test_y, training=False))

#drop the training log from the cell output and show only the metrics
clear_output()
print(eval_results)

{'accuracy': 0.56666666, 'average_loss': 0.60299075, 'loss': 0.60299075, 'global_step': 5000}


Now that the model is trained, we can make predictions based on the features. We will need a new input function because the labels are not provided this time; they are what is being predicted.

In [None]:
#prediction input function
#it has its own name so that it does not replace the training/evaluation input_fn, which takes labels
def predict_input_fn(features, batch_size=256):
  """Build a batched tf.data.Dataset that contains features only (no labels).

  Args:
    features: mapping of feature name to a list of values; converted with dict() before slicing.
    batch_size: number of examples per batch.

  Returns:
    The batched, unlabeled dataset.
  """
  return tf.data.Dataset.from_tensor_slices(dict(features)).batch(batch_size)


def _prompt_for_measurement(name):
  """Prompt with `name` until the user enters a finite, non-negative number; return it as a float."""
  while True:
    raw = input(name + ": ")
    try:
      value = float(raw) #accepts decimals such as "5.1", which str.isdigit() would reject
    except ValueError:
      continue #not a number at all: ask again
    #NaN fails every comparison and infinity fails the upper bound, so only finite values >= 0 pass
    if 0 <= value < float('inf'):
      return value


#list of the features used to predict the species
features = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
#empty dictionary for the predictions
predict = {}

#prompt the user for values
print("enter the values as prompted.")

#collect the value for each feature; each value is wrapped in a list so the dict describes a single example
for feature in features:
  predict[feature] = [_prompt_for_measurement(feature)]

#predict the species based on our entered values
predictions = classifier.predict(lambda: predict_input_fn(predict))

#look at the results
for result in predictions:
  class_id = result['class_ids'][0] #class_ids holds the predicted class; it is an array, so it must be indexed even for a single prediction
  probability = result['probabilities'][class_id] #probabilities has one entry per possible class; pick the entry of the predicted class

  #print the predicted value
  print('prediction is "{}" ({:.1f}%)'.format(SPECIES[class_id], 100 * probability))

  #print the probabilities of the other species
  for spec in result['all_class_ids']:
    if spec != class_id: #exclude the already reported species
      probability = result['probabilities'][spec]
      print('the probability of it being "{}" is {:.1f}%'.format(SPECIES[spec], 100 * probability))
  

enter the values as prompted.
SepalLength: 10
SepalWidth: 15
PetalLength: 3
PetalWidth: 5
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmprn03d0ae/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
prediction is "Setosa" (99.7%)
the probability of it being "Versicolor" is 0.1%
the probability of it being "Virginica" is 0.2%


END OF CLASSIFICATION
___________________________________