In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
%matplotlib inline

# **Fetching Data**
---

In [2]:
def fetchData(linkToFile):
  """Load the CSV found at `linkToFile` (path, URL or file-like object) via pandas.

  Returns whatever `pd.read_csv` produces for that source.
  """
  loaded_frame = pd.read_csv(linkToFile)
  return loaded_frame

In [3]:
# Iris CSV, pinned to one specific revision of the gist so the data cannot change underneath us.
IRIS_CSV_URL = "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv"
dataset = fetchData(IRIS_CSV_URL)

# **A very basic classification, version-1.1.1**
---

In [4]:
# Keep the header names in a plain list so later cells can index and slice them.
column_names = dataset.columns.tolist()
print(column_names)

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']


In [5]:
def model_trainer(train_set, class_gen):
  '''
  Learn, for every class, the smallest and largest feature value seen in the training data.

  Input:
  train_set: a numpy array (or any iterable of rows) where row[0] is the feature value and row[1] is the class name
  class_gen: a dictionary which initially maps every class name to {"min": ..., "max": ...} sentinel values
             (the starting "min" should be above, and the starting "max" below, any real feature value)

  Output:
  class_gen: the same dictionary object, updated in place, that now contains the limits for each class

  Process:
  a loop iterates through the given rows. For each entry, it looks up the entry's class in the dictionary
  and widens that class's min/max range when the feature value falls outside it.

  Raises:
  KeyError if a row's class name is not a key of class_gen.
  '''
  for flower in train_set:
    feature_value, class_name = flower[0], flower[1]
    limits = class_gen[class_name]
    # These must be two independent checks, not if/elif: the first sample of a class
    # exceeds the "max" sentinel AND undercuts the "min" sentinel, so it has to be
    # allowed to update both bounds. With elif, that sample is never considered as a minimum.
    if feature_value > limits["max"]:
      limits["max"] = feature_value
    if feature_value < limits["min"]:
      limits["min"] = feature_value

  return class_gen

In [6]:
def predict_class(class_gen, flower_feature):
  """Return the first class whose [min, max] range contains `flower_feature`.

  Classes are tried in the dictionary's iteration order, so when ranges
  overlap the earliest matching class wins. The string "Nothing found" is
  returned when no class range contains the value.
  """
  matching_classes = (
    name
    for name, limits in class_gen.items()
    if limits["min"] <= flower_feature <= limits["max"]
  )
  return next(matching_classes, "Nothing found")

In [7]:
def calculate_accuracy(validation_set, class_gen):
  '''
  Measure how often the range-based classifier predicts the labelled class.

  Input:
  validation_set: a numpy array (or any sized iterable of rows) where row[0] is the feature and row[1] is the class
  class_gen: a dictionary of classification info (per-class "min"/"max" limits)

  Output:
  accuracy: a floating point number (denoting percentage, 0-100).
            An empty validation_set yields 0.0, since there is nothing to score.

  Process:
  A loop iterates through the validation_set, predicts the class from class_gen (using the predict_class() function)
  and matches it with the given class, counting the correct predictions.
  At the end of the loop, the percentage is calculated once and returned.
  '''
  total = len(validation_set)
  # len() rather than truthiness: a numpy array cannot be used directly in a boolean test.
  if total == 0:
    return 0.0

  correct_prediction = 0
  for flower in validation_set:
    # flower[0] is the feature
    # flower[1] is the class
    if flower[1] == predict_class(class_gen, flower[0]):
      correct_prediction += 1

  # Computed once, after counting, instead of on every loop iteration.
  return (correct_prediction / total) * 100

In [8]:
# The class label is the fifth column; collect each distinct label once.
label_column = column_names[4]
class_names = dataset[label_column].unique()

Now, instead of running the same code for every feature, I am going to create a loop that will iterate through all the features and train, validate and test the data.

In [10]:
# Dedicated, seeded generator so the spot check below picks the same sample on every run
# without touching the global `random` state.
spot_check_rng = random.Random(42)

for feature in column_names[:4]:
  print("Working feature: {}".format(feature))

  # Start every class with an "empty" range: +inf / -inf are replaced by any real
  # measurement, so the sentinels work regardless of the feature's scale.
  class_gen = {
    class_name: {"min": float("inf"), "max": float("-inf")}
    for class_name in class_names
  }

  # Two-column array: [:, 0] is the current feature, [:, 1] is the class label.
  numpy_dataset = dataset[[feature, "species"]].to_numpy()
  # First hold out 20% for validation, then carve 20% of the remainder off as the test set.
  train_set, validation_set = train_test_split(numpy_dataset, test_size=0.2, random_state=42)
  train_set, test_set = train_test_split(train_set, test_size=0.2, random_state=42)

  class_gen = model_trainer(train_set, class_gen)
  print(class_gen)

  print("Accuracy on validation set: {}".format(calculate_accuracy(validation_set, class_gen)))
  print("Accuracy on test set: {}".format(calculate_accuracy(test_set, class_gen)))

  # checking a random data
  feature_value = test_set[spot_check_rng.randint(0, len(test_set) - 1)][0]
  # Name the feature actually being used instead of always saying "petal length".
  feature_label = feature.replace("_", " ")
  print("Flower of {} {} is {}".format(feature_label, feature_value, predict_class(class_gen, feature_value)))
  print("\n---------------------------------------------------------------------------------\n")

Working feature: sepal_length
{'setosa': {'min': 4.3, 'max': 5.8}, 'versicolor': {'min': 4.9, 'max': 7.0}, 'virginica': {'min': 4.9, 'max': 7.7}}
Accuracy on validation set: 53.333333333333336
Accuracy on test set: 58.333333333333336
Flower of petal length 5.7 is setosa

---------------------------------------------------------------------------------

Working feature: sepal_width
{'setosa': {'min': 2.9, 'max': 4.4}, 'versicolor': {'min': 2.0, 'max': 3.2}, 'virginica': {'min': 2.2, 'max': 3.8}}
Accuracy on validation set: 53.333333333333336
Accuracy on test set: 58.333333333333336
Flower of petal length 3.4 is setosa

---------------------------------------------------------------------------------

Working feature: petal_length
{'setosa': {'min': 1.1, 'max': 1.9}, 'versicolor': {'min': 3.0, 'max': 5.0}, 'virginica': {'min': 4.5, 'max': 6.7}}
Accuracy on validation set: 93.33333333333333
Accuracy on test set: 87.5
Flower of petal length 1.3 is setosa

----------------------------------