<a href="https://colab.research.google.com/github/spinosaphb/ufc-machine-learning/blob/main/activities/A2/02_Ativ_iris_B_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Study with the iris dataset 

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Raw GitHub URL of the Iris CSV file.
path = 'https://raw.githubusercontent.com/spinosaphb/ufc-machine-learning/main/datasets/Iris.csv'
# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


## Number of samples

In [3]:
# Class label of every row, plus the distinct labels found in that column.
samples = df['Species']
species = samples.unique()
# Print one "<label>: <row count>" line per distinct label.
for sp in species:
    print(f'{sp}: {samples.eq(sp).sum()}')

Iris-setosa: 50
Iris-versicolor: 50
Iris-virginica: 50


## Correlation between columns

In [4]:
# Pairwise correlation between the numeric measurement columns.
# 'Species' holds text labels, so it is dropped together with 'Id':
# pandas >= 2.0 no longer skips non-numeric columns in corr() and raises
# a ValueError when it meets a string column.
df.drop(['Id', 'Species'], axis=1).corr()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
SepalLengthCm,1.0,-0.109369,0.871754,0.817954
SepalWidthCm,-0.109369,1.0,-0.420516,-0.356544
PetalLengthCm,0.871754,-0.420516,1.0,0.962757
PetalWidthCm,0.817954,-0.356544,0.962757,1.0


In [5]:
# Engineered features: each "...Reason" column is the ratio of two measurements.
reasonPetal = df['PetalLengthCm'] / df['PetalWidthCm']    # petal length / petal width
psLength = df['SepalLengthCm'] / df['PetalLengthCm']      # sepal length / petal length
petalSepalLW = df['PetalWidthCm'] / df['SepalLengthCm']   # petal width / sepal length

# Place the new columns at positions 5, 6 and 7 of the frame.
# DataFrame.insert raises ValueError when the column already exists, so when
# this cell is re-run the existing column is overwritten in place instead.
ratio_columns = [
    ('PetalReason', reasonPetal),
    ('LengthReason', psLength),
    ('PetalSepalReason', petalSepalLW),
]
for position, (name, values) in enumerate(ratio_columns, start=5):
    if name in df.columns:
        df[name] = values
    else:
        df.insert(position, name, values)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,PetalReason,LengthReason,PetalSepalReason,Species
0,1,5.1,3.5,1.4,0.2,7.0,3.642857,0.039216,Iris-setosa
1,2,4.9,3.0,1.4,0.2,7.0,3.5,0.040816,Iris-setosa
2,3,4.7,3.2,1.3,0.2,6.5,3.615385,0.042553,Iris-setosa
3,4,4.6,3.1,1.5,0.2,7.5,3.066667,0.043478,Iris-setosa
4,5,5.0,3.6,1.4,0.2,7.0,3.571429,0.04,Iris-setosa


## Randomly separating into training and testing sets

In [6]:
# Draw a random 80% of the rows for training; the fixed seed makes the draw repeatable.
training_data = df.sample(frac=0.8, random_state=42)
# Every row whose index label was not drawn becomes part of the test set.
testing_data = df.drop(index=training_data.index)

## Getting X_train, y_train, X_test, y_test

In [7]:
# Feature matrices: every column except the row id and the target label.
X_train, X_test = (
    subset.drop(columns=['Id', 'Species'])
    for subset in (training_data, testing_data)
)
# Target vectors: the species label of each training / testing row.
y_train, y_test = training_data['Species'], testing_data['Species']



## Boolean array of species

In [8]:
# Tuple of three boolean Series, one per entry species[0..2]; each marks the
# training rows whose label equals that species.
bool_species = tuple(y_train == species[k] for k in range(3))

## Calculating the mean of each column of the dataframe by species

In [9]:
# Row k holds, for class k, the mean of every feature column computed over the
# training rows selected by bool_species[k].
MeanFeaturesSpecies = [
    [np.mean(X_train.loc[bool_species[k], feature]) for feature in X_train.columns]
    for k in range(3)
]
# Same values as a 2-D array: one row per class, one column per feature.
featuresMeans = np.array(MeanFeaturesSpecies)
featuresMeans

array([[4.98604651, 3.43488372, 1.46744186, 0.24651163, 7.01899225,
        3.43815358, 0.04912186],
       [5.92051282, 2.77179487, 4.26410256, 1.33846154, 3.22241234,
        1.39716977, 0.22605565],
       [6.60789474, 2.97631579, 5.55      , 2.04736842, 2.74903655,
        1.19363589, 0.31109717]])

Calculating the absolute difference between `X_test` and `featuresMeans` for each species

In [10]:
# Test features as a plain ndarray so each row of class means broadcasts over it.
np_X_test = np.array(X_test)
# speciesDiff[k][r, c] = |test value at (r, c) - training mean of column c for class k|
speciesDiff = [np.abs(np_X_test - featuresMeans[k]) for k in range(3)]
speciesDiff

[array([[0.08604651, 0.43488372, 0.06744186, 0.04651163, 0.01899225,
         0.06184642, 0.00830553],
        [0.81395349, 0.56511628, 0.26744186, 0.04651163, 1.01899225,
         1.39517975, 0.0146391 ],
        [0.41395349, 0.03488372, 0.23255814, 0.04651163, 1.48100775,
         0.261683  , 0.01208482],
        [0.11395349, 0.26511628, 0.03255814, 0.15348837, 3.26899225,
         0.03815358, 0.02930952],
        [0.08604651, 0.33488372, 0.03255814, 0.14651163, 7.98100775,
         0.17148692, 0.02871369],
        [0.48604651, 1.13488372, 0.16744186, 0.05348837, 2.68565891,
         0.02338488, 0.01754481],
        [0.31395349, 0.26511628, 0.03255814, 0.04651163, 0.48100775,
         0.09517975, 0.01138601],
        [1.91395349, 0.33488372, 3.43255814, 1.25348837, 3.75232558,
         2.02999032, 0.16826945],
        [0.08604651, 1.03488372, 1.83255814, 0.75348837, 3.71899225,
         1.9533051 , 0.15495978],
        [1.61395349, 0.53488372, 3.13255814, 1.05348837, 3.48053071,
    

In [11]:
# Per-feature vote: for every test row and every feature column, pick the class
# whose training mean is closest to the observed value.
# Stack the per-class distance matrices along a new leading "class" axis.
stacked_diff = np.stack(speciesDiff)
# np.argmin returns the first minimum along the class axis, so ties resolve to
# the earliest entry in `species`.
nearest_class = np.argmin(stacked_diff, axis=0)
# Translate class positions into labels; keep DataPred as a list of row lists.
DataPred = np.asarray(species)[nearest_class].tolist()
df_DataPred = pd.DataFrame(DataPred)
df_DataPred

Unnamed: 0,0,1,2,3,4,5,6
0,Iris-setosa,Iris-virginica,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa
1,Iris-versicolor,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa
2,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa
3,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-versicolor,Iris-setosa,Iris-setosa
4,Iris-setosa,Iris-virginica,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa
5,Iris-setosa,Iris-versicolor,Iris-setosa,Iris-setosa,Iris-versicolor,Iris-setosa,Iris-setosa
6,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa
7,Iris-virginica,Iris-virginica,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor
8,Iris-setosa,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor
9,Iris-virginica,Iris-virginica,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor


## Extracting prediction

In [13]:
# Final prediction per test row: the label that received the most per-feature votes.
# Votes are counted with np.unique rather than scipy.stats.mode, because recent
# SciPy releases reject non-numeric (string) input and changed the shape of the
# returned mode, which breaks the former `stats.mode(row)[0][0]` lookup.
np_Data_Pred = np.array(df_DataPred)
y_pred = []
for i in range(np_Data_Pred.shape[0]):
    labels, votes = np.unique(np_Data_Pred[i], return_counts=True)
    # np.unique returns the labels sorted and np.argmax keeps the first maximum,
    # so a tie goes to the alphabetically smallest label.
    mode = labels[np.argmax(votes)]
    y_pred.append(mode)
y_pred

['Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica']

In [14]:
# Display the true labels of the test rows, for visual comparison with y_pred.
y_test

1          Iris-setosa
14         Iris-setosa
20         Iris-setosa
21         Iris-setosa
37         Iris-setosa
41         Iris-setosa
48         Iris-setosa
52     Iris-versicolor
57     Iris-versicolor
58     Iris-versicolor
71     Iris-versicolor
74     Iris-versicolor
87     Iris-versicolor
88     Iris-versicolor
90     Iris-versicolor
91     Iris-versicolor
92     Iris-versicolor
99     Iris-versicolor
102     Iris-virginica
103     Iris-virginica
106     Iris-virginica
107     Iris-virginica
116     Iris-virginica
121     Iris-virginica
124     Iris-virginica
129     Iris-virginica
130     Iris-virginica
140     Iris-virginica
144     Iris-virginica
149     Iris-virginica
Name: Species, dtype: object

## Showing percentage of correct predictions

In [15]:
# Boolean mask: True wherever the predicted label equals the true label.
hits = np.asarray(y_test) == np.asarray(y_pred)
# Number of test rows.
elems = len(y_test)
# Share of correct predictions, as a percentage rounded to two decimals.
accuracy_pct = round(hits.sum() / elems * 100, 2)
print(f'accuracy of: {accuracy_pct}%')

accuracy of: 100.0%


In [16]:
# Display the element-wise comparison of y_test against y_pred (True = correct prediction).
hits

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])