In [1]:
import sklearn
sklearn.__version__

'0.23.2'

# Loading the Dataset
We're now going to load the iris dataset, a small dataset (150 samples x 4 features)
in which each iris flower is labeled as one of three different types of irises
(Setosa, Versicolour, and Virginica).

In [2]:
from sklearn.datasets import load_iris

In [4]:
# load_iris() returns a dictionary-like Bunch; its entries are also exposed as attributes.
iris_dataset = load_iris()
X = iris_dataset.data      # feature matrix
Y = iris_dataset.target    # class label for each sample

In [5]:
X.shape

(150, 4)

In [6]:
Y.shape

(150,)

# Train Test Split
After loading the data, we first split the dataset into a training set and a testing set.
The model is fitted on the training set only, and we evaluate its performance on the test set (data that the model never sees during training).

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [10]:
X_train.shape

(120, 4)

In [11]:
X_test.shape

(30, 4)

In [12]:
Y_train.shape

(120,)

In [13]:
Y_test.shape

(30,)

# Preprocessing
We need to make sure that our dataset can be used for our machine learning models.

To do so, we're going to make sure that all features have a mean of 0, and a standard deviation of 1.

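This is called standardization. The scaler is fitted on the training set only, and the same transformation is then applied to the test set, so that no information from the test set leaks into the preprocessing step.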

In [14]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [15]:
ss.fit(X_train)

StandardScaler()

In [16]:
# Apply the already-fitted scaler to both splits (training split first, then test split).
ss_X_train, ss_X_test = ss.transform(X_train), ss.transform(X_test)

In [17]:
ss_X_train.mean(axis=0)

array([ 1.71344420e-15, -1.66579713e-15, -2.23894977e-16, -5.73615229e-17])

In [18]:
ss_X_train.std(axis=0)

array([1., 1., 1., 1.])

# Modeling
The models we will be using for classification are:
- RandomForestClassifier
- LinearDiscriminantAnalysis
- DecisionTreeClassifier
- LogisticRegression
- KNeighborsClassifier

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [26]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
def evaluate_model(model,title):
    """Fit ``model`` on the standardized training data and print its test-set report.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    title
        Label interpolated into the heading printed above the report.

    Notes
    -----
    Reads the notebook-level variables ``ss_X_train``, ``Y_train``,
    ``ss_X_test`` and ``Y_test``. Nothing is returned; the heading and the
    classification report are written to stdout.
    """
    model.fit(ss_X_train, Y_train)
    test_predictions = model.predict(ss_X_test)

    print(f"Classification report for {title}")
    report = classification_report(Y_test, test_predictions)
    print(report)

In [27]:
# (estimator, display title) pairs to compare.
# The random forest and the decision tree are stochastic, so they are seeded to
# make the reports reproducible on a fresh kernel; the seed value is the same
# one passed to train_test_split.
all_models = [
    (RandomForestClassifier(random_state=42),"random forest"),
    (LinearDiscriminantAnalysis(),"linear discriminant analysis"),
    (DecisionTreeClassifier(random_state=42), "decision tree classifier"),
    (LogisticRegression(), "logistic regression"),
    (KNeighborsClassifier(), "k neighbors classifier")
]

# Fit every model on the training data and print its test-set classification report.
for model, title in all_models:
    evaluate_model(model,title)

Classification report for random forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Classification report for linear discriminant analysis
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Classification report for decision tree classifier
              precision    recall  f1-score   support

           0       1.00      1.00    