## Load & prepare dataset

In [1]:
from sklearn import datasets
iris = datasets.load_iris()

In [2]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
print(iris['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
inputs = iris['data']
targets = iris['target']
target_names = iris['target_names']
feature_names = iris['feature_names']

In [5]:
import pandas as pd

X = pd.DataFrame(inputs, columns = feature_names)
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
y = pd.DataFrame(targets, columns = ['Class'])
y.head()

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Data scaling & model implementation

In [10]:
from sklearn.preprocessing import StandardScaler

# Data scaler
sc = StandardScaler()

sc.fit(X_train)

X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [13]:
from sklearn.svm import SVC

# Hyperparameters of SVC
C = 1.0
kernel = 'rbf'
degree = 3
class_weight = None
decision_function_shape = 'ovr'
probability = True

# Support Vector Machine Classifier
svc = SVC(C = C, kernel = kernel, degree = degree, class_weight = class_weight, decision_function_shape = decision_function_shape, probability=probability)

In [16]:
import numpy as np
# First initial training
svc.fit(X_train_scaled, np.ravel(y_train))

## Model performance on training dataset

In [17]:
from sklearn.metrics import classification_report

y_pred_train = svc.predict(X_train_scaled)

print('Train')
print(classification_report(y_train, y_pred_train, digits = 2, target_names=target_names))

Train
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        40
  versicolor       0.97      0.97      0.97        39
   virginica       0.98      0.98      0.98        41

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120



## Hyperparameters tuning

In [18]:
X_train_scaled_v2, X_val_scaled, y_train_v2, y_val = train_test_split(X_train_scaled, y_train, test_size=0.3)

In [19]:
# Define ranges of hyperparameters
Cs = [0.1, 0.5, 1.0, 2.0, 5.0]
kernels = ['rbf', 'linear', 'poly', 'sigmoid']
degrees = [2, 3, 4, 5]
class_weights = [None, 'balanced']
decision_function_shapes = ['ovr', 'ovo']

# Train & validate model
# Input: hyperparameters + current best score achieved | ouput: model if balanced_accuracy is better else None
def train_validate_model(C, kernel, degree, class_weight, decision_function_shape, best_score):

    from sklearn.metrics import balanced_accuracy_score
    
    svc = SVC(C = C, kernel = kernel, degree = degree, class_weight = class_weight, decision_function_shape = decision_function_shape, probability=probability)
    svc.fit(X_train_scaled_v2, y_train_v2)

    y_pred = svc.predict(X_val_scaled)
    svc_score = balanced_accuracy_score(y_val, y_pred)
    params = {
        'C': C,
        'kernel': kernel, 
        'degree': degree, 
        'class_weight': class_weight,
        'decision_function_shape': decision_function_shape
    }
    
    return svc, svc_score, params

# Hypertune model
# Input: ranges of hyperparameters | output: best_model, best_score
# Remark: model is considered better if its accuracy on validation dataset is higher. If equal check the train scores.
def hypertune(Cs, kernels, degrees, class_weights, decision_function_shapes):
    
    from sklearn.base import clone
    from sklearn.metrics import balanced_accuracy_score

    best_svc = None
    best_score = 0
    best_params = None

    for C in Cs:
        for kernel in kernels:
            for degree in degrees:
                for class_weight in class_weights:
                    for decision_function_shape in decision_function_shapes:
                        
                        svc, svc_score, params = train_validate_model(C = C, kernel = kernel, degree = degree, class_weight = class_weight, decision_function_shape = decision_function_shape, best_score = best_score)
                        
                        if svc_score > best_score:
                            best_svc = svc
                            best_score = svc_score
                            best_params = params

                        if svc_score == best_score:

                            current_train_score = balanced_accuracy_score(y_train_v2, svc.predict(X_train_scaled_v2))
                            best_train_score = balanced_accuracy_score(y_train_v2, best_svc.predict(X_train_scaled_v2))

                            if current_train_score > best_train_score:

                                best_svc = svc
                                best_params = params


    return best_svc, best_score, best_params

best_svc, best_score, best_params = hypertune(Cs, kernels, degrees, class_weights, decision_function_shapes)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [20]:
best_svc, best_score, best_params

(SVC(class_weight='balanced', degree=2, kernel='linear', probability=True),
 1.0,
 {'C': 1.0,
  'kernel': 'linear',
  'degree': 2,
  'class_weight': 'balanced',
  'decision_function_shape': 'ovr'})

In [22]:
y_pred_train = best_svc.predict(X_train_scaled_v2)
y_pred_val = best_svc.predict(X_val_scaled)
y_pred_test = best_svc.predict(X_test_scaled)

print('Train')
print(classification_report(y_train_v2, y_pred_train, digits = 2, target_names=target_names))

print('Val')
print(classification_report(y_val, y_pred_val, digits = 2, target_names=target_names))

print('Test')
print(classification_report(y_test, y_pred_test, digits = 2, target_names=target_names))

Train
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        29
  versicolor       1.00      0.96      0.98        23
   virginica       0.97      1.00      0.98        32

    accuracy                           0.99        84
   macro avg       0.99      0.99      0.99        84
weighted avg       0.99      0.99      0.99        84

Val
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        11
  versicolor       1.00      1.00      1.00        16
   virginica       1.00      1.00      1.00         9

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

Test
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.91      0.91      0.91        11
   virginica       0.89      0.89      0.89         9

    a

In [23]:
from joblib import dump, load
dump(best_svc, 'svc.joblib')
dump(sc, 'scaler.joblib')

['scaler.joblib']