## Load & prepare dataset

In [9]:
from sklearn import datasets
boston = datasets.load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [10]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])

In [19]:
boston['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [17]:
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [21]:
inputs = boston['data']
targets = boston['target']
target_names = ['MEDV']
feature_names = boston['feature_names']

In [22]:
import pandas as pd

X = pd.DataFrame(inputs, columns = feature_names)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [23]:
y = pd.DataFrame(targets, columns = ['Class'])
y.head()

Unnamed: 0,Class
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Data scaling & model implementation

In [25]:
from sklearn.preprocessing import StandardScaler

# Data scaler
sc = StandardScaler()

sc.fit(X_train)

X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [26]:
from sklearn.svm import SVC

# Hyperparameters of SVC
C = 1.0
kernel = 'rbf'
degree = 3
class_weight = None
decision_function_shape = 'ovr'
probability = True

# Support Vector Machine Classifier
svc = SVC(C = C, kernel = kernel, degree = degree, class_weight = class_weight, decision_function_shape = decision_function_shape, probability=probability)

In [27]:
import numpy as np
# First initial training
svc.fit(X_train_scaled, np.ravel(y_train))

ValueError: Unknown label type: 'continuous'

## Model performance on training dataset

In [17]:
from sklearn.metrics import classification_report

y_pred_train = svc.predict(X_train_scaled)

print('Train')
print(classification_report(y_train, y_pred_train, digits = 2, target_names=target_names))

Train
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        40
  versicolor       0.97      0.97      0.97        39
   virginica       0.98      0.98      0.98        41

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120



## Hyperparameters tuning

In [18]:
X_train_scaled_v2, X_val_scaled, y_train_v2, y_val = train_test_split(X_train_scaled, y_train, test_size=0.3)

In [19]:
# Define ranges of hyperparameters
Cs = [0.1, 0.5, 1.0, 2.0, 5.0]
kernels = ['rbf', 'linear', 'poly', 'sigmoid']
degrees = [2, 3, 4, 5]
class_weights = [None, 'balanced']
decision_function_shapes = ['ovr', 'ovo']

# Train & validate model
# Input: hyperparameters + current best score achieved | ouput: model if balanced_accuracy is better else None
def train_validate_model(C, kernel, degree, class_weight, decision_function_shape, best_score):

    from sklearn.metrics import balanced_accuracy_score
    
    svc = SVC(C = C, kernel = kernel, degree = degree, class_weight = class_weight, decision_function_shape = decision_function_shape, probability=probability)
    svc.fit(X_train_scaled_v2, y_train_v2)

    y_pred = svc.predict(X_val_scaled)
    svc_score = balanced_accuracy_score(y_val, y_pred)
    params = {
        'C': C,
        'kernel': kernel, 
        'degree': degree, 
        'class_weight': class_weight,
        'decision_function_shape': decision_function_shape
    }
    
    return svc, svc_score, params

# Hypertune model
# Input: ranges of hyperparameters | output: best_model, best_score
# Remark: model is considered better if its accuracy on validation dataset is higher. If equal check the train scores.
def hypertune(Cs, kernels, degrees, class_weights, decision_function_shapes):
    
    from sklearn.base import clone
    from sklearn.metrics import balanced_accuracy_score

    best_svc = None
    best_score = 0
    best_params = None

    for C in Cs:
        for kernel in kernels:
            for degree in degrees:
                for class_weight in class_weights:
                    for decision_function_shape in decision_function_shapes:
                        
                        svc, svc_score, params = train_validate_model(C = C, kernel = kernel, degree = degree, class_weight = class_weight, decision_function_shape = decision_function_shape, best_score = best_score)
                        
                        if svc_score > best_score:
                            best_svc = svc
                            best_score = svc_score
                            best_params = params

                        if svc_score == best_score:

                            current_train_score = balanced_accuracy_score(y_train_v2, svc.predict(X_train_scaled_v2))
                            best_train_score = balanced_accuracy_score(y_train_v2, best_svc.predict(X_train_scaled_v2))

                            if current_train_score > best_train_score:

                                best_svc = svc
                                best_params = params


    return best_svc, best_score, best_params

best_svc, best_score, best_params = hypertune(Cs, kernels, degrees, class_weights, decision_function_shapes)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [20]:
best_svc, best_score, best_params

(SVC(class_weight='balanced', degree=2, kernel='linear', probability=True),
 1.0,
 {'C': 1.0,
  'kernel': 'linear',
  'degree': 2,
  'class_weight': 'balanced',
  'decision_function_shape': 'ovr'})

In [22]:
y_pred_train = best_svc.predict(X_train_scaled_v2)
y_pred_val = best_svc.predict(X_val_scaled)
y_pred_test = best_svc.predict(X_test_scaled)

print('Train')
print(classification_report(y_train_v2, y_pred_train, digits = 2, target_names=target_names))

print('Val')
print(classification_report(y_val, y_pred_val, digits = 2, target_names=target_names))

print('Test')
print(classification_report(y_test, y_pred_test, digits = 2, target_names=target_names))

Train
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        29
  versicolor       1.00      0.96      0.98        23
   virginica       0.97      1.00      0.98        32

    accuracy                           0.99        84
   macro avg       0.99      0.99      0.99        84
weighted avg       0.99      0.99      0.99        84

Val
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        11
  versicolor       1.00      1.00      1.00        16
   virginica       1.00      1.00      1.00         9

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

Test
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.91      0.91      0.91        11
   virginica       0.89      0.89      0.89         9

    a

In [23]:
from joblib import dump, load
dump(best_svc, 'svc.joblib')
dump(sc, 'scaler.joblib')

['scaler.joblib']