# Train sklearn LinearRegression model on Boston dataset and predict

## Setup Environment

In [None]:
# Upgrade pip first, then install the packages this notebook imports.
!pip install -q -U pip
# scikit-learn is pinned to 0.24.1.
# NOTE(review): load_boston (used below) was reportedly deprecated in 1.0 and
# removed in 1.2 -- confirm against the scikit-learn release notes before bumping the pin.
!pip install -q scikit-learn==0.24.1
# joblib is used to save and reload the trained model.
!pip install -q joblib

In [None]:
import logging
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Configure the root logger at INFO so the logging.info calls in later cells are emitted.
logging.basicConfig(level=logging.INFO)

## Prepare data
We load the Boston housing dataset from sklearn and split it into a training set (75%) and a test set (25%).

In [None]:
# Load the Boston housing data and hold out 25% of the rows for testing.
data = load_boston()

X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# One frame per split: the named feature columns plus the label in a 'target' column.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

## Train

In [None]:
def generate_model(filename):
    """Fit a linear regression, log its test-set error, and save it with joblib.

    Reads the notebook-level ``trainX`` and ``testX`` DataFrames, fits a
    ``LinearRegression`` on the 13 feature columns against ``'target'``,
    logs the absolute error at the 10th, 50th and 90th percentiles on the
    test frame, and writes the fitted model to ``{filename}.pkl``.

    Args:
        filename: Output path without the ``.pkl`` extension.

    Returns:
        None.
    """
    logging.info('preparing train and test datasets')
    feature_columns = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ]
    train_features, train_target = trainX[feature_columns], trainX['target']
    test_features, test_target = testX[feature_columns], testX['target']

    # Fit
    logging.info('training model')
    regressor = LinearRegression().fit(train_features, train_target)

    # Absolute error on the held-out rows, summarised at a few percentiles
    logging.info('evaluating model')
    predicted = regressor.predict(test_features)
    abs_err = np.abs(predicted - test_target)

    for q in (10, 50, 90):
        logging.info(f'AE-at-{q}th-percentile: {np.percentile(a=abs_err, q=q)}')

    # Persist
    logging.info(f'saving model binary: {filename}.pkl')
    joblib.dump(regressor, f'{filename}.pkl')

In [None]:
# Fit the regressor and write it to linear_regressor.pkl
generate_model(filename='linear_regressor')

## Predict

In [None]:
def model_predict(filename, data):
    """Load a saved model and predict for a single sample.

    Args:
        filename: Path to the joblib-serialized model file.
        data: Feature values for one sample (for example a DataFrame row);
            it is wrapped in a list so the model receives a one-row batch.

    Returns:
        Whatever ``model.predict`` returns for the one-row input. The same
        value is also logged at INFO level.
    """
    # LOAD MODEL
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only pass paths to model files from a trusted source.
    model = joblib.load(filename)
    # PREDICT
    predictions = model.predict([data])
    logging.info(f'predictions: {predictions}')
    # Hand the result back so callers can use it instead of only seeing the log line.
    return predictions

In [None]:
# Select the feature columns of the test frame and score its first row with the saved model
features = 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT'
X_test = testX.loc[:, features.split()]

model_predict(filename='linear_regressor.pkl', data=X_test.iloc[0])