In [1]:
import sklearn
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
scikit_learn_version = sklearn.__version__
scikit_learn_version

'0.20.3'

In [3]:
automobile_train = pd.read_csv('datasets/automobiles_file1.csv')
automobile_train.shape

(82, 52)

In [4]:
automobile_test = pd.read_csv('datasets/automobiles_test.csv')
automobile_test.shape

(41, 52)

In [5]:
x_train = automobile_train.drop('price',axis=1)
y_train = automobile_train['price']

x_test = automobile_test.drop('price',axis=1)
y_test = automobile_test['price']

In [6]:
regressor_model = RandomForestRegressor(n_estimators=5, warm_start=True)

#warm_start will allowus to reuse model params. the model will be reloaded and then we can continue training

rfr_model = regressor_model.fit(x_train, y_train)
rfr_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=True)

In [7]:
training_score = rfr_model.score(x_train, y_train)
training_score

0.959035530209177

In [8]:
y_pred = rfr_model.predict(x_test)

In [9]:
testing_score = r2_score(y_test, y_pred)
testing_score

0.8363315069146953

In [10]:
rfr_model_param = {}
rfr_model_param['model'] = rfr_model
rfr_model_param['sklearn_version'] = scikit_learn_version
rfr_model_param['r2_score'] = testing_score


## you must also serialize any preprocessing on model, as the same preprocessing must also be performed on neew data
# rfr_model_param['preprocessing'] = tfidvect
## assuming we have performed a TFid vectorization of data, and call fit_transform on the loaded lodel -
#x_test_tran_new = joblin_model['preprocessing'].fit_transform(x_test)

In [11]:
rfr_model_param

{'model': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=True),
 'sklearn_version': '0.20.3',
 'r2_score': 0.8363315069146953}

In [12]:
import joblib

In [13]:
filename = 'models/rfr_model_checkpoint.joblib'
joblib.dump(rfr_model_param, filename)

['models/rfr_model_checkpoint.joblib']

In [14]:
joblib_model = joblib.load(filename)

In [15]:
joblib_model['model']

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=True)

In [16]:
#change n_estimators and continue training
joblib_model['model'].n_estimators = 15
joblib_model['model']

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=True)

In [17]:
auto_retrain = pd.read_csv('datasets/automobiles_file2.csv')
auto_retrain.shape

(82, 52)

In [18]:
x_train = auto_retrain.drop('price',axis=1)
y_train = auto_retrain['price']

In [19]:
rfr_retrained_model = joblib_model['model'].fit(x_train,y_train)

In [20]:
retrained_score = rfr_retrained_model.score(x_train, y_train)
retrained_score

0.9619723375575714

In [21]:
y_pred = rfr_retrained_model.predict(x_test)

retrained_testing_score = r2_score(y_test, y_pred)
retrained_score

0.9619723375575714

In [22]:
testing_score

0.8363315069146953

# Serializing pipeline

In [23]:
from sklearn.pipeline import Pipeline

### we can pipeline all steps from preprocessing to model in a single step as -
 clf_pipeline = Pipeline(Steps=[('tfid_vect', tfid_vect), ('classifier', classifier)]) <br>
 pipeline_model = clf_pipeline.fir(x_train, y_train)