In [1]:
import sklearn
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
scikit_learn_version = sklearn.__version__

In [3]:
scikit_learn_version

'0.22.1'

In [4]:
!ls datasets

automobiles_file1.csv  automobiles_test.csv	sentimental_analysis_data.csv
automobiles_file2.csv  CarPrice_Assignment.csv	sentimental_data_evaluation.csv


In [5]:
automobile_train = pd.read_csv('datasets/automobiles_file1.csv')
automobile_train.shape

(82, 52)

In [6]:
automobile_test = pd.read_csv('datasets/automobiles_test.csv')
automobile_test.shape

(41, 52)

In [7]:
x_train = automobile_train.drop(['price'], axis=1)
y_train = automobile_train['price']
x_test = automobile_test.drop(['price'], axis=1)
y_test = automobile_test['price']

In [8]:
regressor_model = RandomForestRegressor(n_estimators=5, warm_start=True)
rfr_model = regressor_model.fit(x_train, y_train)
rfr_model

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=5, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=True)

In [9]:
print('Training Score : ', rfr_model.score(x_train, y_train))

Training Score :  0.9500528216908191


In [10]:
y_pred = rfr_model.predict(x_test)
print('Testing Score: ', r2_score(y_test, y_pred))

Testing Score:  0.8164798563970149


In [11]:
rfr_model_param = {}

rfr_model_param['model'] = rfr_model
rfr_model_param['sklearn_version'] = scikit_learn_version
rfr_model_param['r2_score'] = r2_score(y_test, y_pred)

In [12]:
rfr_model_param

{'model': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=5, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=True),
 'sklearn_version': '0.22.1',
 'r2_score': 0.8164798563970149}

In [13]:
import joblib

In [14]:
filename = 'models/rfr_model_checkpoint.joblib'

In [15]:
joblib.dump(rfr_model_param, filename)

['models/rfr_model_checkpoint.joblib']

In [16]:
rfr_joblib_model = joblib.load(filename)
rfr_joblib_model['model']

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=5, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=True)

In [17]:
rfr_joblib_model['model'].n_estimators = 15
rfr_joblib_model['model']

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=15, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=True)

In [19]:
automobile_retrain = pd.read_csv('datasets/automobiles_file2.csv')

In [20]:
x_train = automobile_retrain.drop(['price'], axis=1)
y_train = automobile_retrain['price']

In [21]:
rfr_retrain_model = rfr_joblib_model['model'].fit(x_train, y_train)

In [22]:
print('Training Score: ', rfr_retrain_model.score(x_train, y_train))

Training Score:  0.9682959656171561


In [23]:
y_pred = rfr_retrain_model.predict(x_test)
print('Testing Score: ', r2_score(y_test, y_pred))

Testing Score:  0.8983765001760232


In [24]:
rfr_model_param['r2_score']

0.8164798563970149

In [25]:
retrained_rfr_model_param = {}

retrained_rfr_model_param['model'] = rfr_retrain_model
retrained_rfr_model_param['sklearn_version'] = scikit_learn_version
retrained_rfr_model_param['r2_score'] = r2_score(y_test, y_pred)
retrained_rfr_model_param

{'model': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=15, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=True),
 'sklearn_version': '0.22.1',
 'r2_score': 0.8983765001760232}

In [26]:
filename = 'models/retrained_rfr_model_checkpoint.joblib'

In [27]:
joblib.dump(retrained_rfr_model_param, filename)

['models/retrained_rfr_model_checkpoint.joblib']