In [1]:
import sklearn
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
scikit_learn_version = sklearn.__version__

scikit_learn_version

'0.21.2'

In [6]:
import os
print(os.listdir("./datasets"))

['.DS_Store', 'automobiles_file1.csv', 'automobiles_file2.csv', 'automobiles_test.csv', 'CarPrice_Assignment.csv', 'sentimental_analysis_data.csv', 'sentimental_data_evaluation.csv']


In [7]:
automobile_train = pd.read_csv('datasets/automobiles_file1.csv')

automobile_train.shape

(82, 52)

In [8]:
automobile_test = pd.read_csv('datasets/automobiles_test.csv')

automobile_test.shape

(41, 52)

In [9]:
x_train = automobile_train.drop('price', axis=1)
y_train = automobile_train['price']

In [10]:
x_test = automobile_test.drop('price', axis=1)
y_test = automobile_test['price']

In [11]:
x_train.head()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,...,cylindernumber_twelve,cylindernumber_two,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,114.2,198.9,68.4,58.7,3485,152,3.7,3.52,21.0,95,...,0,0,0,0,0,1,0,0,0,0
1,103.3,174.6,64.6,59.8,2535,122,3.35,3.46,8.5,88,...,0,0,0,1,0,0,0,0,0,0
2,93.7,167.3,63.8,50.8,2191,98,2.97,3.23,9.4,68,...,0,0,0,1,0,0,0,0,0,0
3,106.7,187.5,70.3,54.9,3495,183,3.58,3.64,21.5,123,...,0,0,0,0,0,1,0,0,0,0
4,95.9,173.2,66.3,50.2,2921,156,3.59,3.86,7.0,145,...,0,0,0,0,0,0,0,0,1,0


In [12]:
y_train.head()

0    17075.0
1     8921.0
2     7609.0
3    28176.0
4    14869.0
Name: price, dtype: float64

In [13]:
regressor_model = RandomForestRegressor(n_estimators=5, warm_start=True)
rfr_model = regressor_model.fit(x_train, y_train)

rfr_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=True)

In [14]:
training_score = rfr_model.score(x_train, y_train)

training_score

0.9560419627136922

In [15]:
y_pred = rfr_model.predict(x_test)

In [16]:
testing_score = r2_score(y_test, y_pred)

testing_score

0.8355941502751668

In [17]:
rfr_model_param = {}

rfr_model_param['model'] = rfr_model
rfr_model_param['sklearn_version'] = scikit_learn_version
rfr_model_param['r2_score'] = testing_score

In [18]:
rfr_model_param

{'model': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=True),
 'sklearn_version': '0.21.2',
 'r2_score': 0.8355941502751668}

In [19]:
import joblib

In [20]:
filename = 'models/rfr_model_checkpoint.joblib'

In [21]:
joblib.dump(rfr_model_param, filename)

['models/rfr_model_checkpoint.joblib']

In [22]:
joblib_model = joblib.load(filename)

In [23]:
joblib_model['model']

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=True)

In [24]:
joblib_model['sklearn_version']

'0.21.2'

In [25]:
joblib_model['model'].n_estimators = 15

joblib_model['model']

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=15,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=True)

In [26]:
automobile_retrain = pd.read_csv('datasets/automobiles_file2.csv')

automobile_retrain.shape

(82, 52)

In [27]:
x_train = automobile_retrain.drop('price', axis=1)
y_train = automobile_retrain['price']

In [28]:
rfr_retrained_model = joblib_model['model'].fit(x_train, y_train)

In [29]:
retrained_training_score = rfr_retrained_model.score(x_train, y_train)

retrained_training_score

0.9634483366522373

In [30]:
y_pred = rfr_retrained_model.predict(x_test)

In [31]:
retrained_testing_score = r2_score(y_test, y_pred)

retrained_testing_score

0.8934565749904839

In [32]:
rfr_model_param['r2_score']

0.8355941502751668

In [33]:
retrained_rfr_model_param = {}

retrained_rfr_model_param['model'] = rfr_retrained_model
retrained_rfr_model_param['sklearn_version'] = scikit_learn_version
retrained_rfr_model_param['r2_score'] = retrained_testing_score

In [34]:
filename = 'models/retrained_rfr_model_checkpoint.joblib'

In [35]:
joblib.dump(retrained_rfr_model_param, filename)

['models/retrained_rfr_model_checkpoint.joblib']