In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

In [34]:
# Read the advertising data, then separate the target ('sales') from the
# remaining columns, which serve as the predictors.
df = pd.read_csv('../data/Advertising.csv')
y = df['sales']
X = df.drop(columns='sales')
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [7]:
# Two-stage split -> Train 70% | Validation 15% | Hold-out test 15%.
# Stage 1: keep 70% for training; X_test / y_test hold the remaining 30%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)
# Stage 2: halve that 30% remainder into the validation and hold-out sets.
X_validation, X_holdout_test, y_validation, y_holdout_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=101
)

In [22]:
# Fit a 20-tree random forest on the training split only; the seed is fixed
# so repeated runs build the same forest.
model = RandomForestRegressor(random_state=101, n_estimators=20)
model.fit(X_train, y_train)

In [23]:
# Model evaluation on the validation split.
# The first metric is the mean absolute error (MAE); it was previously stored
# in a variable called `mse` and commented as MSE, which mislabeled it.
validation_pred = model.predict(X_validation)
mae = mean_absolute_error(y_validation, validation_pred)  # MAE
rmse = np.sqrt(mean_squared_error(y_validation, validation_pred))  # RMSE
print(mae)
print(rmse)

0.6940000000000006
0.8599040644164908


In [24]:
# Final performance on the hold-out test split.
# The first metric is the mean absolute error (MAE); it was previously stored
# in `holdout_mse` and commented as MSE, which mislabeled it.
holdout_pred = model.predict(X_holdout_test)
holdout_mae = mean_absolute_error(y_holdout_test, holdout_pred)  # MAE
holdout_rmse = np.sqrt(mean_squared_error(y_holdout_test, holdout_pred))  # RMSE
print(holdout_mae)
print(holdout_rmse)

0.5761666666666669
0.71244356501644


In [26]:
# Refit the same configuration (20 trees, seed 101) on the full dataset
# (X, y) rather than on the training split alone.
final_model = RandomForestRegressor(random_state=101, n_estimators=20)
final_model.fit(X, y)

In [27]:
# Persist the fitted final model to disk.
joblib.dump(value=final_model, filename='final_model.pkl')

['final_model.pkl']

In [29]:
# Feature column names, in the order of X's columns.
X.columns.tolist()

['TV', 'radio', 'newspaper']

In [30]:
# Save the feature column names (in X's column order) next to the model.
joblib.dump(value=X.columns.tolist(), filename='col_names.pkl')

['col_names.pkl']

In [32]:
# Reload the saved feature column names (this file holds the names, not the model).
n_cols = joblib.load(filename='col_names.pkl')
n_cols

['TV', 'radio', 'newspaper']

In [33]:
# Load the persisted model and predict for a single example row.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# from a trusted source.
lm = joblib.load('final_model.pkl')
# The model was fitted on a DataFrame, so predict on a DataFrame built with the
# saved column names (n_cols, loaded from 'col_names.pkl') instead of a bare
# nested list. This pins each value to its feature explicitly and avoids
# scikit-learn's "X does not have valid feature names" warning.
lm.predict(pd.DataFrame([[230.1, 37.8, 69.2]], columns=n_cols))



array([22.01])