### ISLP Chapter 8 Applied exercise 8
### BART (Bayesian Additive Regression Trees)

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
from sklearn.model_selection import train_test_split
from ISLP.bart import BART

In [3]:
# Load the Carseats data. The path is relative to this notebook's directory.
# The 'SlNo' column is discarded before any modelling.
file_name = '../../Data/Carseats.csv'
carseats_csv = pd.read_csv(file_name)
df = carseats_csv.drop(columns='SlNo')
df

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [4]:
# Encode the response and the categorical predictors as integers.
# NOTE(review): every statement below overwrites a column of `df` in place, so
# this cell is only correct the first time it runs after the data is loaded.
# A second run would threshold the already 0/1 Sales again and map the
# already-encoded Urban/US values to NaN.
# NOTE(review): ISLP exercise 8 appears to ask for Sales as a quantitative
# response; confirm that binarising it here is intended.

# Sales: 0 when Sales <= 8, otherwise 1.
df['Sales'] = (~df['Sales'].le(8)).astype('int64')

# ShelveLoc: integer codes assigned in order of first appearance in the column
# (Bad -> 0, Good -> 1, Medium -> 2 for this file), i.e. not an ordinal scale.
shelve_codes, _shelve_labels = pd.factorize(df['ShelveLoc'])
df['ShelveLoc'] = shelve_codes

# Urban and US: 'No' -> 0, 'Yes' -> 1.
yes_no_codes = {'No': 0, 'Yes': 1}
for flag_column in ('Urban', 'US'):
    df[flag_column] = df[flag_column].map(yes_no_codes)

In [5]:
# Preview the first five rows to check the integer-encoded columns.
df.head(5)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,1,138,73,11,276,120,0,42,17,1,1
1,1,111,48,16,260,83,1,65,10,1,1
2,1,113,35,10,269,80,2,59,12,1,1
3,0,117,100,4,466,97,2,55,14,1,1
4,0,141,64,3,340,128,0,38,13,1,0


In [7]:
# Separate the predictors from the Sales response, then hold out 30% of the
# rows as a test set. The split is stratified on y and uses a fixed seed.
y = df['Sales']
X = df.drop(columns=['Sales'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [8]:
# Create the BART estimator with a fixed seed (burnin=5, ndraw=20).
bart_model = BART(
    burnin=5,
    ndraw=20,
    random_state=123,
)

In [9]:
# Fit BART on the training split.
# X_train is passed as a DataFrame here, whereas the later prediction cell
# converts X_test with .to_numpy() before calling predict.
bart_model.fit(X_train, y_train)

In [16]:
# Test-set mean squared error: predict on the held-out predictors (passed as a
# NumPy array) and average the squared residuals against y_test.
X_test_array = X_test.to_numpy()
yhat_test = bart_model.predict(X_test_array)
squared_errors = (y_test - yhat_test) ** 2
np.mean(squared_errors)

0.14747406754090742

In [20]:
# Variable inclusion per predictor: average `variable_inclusion_` over axis 0
# (presumably the retained draws; confirm against the ISLP docs), label the
# result with the predictor names, and sort from highest to lowest.
mean_inclusion = bart_model.variable_inclusion_.mean(axis=0)
var_inclusion = pd.Series(mean_inclusion, index=X.columns)
var_inclusion = var_inclusion.sort_values(ascending=False)
var_inclusion

ShelveLoc      34.90
Price          34.40
CompPrice      31.15
Advertising    31.05
Age            31.00
Income         30.80
US             29.55
Education      29.30
Urban          29.25
Population     27.00
dtype: float64