In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams when they are displayed in a cell.
set_config(display = 'diagram')

In [42]:
# Load the price CSV, then convert the textual Date column to datetimes
# in a single chained expression.
df = (
    pd.read_csv('gld_price_data.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

In [43]:
# Read the same CSV again, this time letting read_csv parse Date while loading.
# This rebinds df, replacing the frame built in the previous cell.
df = pd.read_csv(
    'gld_price_data.csv',
    parse_dates=['Date'],
)

In [44]:
# Derive calendar features from Date; both new columns are appended on the right
# in the order Month, Year.
df = df.assign(
    Month=lambda frame: frame['Date'].dt.month,
    Year=lambda frame: frame['Date'].dt.year,
)

In [45]:
# Preview the frame after the Month and Year columns have been added.
df

Unnamed: 0,Date,SPX,GLD,USO,SLV,EUR/USD,Month,Year
0,2008-01-02,1447.160034,84.860001,78.470001,15.1800,1.471692,1,2008
1,2008-01-03,1447.160034,85.570000,78.370003,15.2850,1.474491,1,2008
2,2008-01-04,1411.630005,85.129997,77.309998,15.1670,1.475492,1,2008
3,2008-01-07,1416.180054,84.769997,75.500000,15.0530,1.468299,1,2008
4,2008-01-08,1390.189941,86.779999,76.059998,15.5900,1.557099,1,2008
...,...,...,...,...,...,...,...,...
2285,2018-05-08,2671.919922,124.589996,14.060000,15.5100,1.186789,5,2018
2286,2018-05-09,2697.790039,124.330002,14.370000,15.5300,1.184722,5,2018
2287,2018-05-10,2723.070068,125.180000,14.410000,15.7400,1.191753,5,2018
2288,2018-05-14,2730.129883,124.489998,14.380000,15.5600,1.193118,5,2018


In [46]:
# Date has already been decomposed into Month and Year, so the raw timestamp is
# removed before modelling.
# errors='ignore' keeps this cell re-runnable: on a second execution Date is
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Date'], errors='ignore')

In [47]:
# Count missing values column by column (axis=0 is the default, spelled out here).
df.isnull().sum(axis=0)

SPX        0
GLD        0
USO        0
SLV        0
EUR/USD    0
Month      0
Year       0
dtype: int64

In [48]:
# GLD is the regression target; every remaining column is a predictor.
y = df['GLD']
X = df.drop(columns='GLD')

In [49]:
# train_test_split is imported in the notebook's top import cell.
# Hold out 20% of the rows for the final evaluation; random_state pins the
# shuffle so the same rows land in each split on every run.
# NOTE(review): the rows are dated daily observations and the split is shuffled,
# so adjacent days can end up on both sides. This may inflate the test score;
# consider a chronological hold-out if the goal is forecasting. Confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [50]:
# Show the first five training rows (n=5 is head's default, spelled out here).
X_train.head(n=5)

Unnamed: 0,SPX,USO,SLV,EUR/USD,Month,Year
486,1165.810059,39.560001,16.6,1.356705,3,2010
31,1349.98999,75.93,16.952,1.46761,2,2008
1754,2041.890015,10.74,13.08,1.087453,12,2015
1490,1862.76001,31.1,16.67,1.280442,10,2014
1526,2026.140015,23.280001,16.370001,1.239495,12,2014


In [51]:
# Group the training columns by dtype so each group can be routed to its own
# preprocessing pipeline in the column transformer.
numeric_feature = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include='object').columns

In [52]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)
# Categorical columns: replace gaps with the literal "missing" category, then
# one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row. The default ('error') would raise when a cross-validation fold
# or the test split contains such a category.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value="missing"),
    OneHotEncoder(handle_unknown='ignore'),
)

In [53]:
# Send each column group through its own preprocessing pipeline; any column that
# belongs to neither group is passed through unchanged.
col_transformer = make_column_transformer(
    (numeric_transformer, numeric_feature),
    (categorical_transformer, categorical_features),
    remainder='passthrough',
)

In [54]:
# Fit the preprocessing on the training features only. The same col_transformer
# object is later handed to make_pipeline and fitted again through pipe.fit.
col_transformer.fit(X_train)

In [55]:
# Chain the preprocessing with a decision-tree regressor.
# random_state is fixed because the tree permutes features at every split, so
# without a seed repeated fits can pick different (equally good) splits and the
# reported scores are not reproducible.
pipe = make_pipeline(col_transformer, DecisionTreeRegressor(random_state=0))

In [56]:
# Fit the preprocessing and the decision tree together on the training split.
pipe.fit(X_train, y_train)

In [57]:
# cross_validate is imported in the notebook's top import cell.
# Cross-validate the whole pipeline on the training split only, so the held-out
# test rows stay unseen. return_train_score=True also records the score on each
# training fold, which makes over-fitting visible next to test_score.
with_categorical_score = cross_validate(
    pipe, X_train, y_train, return_train_score=True
)

In [58]:
# Tabulate what cross_validate returned: one row per fold, one column per
# timing or score entry.
categorical_score = pd.DataFrame(data=with_categorical_score)
categorical_score

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.013843,0.014057,0.987641,1.0
1,0.017448,0.003998,0.992729,1.0
2,0.01199,0.001362,0.990606,1.0
3,0.015625,0.0,0.991719,1.0
4,0.01562,0.0,0.994251,1.0


In [59]:
# Predict GLD for the held-out rows; the array is only displayed, not stored.
pipe.predict(X_test)

array([126.449997, 116.470001, 140.339996, 119.889999,  93.459999,
       151.410004, 117.75    , 113.650002, 154.669998, 105.370003,
       102.839996, 120.160004,  92.239998, 161.199997, 122.129997,
       117.120003,  86.610001,  92.349998,  93.459999, 127.010002,
        81.360001, 117.339996, 127.089996, 174.399994, 167.119995,
       138.210007, 113.639999, 159.300003, 134.610001, 114.769997,
       110.860001, 104.720001, 127.599998, 122.489998,  98.900002,
       121.290001,  82.800003, 115.68    , 127.959999, 140.339996,
       103.419998, 131.160004, 107.970001, 134.660004, 133.110001,
       148.910004, 119.730003, 157.639999, 157.580002, 128.110001,
       124.690002, 111.459999, 123.800003, 117.540001, 153.050003,
       121.559998, 111.419998,  87.470001, 156.479996, 117.980003,
        87.989998, 114.5     ,  84.279999, 146.869995, 123.389999,
       127.059998, 125.18    , 120.389999, 168.020004, 121.190002,
       126.080002, 173.589996, 121.559998, 166.610001, 119.599

In [60]:
# Score on the training split (for a regressor pipeline this is R^2).
# NOTE(review): a perfect training score is expected from a tree with no depth
# limit, so on its own it says little about generalisation.
pipe.score(X_train, y_train)

1.0

In [61]:
# Score on the held-out test split (R^2 for a regressor pipeline).
pipe.score(X_test, y_test)

0.9934088760238098