# Model evaluation using K-Fold, Grid search and metrics

# 1. Load the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
data= pd.read_csv(r'apple_stocks.csv')

# 2. Data Check

In [3]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,7-Jul-17,142.9,144.75,142.9,144.18,19201712
1,6-Jul-17,143.02,143.5,142.41,142.73,24128782
2,5-Jul-17,143.69,144.79,142.72,144.09,21569557
3,3-Jul-17,144.88,145.3,143.1,143.5,14277848
4,30-Jun-17,144.45,144.96,143.78,144.02,23024107


In [4]:
data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume
124,9-Jan-17,117.95,119.43,117.94,118.99,33561948
125,6-Jan-17,116.78,118.16,116.47,117.91,31751900
126,5-Jan-17,115.92,116.86,115.81,116.61,22193587
127,4-Jan-17,115.85,116.51,115.75,116.02,21118116
128,3-Jan-17,115.8,116.33,114.76,116.15,28781865


In [5]:
data.shape

(129, 6)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    129 non-null    object 
 1   Open    129 non-null    float64
 2   High    129 non-null    float64
 3   Low     129 non-null    float64
 4   Close   129 non-null    float64
 5   Volume  129 non-null    int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 6.2+ KB


In [7]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,129.0,129.0,129.0,129.0,129.0
mean,139.826977,140.683953,139.050155,139.975349,27062590.0
std,11.007846,11.009166,10.705675,10.781804,12020480.0
min,115.8,116.33,114.76,116.02,14246350.0
25%,135.67,136.27,135.1,135.72,20350000.0
50%,142.11,142.92,141.11,142.27,23575090.0
75%,145.74,147.14,144.94,145.87,30379380.0
max,156.01,156.65,155.05,156.1,111985000.0


In [8]:
data.isnull().sum()

Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [9]:
data.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
124    False
125    False
126    False
127    False
128    False
Length: 129, dtype: bool

In [10]:
data.duplicated().sum()


np.int64(0)

# 3. Data Preprocessing

In [11]:
data['Date'] = pd.to_datetime(data['Date'])

  data['Date'] = pd.to_datetime(data['Date'])


In [12]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2017-07-07,142.90,144.75,142.90,144.18,19201712
1,2017-07-06,143.02,143.50,142.41,142.73,24128782
2,2017-07-05,143.69,144.79,142.72,144.09,21569557
3,2017-07-03,144.88,145.30,143.10,143.50,14277848
4,2017-06-30,144.45,144.96,143.78,144.02,23024107
...,...,...,...,...,...,...
124,2017-01-09,117.95,119.43,117.94,118.99,33561948
125,2017-01-06,116.78,118.16,116.47,117.91,31751900
126,2017-01-05,115.92,116.86,115.81,116.61,22193587
127,2017-01-04,115.85,116.51,115.75,116.02,21118116


In [13]:
data['daily_returns'] = data.groupby('Date')

In [14]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,daily_returns
0,2017-07-07,142.9,144.75,142.9,144.18,19201712,"(2017-01-03 00:00:00, [Date])"
1,2017-07-06,143.02,143.5,142.41,142.73,24128782,"(2017-01-04 00:00:00, [Date])"
2,2017-07-05,143.69,144.79,142.72,144.09,21569557,"(2017-01-05 00:00:00, [Date])"
3,2017-07-03,144.88,145.3,143.1,143.5,14277848,"(2017-01-06 00:00:00, [Date])"
4,2017-06-30,144.45,144.96,143.78,144.02,23024107,"(2017-01-09 00:00:00, [Date])"


In [15]:
data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,daily_returns
124,2017-01-09,117.95,119.43,117.94,118.99,33561948,"(2017-06-30 00:00:00, [Date])"
125,2017-01-06,116.78,118.16,116.47,117.91,31751900,"(2017-07-03 00:00:00, [Date])"
126,2017-01-05,115.92,116.86,115.81,116.61,22193587,"(2017-07-05 00:00:00, [Date])"
127,2017-01-04,115.85,116.51,115.75,116.02,21118116,"(2017-07-06 00:00:00, [Date])"
128,2017-01-03,115.8,116.33,114.76,116.15,28781865,"(2017-07-07 00:00:00, [Date])"


In [16]:
data.sort_values(by='daily_returns', ascending=True, inplace=True)

In [17]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,daily_returns
0,2017-07-07,142.9,144.75,142.9,144.18,19201712,"(2017-01-03 00:00:00, [Date])"
1,2017-07-06,143.02,143.5,142.41,142.73,24128782,"(2017-01-04 00:00:00, [Date])"
2,2017-07-05,143.69,144.79,142.72,144.09,21569557,"(2017-01-05 00:00:00, [Date])"
3,2017-07-03,144.88,145.3,143.1,143.5,14277848,"(2017-01-06 00:00:00, [Date])"
4,2017-06-30,144.45,144.96,143.78,144.02,23024107,"(2017-01-09 00:00:00, [Date])"


In [18]:
data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,daily_returns
124,2017-01-09,117.95,119.43,117.94,118.99,33561948,"(2017-06-30 00:00:00, [Date])"
125,2017-01-06,116.78,118.16,116.47,117.91,31751900,"(2017-07-03 00:00:00, [Date])"
126,2017-01-05,115.92,116.86,115.81,116.61,22193587,"(2017-07-05 00:00:00, [Date])"
127,2017-01-04,115.85,116.51,115.75,116.02,21118116,"(2017-07-06 00:00:00, [Date])"
128,2017-01-03,115.8,116.33,114.76,116.15,28781865,"(2017-07-07 00:00:00, [Date])"


In [19]:
columns= list(data.columns)

In [20]:
columns


['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'daily_returns']

In [21]:
data=data.sort_values(by='Date')

In [22]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,daily_returns
128,2017-01-03,115.80,116.33,114.76,116.15,28781865,"(2017-07-07 00:00:00, [Date])"
127,2017-01-04,115.85,116.51,115.75,116.02,21118116,"(2017-07-06 00:00:00, [Date])"
126,2017-01-05,115.92,116.86,115.81,116.61,22193587,"(2017-07-05 00:00:00, [Date])"
125,2017-01-06,116.78,118.16,116.47,117.91,31751900,"(2017-07-03 00:00:00, [Date])"
124,2017-01-09,117.95,119.43,117.94,118.99,33561948,"(2017-06-30 00:00:00, [Date])"
...,...,...,...,...,...,...,...
4,2017-06-30,144.45,144.96,143.78,144.02,23024107,"(2017-01-09 00:00:00, [Date])"
3,2017-07-03,144.88,145.30,143.10,143.50,14277848,"(2017-01-06 00:00:00, [Date])"
2,2017-07-05,143.69,144.79,142.72,144.09,21569557,"(2017-01-05 00:00:00, [Date])"
1,2017-07-06,143.02,143.50,142.41,142.73,24128782,"(2017-01-04 00:00:00, [Date])"


In [23]:

data['daily_returns']= data['Close'].pct_change()

In [24]:
data['daily_returns']

128         NaN
127   -0.001119
126    0.005085
125    0.011148
124    0.009160
         ...   
4      0.002366
3     -0.003611
2      0.004111
1     -0.009439
0      0.010159
Name: daily_returns, Length: 129, dtype: float64

In [25]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,daily_returns
128,2017-01-03,115.80,116.33,114.76,116.15,28781865,
127,2017-01-04,115.85,116.51,115.75,116.02,21118116,-0.001119
126,2017-01-05,115.92,116.86,115.81,116.61,22193587,0.005085
125,2017-01-06,116.78,118.16,116.47,117.91,31751900,0.011148
124,2017-01-09,117.95,119.43,117.94,118.99,33561948,0.009160
...,...,...,...,...,...,...,...
4,2017-06-30,144.45,144.96,143.78,144.02,23024107,0.002366
3,2017-07-03,144.88,145.30,143.10,143.50,14277848,-0.003611
2,2017-07-05,143.69,144.79,142.72,144.09,21569557,0.004111
1,2017-07-06,143.02,143.50,142.41,142.73,24128782,-0.009439


In [26]:
data['daily_returns'].max()

0.06098063452822422

In [27]:
# scaling numerical features

In [28]:
data1 = data.copy()

In [29]:
from sklearn.preprocessing import MinMaxScaler

In [30]:
scaler=MinMaxScaler()

data1['daily_returns']=data1['daily_returns'].fillna(data1['daily_returns'].mode()[0])

In [31]:
data1

Unnamed: 0,Date,Open,High,Low,Close,Volume,daily_returns
128,2017-01-03,115.80,116.33,114.76,116.15,28781865,-0.038777
127,2017-01-04,115.85,116.51,115.75,116.02,21118116,-0.001119
126,2017-01-05,115.92,116.86,115.81,116.61,22193587,0.005085
125,2017-01-06,116.78,118.16,116.47,117.91,31751900,0.011148
124,2017-01-09,117.95,119.43,117.94,118.99,33561948,0.009160
...,...,...,...,...,...,...,...
4,2017-06-30,144.45,144.96,143.78,144.02,23024107,0.002366
3,2017-07-03,144.88,145.30,143.10,143.50,14277848,-0.003611
2,2017-07-05,143.69,144.79,142.72,144.09,21569557,0.004111
1,2017-07-06,143.02,143.50,142.41,142.73,24128782,-0.009439


In [32]:
 data1 = data1.drop(columns=['Date'],axis=1)

In [33]:
data1


Unnamed: 0,Open,High,Low,Close,Volume,daily_returns
128,115.80,116.33,114.76,116.15,28781865,-0.038777
127,115.85,116.51,115.75,116.02,21118116,-0.001119
126,115.92,116.86,115.81,116.61,22193587,0.005085
125,116.78,118.16,116.47,117.91,31751900,0.011148
124,117.95,119.43,117.94,118.99,33561948,0.009160
...,...,...,...,...,...,...
4,144.45,144.96,143.78,144.02,23024107,0.002366
3,144.88,145.30,143.10,143.50,14277848,-0.003611
2,143.69,144.79,142.72,144.09,21569557,0.004111
1,143.02,143.50,142.41,142.73,24128782,-0.009439


In [34]:
scaler.fit(data1)

In [35]:
data_scaled=scaler.transform(data1)

In [36]:
data1

Unnamed: 0,Open,High,Low,Close,Volume,daily_returns
128,115.80,116.33,114.76,116.15,28781865,-0.038777
127,115.85,116.51,115.75,116.02,21118116,-0.001119
126,115.92,116.86,115.81,116.61,22193587,0.005085
125,116.78,118.16,116.47,117.91,31751900,0.011148
124,117.95,119.43,117.94,118.99,33561948,0.009160
...,...,...,...,...,...,...
4,144.45,144.96,143.78,144.02,23024107,0.002366
3,144.88,145.30,143.10,143.50,14277848,-0.003611
2,143.69,144.79,142.72,144.09,21569557,0.004111
1,143.02,143.50,142.41,142.73,24128782,-0.009439


# Find the Independent Variable and dependent avariable

In [37]:
data1

Unnamed: 0,Open,High,Low,Close,Volume,daily_returns
128,115.80,116.33,114.76,116.15,28781865,-0.038777
127,115.85,116.51,115.75,116.02,21118116,-0.001119
126,115.92,116.86,115.81,116.61,22193587,0.005085
125,116.78,118.16,116.47,117.91,31751900,0.011148
124,117.95,119.43,117.94,118.99,33561948,0.009160
...,...,...,...,...,...,...
4,144.45,144.96,143.78,144.02,23024107,0.002366
3,144.88,145.30,143.10,143.50,14277848,-0.003611
2,143.69,144.79,142.72,144.09,21569557,0.004111
1,143.02,143.50,142.41,142.73,24128782,-0.009439


In [38]:
data1['daily_returns'].max()

0.06098063452822422

In [39]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 129 entries, 128 to 0
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Open           129 non-null    float64
 1   High           129 non-null    float64
 2   Low            129 non-null    float64
 3   Close          129 non-null    float64
 4   Volume         129 non-null    int64  
 5   daily_returns  129 non-null    float64
dtypes: float64(5), int64(1)
memory usage: 7.1 KB


In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 129 entries, 128 to 0
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           129 non-null    datetime64[ns]
 1   Open           129 non-null    float64       
 2   High           129 non-null    float64       
 3   Low            129 non-null    float64       
 4   Close          129 non-null    float64       
 5   Volume         129 non-null    int64         
 6   daily_returns  128 non-null    float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 8.1 KB


In [41]:
X = data1.drop(columns = ["daily_returns"],axis=1)
y = data1['daily_returns']

# Split the dataset into training and testing sets

In [42]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [43]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [44]:
X_train.shape, y_train.shape

((103, 5), (103,))

In [45]:
X_test.shape, y_test.shape

((26, 5), (26,))

# Define Random forest Classifier 

In [46]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
model_rf = RandomForestRegressor(n_estimators=350, oob_score=True)
# oob score is for validation data

# Train the model

In [48]:
model_rf.fit(X_train,y_train)

In [49]:
y_pred = model_rf.predict(X_test)

In [50]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [51]:
mse = mean_squared_error(y_test,y_pred)
print(f'mse score:{mse}')

mse score:0.00014938618778390637


In [52]:
mean_absolute_error(y_test,y_pred)

0.007129986690180284

In [53]:
r2_score(y_test,y_pred)

-0.9445106979824665

In [54]:
model_rf = RandomForestRegressor(n_estimators=150,max_depth=2,min_samples_split=2,min_samples_leaf=3,max_features='sqrt',criterion='absolute_error',max_leaf_nodes=3, oob_score=True)


# Hyperparameters

In [55]:
model_rf.fit(X_train,y_train)

In [56]:
y_pred1 = model_rf.predict(X_test)

In [57]:
mse = mean_squared_error(y_test,y_pred1)
print(f'mse score:{mse}')

mse score:8.835910464974987e-05


In [58]:
mean_absolute_error(y_test,y_pred1)

0.005880501080864813

In [59]:
r2_score(y_test,y_pred1)

-0.1501413002394112

# Hyper Parametertuning using grid search

In [78]:
parameter_grid={
        'criterion':["friedman_mse","absolute_error","poisson","squared_error"],
        'max_depth': [None, 10, 15,20,30,40 ],
        'min_samples_split':[2,5,7,10],
        'min_samples_leaf':[2,3,4],
        'max_leaf_nodes':[4],
        'n_estimators':[50,100,200]
}

In [79]:
from sklearn.model_selection import GridSearchCV

In [80]:
grid_search = GridSearchCV(
    estimator=model_rf,
    param_grid=parameter_grid,
    cv=5,
    scoring='accuracy',
    n_jobs= -1,
    verbose=2
)

In [81]:
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


1080 fits failed out of a total of 4320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\rites\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\rites\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\rites\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 407, in fit
    raise ValueError(
    ...<2 lines>...
    )
ValueError: Some value(s) of y are negative which is not allowed for Pois

In [82]:
# get the best parameter 

In [83]:
best_params= grid_search.best_params_
print('best hyperparameter:',best_params)

best hyperparameter: {'criterion': 'friedman_mse', 'max_depth': None, 'max_leaf_nodes': 4, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}


# Cross validation

In [84]:
from sklearn.model_selection import cross_val_score

In [85]:
# fold
k=5

In [86]:
# perform K-fold cross validation
cv_scores = cross_val_score(model_rf,X,y,cv=k,verbose=2)

[CV] END .................................................... total time=   0.5s
[CV] END .................................................... total time=   0.5s
[CV] END .................................................... total time=   0.5s
[CV] END .................................................... total time=   0.5s
[CV] END .................................................... total time=   0.6s


In [87]:
for i in range(k):
    print(f'{i+1} Fold,cross validation accuracy is {cv_scores[i]}' )

1 Fold,cross validation accuracy is -0.16327749658155177
2 Fold,cross validation accuracy is 0.06932133852584588
3 Fold,cross validation accuracy is -0.032687878584637176
4 Fold,cross validation accuracy is -0.05317712589443735
5 Fold,cross validation accuracy is -0.1452123866010253


In [88]:
print(f'Mean Accuracy is {cv_scores.mean():.2f}')

Mean Accuracy is -0.07


In [89]:
print(f'Standard Deviation: {cv_scores.std():.2f}')

Standard Deviation: 0.08
