In [33]:
import pandas as pd

# Copy the file path below
data_file_path = "\\Users\\HP\\Desktop\\MANN\\CODING\\PYTHON\\Weekly.csv"


# Read the file
file_data = pd.read_csv(data_file_path)
print(file_data)
# Describes the each column of the file in 8 sections:
##  Count: shows how many rows have non-missing values
##  Mean: average of data
##  Std: standard deviation, how numerically spread out the values are
##  Min: lowest value
##  Max: highest value
## 25%, 50%, 75% (percentiles)
file_data.describe()

# Obtain a list of all columns in the dataset
file_data.columns

# dropna drops missing values in the column
file_data = file_data.dropna(axis=0)

# Select the predicting target (column) where Column in the name of the target
y = file_data.Today

# Choose features (columns that are later used to make predictions)
data_features = ['Year',"Lag1",'Lag2','Lag3','Lag4','Lag5','Volume']
X = file_data[data_features]

# Review data used to make predictions
X.describe()
X.head()

      rownames  Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today  \
0            1  1990  0.816  1.572 -3.936 -0.229 -3.484  0.154976 -0.270   
1            2  1990 -0.270  0.816  1.572 -3.936 -0.229  0.148574 -2.576   
2            3  1990 -2.576 -0.270  0.816  1.572 -3.936  0.159837  3.514   
3            4  1990  3.514 -2.576 -0.270  0.816  1.572  0.161630  0.712   
4            5  1990  0.712  3.514 -2.576 -0.270  0.816  0.153728  1.178   
...        ...   ...    ...    ...    ...    ...    ...       ...    ...   
1084      1085  2010 -0.861  0.043 -2.173  3.599  0.015  3.205160  2.969   
1085      1086  2010  2.969 -0.861  0.043 -2.173  3.599  4.242568  1.281   
1086      1087  2010  1.281  2.969 -0.861  0.043 -2.173  4.835082  0.283   
1087      1088  2010  0.283  1.281  2.969 -0.861  0.043  4.454044  1.034   
1088      1089  2010  1.034  0.283  1.281  2.969 -0.861  2.707105  0.069   

     Direction  
0         Down  
1         Down  
2           Up  
3           Up  
4 

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976
1,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574
2,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837
3,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163
4,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728


In [34]:
# Building your model

# Use scikit-learn library to create models
# Steps to building and using a model:
## Define: what type of model is it? Other parameters of the model type are specified too
## Fit: Capture patterns from provided data
## Predict
## Evaluate: determine accuracy of the model's predictions

# Example of decision tree model with scikit-learn and feature being target variables
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
data_model = DecisionTreeRegressor(random_state=1)

# Fit model
data_model.fit(X,y)

# Using the fitted model above, we can use it now to make predictions
print("Making predictions for the following ---")
print(X.head())
print("The predictions are:")
print(data_model.predict(X.head()))

Making predictions for the following ---
   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume
0  1990  0.816  1.572 -3.936 -0.229 -3.484  0.154976
1  1990 -0.270  0.816  1.572 -3.936 -0.229  0.148574
2  1990 -2.576 -0.270  0.816  1.572 -3.936  0.159837
3  1990  3.514 -2.576 -0.270  0.816  1.572  0.161630
4  1990  0.712  3.514 -2.576 -0.270  0.816  0.153728
The predictions are:
[-0.27  -2.576  3.514  0.712  1.178]


In [35]:
# Model Validation

# Once you have a model, calculate the mean absolute error

from sklearn.metrics import mean_absolute_error

predicted_value = data_model.predict(X)
mean_absolute_error(y, predicted_value)

# Problem with 'In-Sample' Scores:
## Data might not be accurate when model sees new data
## To fix this, we can exclude some data from the model-building
## process, and use those to test the model's accuracy on data it
## hasn't seen before. This data is called validation data.

# Use the train_test_split to break up the data into two pieces
from sklearn.model_selection import train_test_split

# Numerical value to random_state argument guarantees we get the same split
# every time we run the script

train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=0)

# Define model
data_model = DecisionTreeRegressor(random_state=1)

# Fit model
data_model.fit(train_X,train_y)

# get predicted values on validation data
val_predictions = data_model.predict(val_X)
print(val_predictions)
print(val_X)
print(mean_absolute_error(val_y, val_predictions))

[ -0.261  -0.261   2.03    1.892   1.671   0.509   0.739  -0.461  -2.425
   3.854  -1.002   1.207  -3.041  -0.869  -0.404  -1.393  -2.029  -2.496
   1.789  -0.936   0.165   0.174  -1.089   0.238   0.748  -0.114  -0.748
   2.05    1.458  -1.693  -0.155   0.123   1.458   1.458  -1.897  -1.875
   2.26    1.36    0.6    -0.78    1.719   0.951   2.427  -0.342   0.34
   3.085  10.707   0.528   0.773  -0.185  -0.061  -1.321  -0.8    -5.184
   0.748  -3.897   4.195  -1.218   3.169  -1.81   -0.709   2.102   1.384
  -2.76   -4.217  -4.034   0.6     1.499   0.555  -1.213   0.853   5.173
  -2.496   3.75   -0.261  -0.637   2.48    0.555   1.147   0.72    1.499
   0.971   0.555   0.65   -4.034   1.892   1.65   -0.061   1.171  -1.601
   3.076   1.508   0.015  -3.898   0.159  -0.23   -1.348  -0.155   1.813
  -0.062  -1.433   0.951   0.872   1.148   0.63   -2.349   0.958  -1.147
   1.813  -0.897  -0.561  -2.451  -0.218   2.509   1.156   3.289   2.11
  -0.557 -11.05   -0.255   0.872   1.112  -0.23   -0.

In [36]:
# Underfitting and Overfitting

# Overfitting: where a model matches the training data almost perfectly, but does poorly
#              in validation and other new data.
# Underfitting: when a model fails to capture distinctions and patterns in the data, so it 
#               performs poorly even in training data.

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

# compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5,50,500,5000,10000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  1
Max leaf nodes: 50  		 Mean Absolute Error:  2
Max leaf nodes: 500  		 Mean Absolute Error:  2
Max leaf nodes: 5000  		 Mean Absolute Error:  2
Max leaf nodes: 10000  		 Mean Absolute Error:  2


In [37]:
# Random Forests

# The random forest uses many trees, and it makes a prediction by averaging the predictions of 
# each component tree. It generally has much better predictive accuracy than a single decision tree 
# and it works well with default parameters.

# Build a random forest scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

1.9187684615384617
