# Predicting stock and stock price index movement of AAPL using Random Forest Prediction Model

## Importing Libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (7,4.5)

import numpy as np
import random
np.random.seed(42)
random.seed(42)

import pandas_technical_indicators as ta

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, precision_score, confusion_matrix, recall_score, accuracy_score
from sklearn.model_selection import train_test_split


## Data
The dataset folder consists of csv files of each stock containing stock prices over a span of 10 years in OHLC Format

In [2]:
aapl = pd.read_csv('Datasets/AAPL/AAPL.csv')

In [3]:
aapl.head()

Unnamed: 0,date,close,volume,open,high,low
0,16:00,221.19,22762391.0,222.15,222.64,219.34
1,2018/10/17,221.19,22692880.0,222.3,222.64,219.34
2,2018/10/16,222.15,28802550.0,218.93,222.99,216.7627
3,2018/10/15,217.36,30280450.0,221.16,221.83,217.27
4,2018/10/12,222.11,39494770.0,220.42,222.88,216.84


In [4]:
#We don't need the dates so we can drop it.

In [5]:
del(aapl['date'])

In [6]:
aapl.head()


Unnamed: 0,close,volume,open,high,low
0,221.19,22762391.0,222.15,222.64,219.34
1,221.19,22692880.0,222.3,222.64,219.34
2,222.15,28802550.0,218.93,222.99,216.7627
3,217.36,30280450.0,221.16,221.83,217.27
4,222.11,39494770.0,220.42,222.88,216.84


---

## Exponential Smoothing 

Exponential smoothing is a time series forecasting method for univariate data.
In other words, recent observations are given relatively more weight in forecasting than the older observations.

In [7]:
def get_exp_preprocessing(df,alpha=0.9):
    edata = df.ewm(alpha=alpha).mean()
    return edata

In [8]:
smoothed_aapl = get_exp_preprocessing(aapl)
#smoothed_aapl means appl dataset that has smoothened
smoothed_aapl.head()


Unnamed: 0,close,volume,open,high,low
0,221.19,22762391.0,222.15,222.64,219.34
1,221.19,22692880.0,222.286364,222.64,219.34
2,222.054865,28802550.0,219.262613,222.955315,217.018108
3,217.829064,30280450.0,220.970432,221.94243,217.244833
4,221.681945,39494770.0,220.475038,222.786251,216.88048


---

## Feature Extraction

In [9]:
def feature_extraction(data):
    for x in [5, 14, 26, 44, 66]:
        data = ta.relative_strength_index(data, n=x)
        data = ta.stochastic_oscillator_d(data, n=x)
        data = ta.accumulation_distribution(data, n=x)
        data = ta.average_true_range(data, n=x)
        data = ta.momentum(data, n=x)
        data = ta.money_flow_index(data, n=x)
        data = ta.rate_of_change(data, n=x)
        data = ta.on_balance_volume(data, n=x)
        data = ta.commodity_channel_index(data, n=x)
        data = ta.ease_of_movement(data, n=x)
        data = ta.trix(data, n=x)
        data = ta.vortex_indicator(data, n=x)
    
    data['ema50'] = data['close'] / data['close'].ewm(50).mean()
    data['ema21'] = data['close'] / data['close'].ewm(21).mean()
    data['ema14'] = data['close'] / data['close'].ewm(14).mean()
    data['ema5'] = data['close'] / data['close'].ewm(5).mean()
        
    #Williams %R is missing
    data = ta.macd(data, n_fast=12, n_slow=26)
    
    del(data['open'])
    del(data['high'])
    del(data['low'])
    del(data['volume'])
    
    return data
   
def compute_prediction_int(df, n):
    pred = (df.shift(-n)['close'] >= df['close'])
    pred = pred.iloc[:-n]
    return pred.astype(int)

def prepare_data(df, horizon):
    data = feature_extraction(df).dropna().iloc[:-horizon]
    data['pred'] = compute_prediction_int(data, n=horizon)
    del(data['close'])
    return data.dropna()


---



## Prepare the data with a prediction horizon of 20 days


In [10]:
data = prepare_data(smoothed_aapl, 20)

y = data['pred']

#remove the output from the input
features = [x for x in data.columns if x not in ['gain', 'pred']]
X = data[features]

TypeError: can't multiply sequence by non-int of type 'float'