In [138]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime

In [49]:
import json
import urllib.request

#QUERY_URL = "https://www.alphavantage.co/query?function={REQUEST_TYPE}&apikey={KEY}&symbol={SYMBOL}&outputsize=full"
API_KEY = "GDFMG7QXIZS4308T"

def _request(symbol, req_type):
    with urllib.request.urlopen(QUERY_URL.format(REQUEST_TYPE=req_type, KEY=API_KEY, SYMBOL=symbol)) as req:
        data = req.read().decode("UTF-8")
    return data

def get_daily_data(symbol):
    return json.loads(_request(symbol, 'TIME_SERIES_DAILY'))

In [50]:
json_data = get_daily_data('AAPL')

In [51]:
json_data.keys()

dict_keys(['Meta Data', 'Time Series (Daily)'])

In [119]:
apple = pd.DataFrame(json_data['Time Series (Daily)']).T
apple.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. volume
2020-01-06,293.79,299.96,292.75,299.8,29622057
2020-01-03,297.15,300.58,296.5,297.43,36633878
2020-01-02,296.24,300.6,295.19,300.35,33911864
2019-12-31,289.93,293.68,289.52,293.65,25247625
2019-12-30,289.46,292.69,285.22,291.52,36059614


In [120]:
apple.rename_axis("date", axis='index', inplace=True)

In [121]:
apple_df = apple.rename(columns={'1. open':'open', '2. high': 'high', '3. low': 'low', '4. close': 'close', '5. volume': 'volume'})
apple_df.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-06,293.79,299.96,292.75,299.8,29622057
2020-01-03,297.15,300.58,296.5,297.43,36633878
2020-01-02,296.24,300.6,295.19,300.35,33911864
2019-12-31,289.93,293.68,289.52,293.65,25247625
2019-12-30,289.46,292.69,285.22,291.52,36059614


In [122]:
apple_df.reset_index('date', inplace = True)

In [123]:
apple_df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume'], dtype='object')

In [124]:
apple_df['date'] = pd.to_datetime(apple_df['date'])
apple_data = apple_df[apple_df['date'] > '2016-12-31']


In [125]:
apple_data.tail()

Unnamed: 0,date,open,high,low,close,volume
752,2017-01-09,117.95,119.43,117.94,118.99,33561948
753,2017-01-06,116.78,118.16,116.47,117.91,31751900
754,2017-01-05,115.92,116.8642,115.81,116.61,22193587
755,2017-01-04,115.85,116.51,115.75,116.02,21118116
756,2017-01-03,115.8,116.33,114.76,116.15,28781865


In [94]:
apple_data.to_csv('../ML-Project/apple.csv', header=True, index=True)

# Linear Regression

Introduction

The most basic machine learning algorithm that can be implemented on this data is linear regression. The linear regression model returns an equation that determines the relationship between the independent variables and the dependent variable.

The equation for linear regression can be written as:

Here, x1, x2,….xn represent the independent variables while the coefficients θ1, θ2, …. θn  represent the weights. You can refer to the following article to study linear regression in more detail:


we do not have a set of independent variables. We have only the dates instead. Let us use the date column to extract features like – day, month, year,  mon/fri etc. and then fit a linear regression model.

Implementation

We will first sort the dataset in ascending order and then create a separate dataset so that any new feature created does not affect the original data.

In [158]:
apple_data.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume'], dtype='object')

In [159]:
#setting index as date values
#apple_data['date'] = pd.to_datetime(apple_data.date,format='%Y-%m-%d')
#apple_data.index = apple_data['date']
#sorting
data = apple_data.sort_index(ascending=True, axis=0)
#creating a separate dataset
new_data = pd.DataFrame(index=range(0,len(apple_data)),columns=['date', 'close'])

for i in range(0,len(data)):
    new_data['date'][i] = data['date'][i]
    new_data['close'][i] = data['close'][i]






In [160]:
new_data.head()
new_data['date'].dt.day_name()
new_data['dow'] = new_data['date'].dt.day_name()


In [161]:
new_data.head()

Unnamed: 0,date,close,dow
0,2017-01-03 00:00:00,116.15,Tuesday
1,2017-01-04 00:00:00,116.02,Wednesday
2,2017-01-05 00:00:00,116.61,Thursday
3,2017-01-06 00:00:00,117.91,Friday
4,2017-01-09 00:00:00,118.99,Monday


Apart from this, we can add our own set of features that we believe would be relevant for the predictions. 
For instance, my hypothesis is that the first and last days of the week could potentially affect the closing price of the stock far more than the other days.
So I have created a feature that identifies whether a given day is Monday/Friday or Tuesday/Wednesday/Thursday. This can be done using the following lines of code:

If the day of week is equal to 0 or 4, the column value will be 1, otherwise 0. Similarly, you can create multiple features.
If you have some ideas for features that can be helpful in predicting stock price, please share in the comment section.

In [162]:
new_data['mon_fri'] = 0
for i in range(0,len(new_data)):
    if (new_data['dow'][i] == 0 or new_data['dow'][i] == 4):
    
        new_data['mon_fri'][i] = 1
    else:
        new_data['mon_fri'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [163]:
#split into train and validation
train = new_data[:756]
test = new_data[756:]
x_train = train.drop('close', axis=1)
y_train = train['close']
x_test = test.drop('close', axis=1)
y_test = test['close']

#implement linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train,y_train)

TypeError: float() argument must be a string or a number, not 'Timestamp'