In [1]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import json
import matplotlib.pyplot as plt
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Time series decomposition
#!py -m pip install stldecompose
#from stldecompose import decompose

# Chart drawing
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

pio.renderers.default='iframe'

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Show charts when running kernel
# init_notebook_mode(connected=True)

# Change default background color for all visualizations
layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(250,250,250,0.8)')
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'

In [2]:
import yfinance as yf  # for downloading stock data
from pandas.api.types import is_numeric_dtype
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

dataframe = yf.download("INTC", period="max")
#print ("shape = ", dataframe.shape)
#print(dataframe.head())

# we don't need Adj Close column
dataframe = dataframe.drop(columns=['Adj Close'])

# eliminate data before 2015
dataframe['Date'] = pd.to_datetime(dataframe.index)
dataframe = dataframe[(dataframe['Date'].dt.year >= 2015)].copy()
dataframe = dataframe.set_index('Date')

# In addition to the index column called 'Date', keep a new column called Date for future use
dataframe['Date'] = pd.to_datetime(dataframe.index)

print ("shape = ", dataframe.shape)
print(dataframe.head())

[*********************100%%**********************]  1 of 1 completed

shape =  (2412, 6)
                 Open       High        Low      Close    Volume       Date
Date                                                                       
2015-01-02  36.669998  37.160000  36.110001  36.360001  23605600 2015-01-02
2015-01-05  36.200001  36.450001  35.939999  35.950001  32785000 2015-01-05
2015-01-06  36.080002  36.230000  35.259998  35.279999  30847600 2015-01-06
2015-01-07  35.639999  36.070000  35.270000  36.020000  27725200 2015-01-07
2015-01-08  36.369999  37.000000  36.259998  36.689999  31765400 2015-01-08





In [3]:
dataframe['Date'] = pd.to_datetime(dataframe['Date'], dayfirst=True)

#drop duplicates
dataframe.sort_values('Date')
dataframe.drop_duplicates('Date', keep='last', inplace=True)

#use Date as the index
#dataframe = dataframe.set_index('Date')
dataframe.index = pd.to_datetime(dataframe['Date'], dayfirst=True)
dataframe.sort_index(inplace=True)


ValueError: 'Date' is both an index level and a column label, which is ambiguous.

In [None]:
# Next, there could be gaps in the observations. Let's fill the gaps by interpolating from the neighboring available data.

from scipy.interpolate import interp1d

# this is a simple way of back-filling and front-filling 
# dataframe = dataframe.resample('H').bfill()
# dataframe = dataframe.resample('H').ffill()

# a better approach is to use interpolation
index = pd.date_range(dataframe.Date.values.min(), dataframe.Date.values.max(), freq='H')
interp_func = interp1d(dataframe.Date.values.astype(float), dataframe.Open.values, kind = 'linear', fill_value = 'extrapolate')
interp_values = interp_func(index.values.astype(float))

# re-create a new DataFrame
dataframe = pd.DataFrame({'Open': interp_values}, index=index)

# re-insert the Date column
dataframe['Date'] = dataframe.index


In [None]:
# plot the series
plt.figure(figsize=(20,8))
plt.plot(dataframe['Open'])

In [None]:
# decompose the series. We have hourly readings that repeat with a yearly seasonality. So use period = 365 * 12
from statsmodels.tsa.seasonal import seasonal_decompose


decomposed = seasonal_decompose(x=dataframe['Open'], model='additive', period=365*24)


decomposed.plot()
plt.show()

In [None]:
linear_interpolation_model = interp1d(decomposed.trend.index.values.astype(float), decomposed.trend.values, kind = "linear", fill_value = 'extrapolate')
X=np.linspace(decomposed.trend.index.values.astype(float).min(), decomposed.trend.index.values.astype(float).max(), 10)
Y=linear_interpolation_model(X)
plt.plot(X, Y)
plt.title("Smoothened Trend Curve")
plt.show()

In [None]:
# use a different method for decomposing the series. Note, this can take several minutes to complete.
#from statsmodels.tsa.seasonal import STL
#stl = STL(dataframe['Open'], period = 24*364)
#res = stl.fit()
#fig = res.plot()