# **Stock Price**

In [None]:
import time
import numpy as np
import pandas as pd
import pandas_datareader as pdr

from keras.layers import LSTM
from keras.models import Sequential
from keras.layers.wrappers import TimeDistributed
from keras.layers.core import Dense, Activation, Dropout

from sklearn.preprocessing import MinMaxScaler



import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, GroupKFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import *


import xgboost as xgb


from sklearn.preprocessing import PowerTransformer


from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels import tsa
from scipy import stats

import sys, os
import random 

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
from IPython import display, utils


def set_seed(seed=2020):
    """Seed the Python and NumPy random generators and pin PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and, as a
            string, to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once with the default so the cells below are repeatable.
set_seed()

In [None]:
import pandas_datareader as pdr
def get_raw_data(index_name,retry_attempts = 3):
    """Download price history for a ticker and fill it out to every calendar day.

    The frame returned by ``pdr.get_data_yahoo`` is reindexed onto a daily
    date range spanning its first to last date; days missing from the
    download carry the previous row forward.

    Args:
        index_name: Ticker symbol passed to ``pdr.get_data_yahoo``. A falsy
            value is rejected with a printed message.
        retry_attempts: Maximum number of download attempts.

    Returns:
        The daily, forward-filled DataFrame, or ``None`` when ``index_name``
        is falsy or every attempt failed.
    """
    if not index_name:
        print("Invalid usage. Parameter index_name is required")
        return None

    while retry_attempts > 0:
        try:
            df = pdr.get_data_yahoo(index_name)
            full_range = pd.date_range(df.index.min(),
                                       df.index.max(),
                                       freq='D')
            return df.reindex(index=full_range).ffill()
        except Exception:
            # Exception (not a bare except) so Ctrl-C can still interrupt
            # the retry loop. Decrement first so the message reports the
            # attempts that are actually left.
            retry_attempts -= 1
            print("Data pull failed. {} retry attempts remaining".\
                  format(retry_attempts))
    return None

In [None]:
# Download the '^GSPC' history and keep its Close column for plotting and,
# later, for the sequence model.
sp_df = get_raw_data('^GSPC')
if sp_df is None:
    # get_raw_data returns None once every attempt has failed; stop here with
    # a clear message instead of an AttributeError on the next line.
    raise RuntimeError("Download of '^GSPC' failed: get_raw_data returned None")
sp_close_series = sp_df.Close
sp_close_series.plot(figsize=(15, 7), color = 'teal')
sp_df.head()

In [None]:
# Print pandas' summary of the downloaded frame (DataFrame.info).
sp_df.info()

In [None]:
# Move the date index into an ordinary column, modifying sp_df in place,
# then display the frame.
sp_df.reset_index(inplace=True)
sp_df

In [None]:
# Show the column labels as they stand after reset_index.
sp_df.columns

In [None]:
# get_raw_data reindexes onto an unnamed date range, so reset_index() stores
# the dates in a column called 'index'. Rename only that column: overwriting
# every label positionally would silently mislabel the price columns if the
# data source ever returned them in a different order.
sp_df.rename(columns={'index': 'Date'}, inplace=True)

sp_df.head()

In [None]:
# First and last date covered by the data.
sp_df.Date.min(), sp_df.Date.max()

In [None]:
# Work on an independent copy holding only the date, close and volume columns.
feats = ['Date', 'Close', 'Volume']
train= sp_df[feats].copy()
train.head()

In [None]:
# Daily close series used for the seasonal decomposition below.
ts_series = train.Close

In [None]:
from pylab import rcParams
# Default figure size and line colour for the plots that follow.
rcParams['figure.figsize'] = 17,15
rcParams['lines.color'] = 'teal'


# Additive decomposition of the daily close, using a period of 30 observations.
result = seasonal_decompose(ts_series, model='additive', period=30)
sns.set()

# Applied after sns.set(), so 'bmh' is the last style selected before plotting.
plt.style.use('bmh')
result.plot()

plt.show()

In [None]:
# Check the dtype of Date; the .dt accessors used next require a datetime column.
train.Date.dtypes

### Building blocks for downsampling

In [None]:
# Calendar components of every row, used to build the downsampling keys.
train['year'] = train.Date.dt.year
train['month'] = train.Date.dt.month
train['day'] = train.Date.dt.day
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. The cast keeps a plain integer
# column (isocalendar returns the nullable UInt32 dtype).
train['week'] = train.Date.dt.isocalendar().week.astype(int)
train['quarter'] = train.Date.dt.quarter
train.head()

In [None]:
# Year + two-digit month. Without zero padding the keys compare out of
# calendar order as strings (e.g. '202010' sorts before '20202').
train['month_block'] = train['year'].astype(str) + train['month'].astype(str).str.zfill(2)
train.head(20)

In [None]:
# One block per (year, month, ISO week), as before, but keyed by the first
# date the block contains, formatted YYYYMMDD. The previous key glued the
# unpadded numbers together, so string order was not calendar order
# (e.g. '20191040' sorted before '201911'); every later step that sorts or
# encodes this key then worked on shuffled weeks. A date-based key sorts
# chronologically, including ISO week 1 / 52 / 53 at year boundaries.
block_start = train.groupby(['year', 'month', 'week'])['Date'].transform('min')
train['week_block'] = block_start.dt.strftime('%Y%m%d')
train.head(20)

In [None]:
# Number of distinct month blocks.
train['month_block'].unique().size

In [None]:
# Number of distinct week blocks.
train['week_block'].unique().size

## **Frequency: Week**

In [None]:
# Downsample to one row per week_block: the mean daily close of that block.
# groupby sorts its keys, so rows come out in string order of week_block.
x = train.groupby(['week_block'])['Close'].mean().rename('mean_Close').reset_index()
x.head()

In [None]:
# (rows, columns) of the weekly frame.
x.shape

In [None]:
# Smallest and largest week_block key; the keys are strings, so this is a string comparison.
x.week_block.min(), x.week_block.max()

In [None]:
def build_lagandroll(df,target,  width = (2, 3)):
    """Return a copy of ``df`` extended with lag and rolling features of ``target``.

    For every window size ``c`` in ``width`` the result gains:

    * ``lag_c``: ``target`` shifted down by ``c`` rows;
    * ``rollc_min``, ``rollc_mean``, ``rollc_max``, ``rollc_var``: min, mean,
      max and standard deviation of the ``c`` values that precede the row.

    The rolling windows are taken over ``target.shift(1)``, so a row's
    features never include that row's own target value. Rolling over the
    unshifted series made ``roll1_*`` equal to the target itself.

    Args:
        df: Feature frame; it is not modified.
        target: Series aligned with ``df`` by index.
        width: Iterable of window sizes.

    Returns:
        A new DataFrame holding the columns of ``df`` plus the new features.
    """
    # Work on a copy so the caller's frame does not gain lag columns.
    features = df.copy()
    # Values strictly before each row; shared by every rolling window.
    history = target.shift(1)

    for c in width:
        features['lag_'+str(c)] = target.shift(c)
        window = history.rolling(window=c)
        rolled = pd.concat([window.min(), window.mean(), window.max(), window.std()], axis=1)
        # NOTE: the '_var' column holds the standard deviation; the label is
        # kept so existing column names do not change.
        rolled.columns = ['roll'+str(c)+'_min', 'roll'+str(c)+'_mean', 'roll'+str(c)+'_max', 'roll'+str(c)+'_var']
        features = pd.concat([features, rolled], axis=1)
    return features

# Take the target column out of x (pop removes it from x in place).
target = x.pop('mean_Close')

# Lag and rolling-window features for windows of 1, 2, 3 and 5 blocks.
df = build_lagandroll(x, target, width=[1, 2,3, 5])

# Frame shape and number of distinct blocks, displayed together as a tuple.
df.shape, df.week_block.unique().size, 

In [None]:
# Histogram and KDE of the weekly mean close, with a fitted normal curve overlaid.
# NOTE(review): distplot is deprecated in recent seaborn releases; confirm the
# installed version still provides it.
sns.set()
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 7))
sns.distplot(target,bins=50, fit=norm, kde=True, color='teal')

In [None]:
# Same distribution plot after a square-root transform of the target.
# NOTE(review): distplot is deprecated in recent seaborn releases; confirm the
# installed version still provides it.
sns.set()
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 7))
sns.distplot(np.sqrt(target),bins=50, fit=norm, kde=True, color='m')

In [None]:
# First ten rows; the earliest rows hold NaN wherever a lag or window lacks history.
(df.head(10))

In [None]:
# (rows, columns) of the feature frame.
df.shape

In [None]:
# Replace the string block key with integer codes. LabelEncoder numbers the
# distinct values in sorted order, so the codes follow string order of the key.
le = LabelEncoder()
df['week_block'] = le.fit_transform(df.week_block.astype(str))
df.week_block.head()

In [None]:
# Summary statistics of the integer block codes.
df.week_block.describe()

In [None]:
# Put the target back next to the features so rows can be filtered together.
df = pd.concat([df, target], axis=1)

# Blocks with an encoded key below 300 are used for training ...
train = df.loc[df.week_block < 300].copy()
y_train = train.pop('mean_Close')

# ... and the remaining blocks for validation.
val = df.loc[df.week_block >= 300].copy()
y_val = val.pop('mean_Close')

train.shape, y_train.shape, val.shape, y_val.shape

In [None]:
# The block code was only needed for the split; drop it from both feature sets.
del train['week_block']
del val['week_block']

In [None]:
import xgboost as xgb


# Wrap the feature frames and their targets in xgboost's DMatrix container.
dxtrain = xgb.DMatrix(train, label=y_train)
dxtest = xgb.DMatrix(val, label=y_val)

xgb_params = {
    # Squared-error regression. 'reg:linear' is the deprecated alias of this
    # objective and warns or fails on current xgboost releases.
    'objective': 'reg:squarederror',
    'booster':'gbtree',
    'max_depth':5,

    'eta':0.01,
    'subsample':0.7,
    'colsample_bytree':0.7,
    #'lambda':2,
    'alpha':2,
    'gamma':1
}
xgb_params['eval_metric'] = ['rmse']
num_rounds = 2000
# Both sets are reported every 100 rounds; training stops once the monitored
# RMSE has not improved for 150 rounds.
watchlist  = [(dxtrain,'train'), (dxtest,'test')]
model = xgb.train(xgb_params, dxtrain, num_rounds, watchlist, verbose_eval=100, early_stopping_rounds=150)

In [None]:
# Report the tree count selected by early stopping and predict the validation
# rows using only those trees.
# NOTE(review): best_ntree_limit / ntree_limit belong to the older xgboost
# API; confirm they exist in the installed version.
print(model.best_ntree_limit)
xgb_pred = model.predict(dxtest,ntree_limit=model.best_ntree_limit)

In [None]:
# Display the predicted weekly mean closes.
xgb_pred

In [None]:
# Display the actual weekly mean closes for comparison.
y_val

In [None]:
# Actual versus predicted weekly mean close over the validation rows.
plt.style.use('fivethirtyeight')

plt.figure(figsize=(20, 6))
plt.plot(val.index, y_val, 'k', label = 'Actuals', linewidth=7)
plt.plot(val.index, xgb_pred, 'darkred', label = 'Predicted', linewidth=7)
# The labels above are only drawn once a legend is requested.
plt.legend()
plt.show()

## **Sequence Modeling**

In [None]:
# Fraction of the series used for training the sequence model.
TRAIN_PERCENT = 0.9
def get_seq_train_test(time_series, scaling=True,train_size=0.9):
    """Split a series chronologically into train and test parts, optionally scaled.

    With ``scaling`` enabled the series becomes a column vector and is scaled
    with a ``MinMaxScaler`` fitted on the training part only, so the test
    range does not leak into the scaling. Test values can therefore fall
    outside ``[0, 1]``.

    Args:
        time_series: Ordered sequence of observations.
        scaling: When true, min-max scale the data; otherwise slice the
            input unchanged.
        train_size: Fraction of observations assigned to the training part.

    Returns:
        ``(train, test, scaler)``; ``scaler`` is ``None`` when ``scaling`` is
        false.
    """
    scaler = None
    if scaling:
        series = np.array(time_series).reshape(-1,1)
    else:
        series = time_series

    split_at = int(len(series) * train_size)

    if scaling:
        scaler = MinMaxScaler(feature_range=(0, 1))
        # Fit on the training rows only. With an empty training part, fall
        # back to the whole series so the call still succeeds.
        fit_rows = series[:split_at] if split_at > 0 else series
        scaler.fit(fit_rows)
        series = scaler.transform(series)

    train = series[0:split_at]
    test = series[split_at:len(series)]

    return train,test,scaler


# Scale and split the daily close chronologically.
train, test, scaler = get_seq_train_test(
    sp_close_series, scaling=True, train_size=TRAIN_PERCENT)

# Each part becomes a single sample: (1 sample, all time steps, 1 feature).
train = train.reshape(1, train.shape[0], 1)
test = test.reshape(1, test.shape[0], 1)

# Inputs are steps 0..T-2 and targets are steps 1..T-1, so every target is
# the value one step after its input.
train_x, train_y = train[:, :-1, :], train[:, 1:, :]
test_x, test_y = test[:, :-1, :], test[:, 1:, :]

print("Data Split Complete")

print("train_x shape={}".format(train_x.shape))
print("train_y shape={}".format(train_y.shape))
print("test_x shape={}".format(test_x.shape))
print("test_y shape={}".format(test_y.shape))

In [None]:
# Whether building the model prints compile time and a summary.
VERBOSE = True
def get_seq_model(hidden_units=7,input_shape=(1,1),verbose=False):
    """Build and compile an LSTM that emits one value per time step.

    Args:
        hidden_units: Number of LSTM units.
        input_shape: ``(timesteps, features)`` of one input sample.
        verbose: When true, print the compile time and the model summary.

    Returns:
        The compiled ``Sequential`` model (MSE loss, Adam optimizer).
    """
    model = Sequential()

    # Input is samples * timesteps * features; return_sequences=True keeps
    # the output of every time step for the readout layer below.
    model.add(LSTM(input_shape=input_shape,
                   units = hidden_units,
                   return_sequences=True
    ))

    # Readout layer: TimeDistributed applies the same Dense(1) to every
    # time step.
    model.add(TimeDistributed(Dense(1)))
    start = time.time()

    model.compile(loss="mse", optimizer="adam")

    if verbose:
        print("> Compilation Time : ", time.time() - start)
        # summary() prints the table itself and returns None, so wrapping
        # it in print() would add a stray "None" line.
        model.summary()

    return model




# Build the model for the training sequence length; one retry on failure.
seq_lstm_model=None
try:
    seq_lstm_model = get_seq_model(input_shape=(train_x.shape[1],1),
                                                verbose=VERBOSE)
except Exception as build_error:
    # Exception (not a bare except) so Ctrl-C is not swallowed, and the
    # cause is reported before the retry.
    print("Model Build Failed. Trying Again")
    print("Reason: {!r}".format(build_error))
    seq_lstm_model = get_seq_model(input_shape=(train_x.shape[1],1),
                                                verbose=VERBOSE)


In [None]:
# Train for 150 epochs. train_x holds a single sample (the whole training
# sequence), so each epoch sees that one sequence.
seq_lstm_model.fit(train_x, train_y, 
               epochs=150, batch_size=8, 
               verbose=1)
print("Model Fit Complete")

In [None]:
import math
from sklearn.metrics import mean_squared_error
# In-sample fit. The RMSE is measured on the scaled values; the inverse
# transform is only applied further down.
trainPredict = seq_lstm_model.predict(train_x)
trainScore = math.sqrt(mean_squared_error(train_y[0], trainPredict[0]))
print('Train Score: %.2f RMSE' % (trainScore))

In [None]:
from keras.preprocessing.sequence import pad_sequences

# The model was built for the training sequence length, so the shorter test
# sequence is padded at the end (padding='post') up to that length.
# NOTE: despite its name, testPredict holds the padded *inputs* at this point.
testPredict = pad_sequences(test_x,
                                maxlen=train_x.shape[1],
                                padding='post',
                                dtype='float64')


In [None]:
# Predict on the padded test input; testPredict now holds model outputs.
testPredict = seq_lstm_model.predict(testPredict)

# evaluate performances
# Only the first test_x.shape[1] steps are real data; the rest are padding.
# As with the train score, the RMSE is on the scaled values.
testScore = math.sqrt(mean_squared_error(test_y[0], 
                                         testPredict[0][:test_x.shape[1]]))
print('Test Score: %.2f RMSE' % (testScore))

In [None]:
# Flatten the predictions to column vectors and map them back to price units.
trainPredict = scaler.inverse_transform(trainPredict.reshape(-1, 1))
testPredict = scaler.inverse_transform(testPredict.reshape(-1, 1))

In [None]:
# trainPredict has one row fewer than the training part, because train_x
# drops the last step and the first prediction targets the second
# observation. Adding 1 recovers the training length.
train_size = len(trainPredict)+1
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
plt.plot(sp_close_series.index,
         sp_close_series.values,c='grey',
         alpha=0.5,label='True Data')
# Training predictions line up with observations 1 .. train_size-1.
plt.plot(sp_close_series.index[1:train_size],
         trainPredict,label='Training Fit', c='k')
# Test predictions start one step into the test part; only the first
# test_x.shape[1] rows are kept, the remainder came from padding.
plt.plot(sp_close_series.index[train_size+1:],
         testPredict[:test_x.shape[1]],label='Testing Forecast', c='darkred', linewidth=4)
plt.title('Forecast Plot')
plt.legend()
plt.show()