In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Load the GME stock CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/gamestop-historical-stock-prices/GME_stock.csv')

In [None]:
# Preview the first rows to check which columns were parsed from the CSV.
data.head()

# Basic EDA

In [None]:
# Build a profiling report object for the raw frame (displayed in the next cell).
profile = ProfileReport(data, title="Pandas Profiling Report")

In [None]:
# Display the profiling report as the cell output.
profile

In [None]:
# Print column dtypes and non-null counts.
data.info()

In [None]:
# Parse the 'date' column as datetimes and promote it to the frame's index.
data = data.assign(date=pd.to_datetime(data['date'])).set_index('date')
data.head()

In [None]:
# Show the index and how many distinct dates it contains.
print(data.index)
n_unique_dates = data.index.unique().size
print('\nUnique dates in our data: ', n_unique_dates, 'Days')

Since we have now created a column for each category, we can see there are no longer any repeated values in the DatetimeIndex.

## Generating a complete Index and Setting Frequency
Since we are using daily data, we would like to set a daily frequency. We see our data has a length of 4773 days. By subtracting the smallest date from the largest date, we can tell there are some days missing:

In [None]:
# Compare the number of distinct dates with the calendar span they cover.
n_unique_dates = data.index.unique().size
print('\nUnique dates in our data: ', n_unique_dates, 'Days')

# Timedelta between the first and last date; .days gives the span in whole days
first_day, last_day = data.index.min(), data.index.max()
our_date_range = last_day - first_day
print('Total days in our date range:', our_date_range.days, 'Days')

In [None]:
# Build a gap-free daily calendar covering the whole date span.
# Dates that are absent from the source data are left as NaN in the price
# columns rather than being filled with a fictitious price of 0; only
# 'volume' is set to 0 on those dates. Sums over the padded frame are
# unchanged, because NaN is skipped when summing.
new_index = pd.date_range(data.index.min(), data.index.max())
data_new = data.reindex(new_index)
data_new['volume'] = data_new['volume'].fillna(0).astype(data['volume'].dtype)
data_new

In [None]:
# Weekly / monthly / quarterly / annual summaries.
# Prices are averaged and only 'volume' is summed: adding prices together
# produces a number that mostly reflects how many days fall in the period.
# The original `data` rows are resampled (not the padded `data_new`) so that
# the filler rows for missing dates cannot drag the averages down.
# The `sales_*` names are kept because later cells refer to them.
numeric_data = data.select_dtypes(include='number')
agg_rules = {
    col: ('sum' if col == 'volume' else 'mean')
    for col in numeric_data.columns
}

sales_weekly = numeric_data.resample('W').agg(agg_rules)
print('Weekly Sales')
print(sales_weekly.head(), '\n')

sales_monthly = numeric_data.resample('M').agg(agg_rules)
print('Monthly Sales')
print(sales_monthly.head(), '\n')

sales_quarterly = numeric_data.resample('Q').agg(agg_rules)
print('Quarterly Sales')
print(sales_quarterly.head(), '\n')

sales_annual = numeric_data.resample('Y').agg(agg_rules)
print('Annual Sales')
print(sales_annual.head())

In [None]:
# Overlay the quarterly, monthly and weekly close-price series on one axes.
# Each line is labelled and a legend is drawn so the three can be told apart.
ax = sales_quarterly['close_price'].plot(figsize=(13,5), label='Quarterly')
sales_monthly['close_price'].plot(ax=ax, label='Monthly')
sales_weekly['close_price'].plot(ax=ax, label='Weekly', title='Close Price')
ax.legend();

In [None]:
# Overlay the quarterly, monthly and weekly open-price series on one axes.
# Each line is labelled and a legend is drawn so the three can be told apart.
ax = sales_quarterly['open_price'].plot(figsize=(13,5), label='Quarterly')
sales_monthly['open_price'].plot(ax=ax, label='Monthly')
sales_weekly['open_price'].plot(ax=ax, label='Weekly', title='Open Price')
ax.legend();

In [None]:
# NOTE(review): this cell plots the same 'open_price' series as the cell
# before it; it may have been meant for a different column - confirm or remove.
# Each line is labelled and a legend is drawn so the three can be told apart.
ax = sales_quarterly['open_price'].plot(figsize=(13,5), label='Quarterly')
sales_monthly['open_price'].plot(ax=ax, label='Monthly')
sales_weekly['open_price'].plot(ax=ax, label='Weekly', title='Open Price')
ax.legend();

In [None]:
# Line chart of every column, with the x-axis limited to the
# December 2020 - January 2021 window.
window_start, window_end = '2020-12-01', '2021-01-28'
fig = px.line(
    data,
    x=data.index,
    y=data.columns,
    range_x=[window_start, window_end],
    title='Plot of values for December 20 and January 21',
)
fig.show()

In [None]:
# Add a 'total_amount_traded' column (open price x volume) and chart it
# with a range slider and quick range-selector buttons.
data['total_amount_traded'] = data['open_price'] * data['volume']

range_buttons = [
    {'count': 1, 'label': "1m", 'step': "month", 'stepmode': "backward"},
    {'count': 6, 'label': "6m", 'step': "month", 'stepmode': "backward"},
    {'count': 1, 'label': "1y", 'step': "year", 'stepmode': "backward"},
    {'step': "all"},
]

fig = px.line(
    data,
    x=data.index,
    y=data['total_amount_traded'],
    title='Plot of total amount traded',
)
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector={'buttons': range_buttons},
)
fig.show()

In [None]:
# Exponentially weighted moving average (span of 12 rows) of the opening price.
data['EWMA12'] = data['open_price'].ewm(span=12).mean()

# Plot the raw opening price together with its moving average. The previous
# version passed only the 'EWMA12' column as the frame but plotted
# 'open_price', so the moving average itself was never drawn.
fig = px.line(data, x=data.index, y=['open_price', 'EWMA12'],
              title='Moving average of opening price')
fig.show()

In [None]:
# Candlestick chart built from the open / high / low / close price columns.
candles = go.Candlestick(
    x=data.index,
    open=data['open_price'],
    high=data['high_price'],
    low=data['low_price'],
    close=data['close_price'],
)
fig = go.Figure(data=[candles])

fig.show()

In [None]:
# Scatter and density plots
def plotScatterMatrix(data, plotSize, textSize):
    """Draw a scatter matrix with KDE diagonals and annotated correlations.

    Only numeric columns without missing values and with more than one
    distinct value are used, and at most the first 10 of those.

    Parameters
    ----------
    data : DataFrame to plot.
    plotSize : width and height of the (square) figure, passed to `figsize`.
    textSize : font size of the correlation annotations.
    """
    data = data.select_dtypes(include=[np.number])  # keep only numerical columns
    # Drop columns containing NaN. `axis` is passed by keyword because newer
    # pandas versions no longer accept it positionally.
    data = data.dropna(axis='columns')
    # Constant columns would make the kernel density estimate singular
    data = data[[col for col in data if data[col].nunique() > 1]]
    # Cap the number of columns to keep the matrix readable and the KDEs cheap
    columnNames = list(data)[:10]
    data = data[columnNames]
    ax = pd.plotting.scatter_matrix(data, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = data.corr().values
    # Annotate each panel above the diagonal with its correlation coefficient.
    # numpy is used directly instead of through pyplot's internal `np` import.
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [None]:
# Scatter matrix of the numeric columns: 18x18 figure, annotation font size 10.
plotScatterMatrix(data, 18, 10)

# Model development

## LSTM

In [None]:
# Importing required libraries
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, SimpleRNN, Activation
from sklearn.metrics import mean_squared_error

In [None]:
# Build a two-column frame (Date, Close) from the chronologically sorted data.
# The frame is constructed in one step instead of being filled row by row:
# the chained `frame[col][i] = value` assignment is slow, raises
# SettingWithCopy warnings, and does not write through under copy-on-write.
data = data.sort_index(ascending=True, axis=0)
train_data = pd.DataFrame({
    'Date': data.index,
    'Close': data['close_price'].to_numpy(),
})
train_data.head()

In [None]:
# Make sure 'Date' holds datetimes, then use it as the index.
train_data = (
    train_data
    .assign(Date=pd.to_datetime(train_data['Date']))
    .set_index('Date')
)
train_data.head()

In [None]:
# Split the raw values into training and validation arrays.
# NOTE(review): `valid` starts at row 1000 while `train` runs to row 3773, so
# the two ranges overlap and the model is evaluated largely on rows it was
# trained on. A hold-out split would start `valid` at row 3773, but the
# prediction and plotting cells further down hard-code the same 1000 offset
# and would have to change together with this one.
dataset = train_data.values

train, valid = dataset[:3773], dataset[1000:]

In [None]:
# Feature scaling
# Fit the scaler on the training rows only, so the price range of later rows
# does not leak into the scaling, then apply it to the whole series. Values
# outside the training range will therefore fall outside [0, 1].
# `dataset[:len(train)]` is used instead of `train` because a later cell
# rebinds `train` to a DataFrame.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset[:len(train)])
scaled_data = scaler.transform(dataset)

In [None]:
# Build supervised samples: each input is the previous 60 scaled values and
# the target is the value that follows them.
window_ends = range(60, len(train))
x_train = np.array([scaled_data[end - 60:end, 0] for end in window_ends])
y_train = np.array([scaled_data[end, 0] for end in window_ends])

# Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1)
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)

In [None]:
# Three stacked 50-unit LSTM layers, each followed by 20% dropout, and a
# single-unit dense output; trained with Adam on mean squared error.
regressor = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50),
    Dropout(0.2),
    Dense(units=1),
])

regressor.compile(optimizer='adam', loss='mean_squared_error')

regressor.fit(x_train, y_train, epochs=100, batch_size=32)

In [None]:
# Take the rows covering the validation period plus the 60 rows before it
# (needed to form the first window), as a single scaled column.
first_row = len(train_data) - len(valid) - 60
raw_inputs = train_data[first_row:].values.reshape(-1, 1)
inputs = scaler.transform(raw_inputs)

In [None]:
# One 60-step window per row to predict.
X_test = np.array([inputs[end - 60:end, 0] for end in range(60, inputs.shape[0])])

In [None]:
# Add the trailing feature axis, predict, and map the predictions back to
# the original price scale.
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
scaled_pred = regressor.predict(X_test)
closing_price_pred = scaler.inverse_transform(scaled_pred)

In [None]:
# Root-mean-squared error between the actual closes and the predictions.
# The actual values are read from train_data rather than from `valid`,
# because the plotting cell that follows rebinds `valid` to a DataFrame with
# an extra 'Predictions' column, which would silently change this result if
# the cell were run again.
actual_close = train_data['Close'].to_numpy(dtype=float)[-len(closing_price_pred):].reshape(-1, 1)
rms = np.sqrt(np.mean((actual_close - closing_price_pred) ** 2))
rms

In [None]:
# Plot the training closes together with the actual and predicted closes.
train = train_data[:3773]
# Take the rows the predictions refer to (the last len(closing_price_pred)
# rows) and copy them, so adding a column does not write into a slice of
# train_data (SettingWithCopyWarning).
valid = train_data[-len(closing_price_pred):].copy()
valid['Predictions'] = closing_price_pred.ravel()
plt.plot(train['Close'], label='Train close')
plt.plot(valid['Close'], label='Actual close')
plt.plot(valid['Predictions'], label='Predicted close')
plt.legend();

In [None]:
# Number of rows in the validation frame.
len(valid)

## Simple RNN

In [None]:
# Three stacked 50-unit SimpleRNN layers, each followed by 20% dropout, and a
# single-unit dense output; trained with Adam on mean squared error.
regressor = Sequential([
    SimpleRNN(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    Dropout(0.2),
    SimpleRNN(units=50, return_sequences=True),
    Dropout(0.2),
    SimpleRNN(units=50),
    Dropout(0.2),
    Dense(units=1),
])

regressor.compile(optimizer='adam', loss='mean_squared_error')

regressor.fit(x_train, y_train, epochs=100, batch_size=32)

In [None]:
# Take the rows covering the validation period plus the 60 rows before it
# (needed to form the first window), as a single scaled column.
first_row = len(train_data) - len(valid) - 60
raw_inputs = train_data[first_row:].values.reshape(-1, 1)
inputs = scaler.transform(raw_inputs)

In [None]:
# One 60-step window per row to predict.
X_test = np.array([inputs[end - 60:end, 0] for end in range(60, inputs.shape[0])])

In [None]:
# Add the trailing feature axis, predict, and map the predictions back to
# the original price scale.
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
scaled_pred = regressor.predict(X_test)
closing_price_pred = scaler.inverse_transform(scaled_pred)

In [None]:
# Root-mean-squared error between the actual closes and the RNN predictions.
# By this point `valid` is the DataFrame created by the LSTM plotting cell and
# still carries that model's 'Predictions' column, so subtracting from the
# whole frame would also score the LSTM predictions against the RNN ones.
# Only the actual 'Close' values, read from train_data, are compared here.
actual_close = train_data['Close'].to_numpy(dtype=float)[-len(closing_price_pred):].reshape(-1, 1)
rms = np.sqrt(np.mean((actual_close - closing_price_pred) ** 2))
rms

In [None]:
# Plot the training closes together with the actual and predicted closes.
train = train_data[:3773]
# Take the rows the predictions refer to (the last len(closing_price_pred)
# rows) and copy them, so adding a column does not write into a slice of
# train_data (SettingWithCopyWarning).
valid = train_data[-len(closing_price_pred):].copy()
valid['Predictions'] = closing_price_pred.ravel()
plt.plot(train['Close'], label='Train close')
plt.plot(valid['Close'], label='Actual close')
plt.plot(valid['Predictions'], label='Predicted close')
plt.legend();