# CAC 40 Stock Price Forecast with ARIMA & LSTM

### Reference Material

* CAC 40 data download: https://www.euronext.com/en/products/indices/FR0003500008-XPAR
* Jupyter Notebook Connect with HDFS: http://nbviewer.jupyter.org/github/ofermend/IPython-notebooks/blob/master/blog-part-1.ipynb
* Time Series Prediction with LSTM: https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
* Data Science with Apache Hadoop: Predicting Airline Delays: https://fr.hortonworks.com/blog/data-science-apacheh-hadoop-predicting-airline-delays/

---

## PART 1. Data pre processing

Start by loading the data from HDFS or from the local machine.

### 1.1. Import Dataset from HDFS

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

### 1.1. Import Dataset from Local Hard Drive

* If you are not using HDFS, import the csv file directly from the local disk.
* Ignore this step if you have imported csv files from HDFS.

In [None]:
# Read the CAC 40 CSV file from the Kaggle input directory into a DataFrame.
cac_df = pd.read_csv('/kaggle/input/cac1data/cac40.csv')
# Show the first rows as a quick check of the parsed columns.
cac_df.head()

### 1.2 Display a summary of statistical measure of this data

In [None]:
cac_df.info() # Print the complete dataset information, including column dtypes and non-null counts

In [None]:
cac_df.describe() # Show summary statistics for the dataset's columns

### 1.3. Change String Date to Datetime Format

In [None]:
# Convert the 'Date' column from strings to pandas datetime values.
cac_df['Date'] = pd.to_datetime(cac_df['Date']) 
# Preview the converted column.
cac_df.Date.head()

In [None]:
# Report the number of rows in the dataset (one row is counted as one day).
n_days = cac_df.shape[0]
print(f'There are {n_days} number of days in the dataset.')

### 1.4. Set Datetime to Index

In [None]:
# Make 'Date' the index of cac_df in place, removing it from the regular columns.
cac_df.set_index('Date', inplace=True)

In [None]:
# List the remaining column names now that 'Date' is the index.
cac_df.columns

### 1.5. Feature Selection & Data Resampling

In [None]:
def get_technical_indicators(dataset):
    """Add technical-indicator columns derived from ``Close`` to *dataset*.

    The frame is modified in place and the same object is returned.
    Columns added, in order: ``ma7``, ``ma21``, ``26ema``, ``12ema``,
    ``MACD``, ``20sd``, ``upper_band``, ``lower_band``, ``ema``,
    ``momentum`` and ``ARIMA`` (a placeholder column filled with 0).
    """
    close = dataset['Close']

    # 7- and 21-period simple moving averages.
    for window in (7, 21):
        dataset[f'ma{window}'] = close.rolling(window=window).mean()

    # MACD: 12-span exponential average minus the 26-span one.
    for span in (26, 12):
        dataset[f'{span}ema'] = close.ewm(span=span).mean()
    dataset['MACD'] = dataset['12ema'] - dataset['26ema']

    # Bollinger bands: 20-period mean plus/minus two 20-period standard deviations.
    rolling_20 = close.rolling(window=20)
    dataset['20sd'] = rolling_20.std()
    middle_band = rolling_20.mean()
    dataset['upper_band'] = middle_band + (dataset['20sd'] * 2)
    dataset['lower_band'] = middle_band - (dataset['20sd'] * 2)

    # Exponential moving average with centre of mass 0.5.
    dataset['ema'] = close.ewm(com=0.5).mean()

    # "Momentum" as defined by this notebook: Close / 100 - 1.
    dataset['momentum'] = (close / 100) - 1

    # Placeholder that a later cell overwrites with ARIMA predictions.
    dataset['ARIMA'] = 0

    return dataset

In [None]:
# Add the indicator columns. get_technical_indicators writes into the frame it
# receives and returns it, so cac1_df and cac_df refer to the same object.
cac1_df = get_technical_indicators(cac_df)
cac1_df.head()

In [None]:
# Plot the Open and Close columns against the Date index.
cac1_df[['Open','Close']].plot()
plt.show()

### 1.6. Split Dataset to train and test data

In [None]:
# Positional 70/30 split: the first 70% of rows train, the rest test.
split_idx = int(len(cac1_df) * 0.7)
train_data = cac1_df[0:split_idx]
test_data = cac1_df[split_idx:]
# Keep only the Close prices as plain NumPy arrays.
training_data = train_data['Close'].values
test_data = test_data['Close'].values

### 1.7. Plot Training Data & Observation Data Trends

In [None]:
# Wrap the training Close array in a Series (default integer index) for plotting and ACF/PACF.
training_data1=pd.Series(training_data)

In [None]:
# Plot the training Close prices against their integer position.
training_data1.plot(figsize=(15, 6))
plt.show()

---

## Part 2. CAC 40 Stock Price Forecast with ARIMA

### 2.1. Make First Order Difference or Second Order Difference

#### 2.1.1. Make First Order Difference

In [None]:
# First-order difference of Close: each value minus the previous row's value
# (the first row has no predecessor and becomes NaN).
cac1_df['First Order Difference'] = cac1_df['Close'].diff()

#### 2.1.2. Plot the Training Data After the First Order Difference

In [None]:
# Plot the differenced Close series over the Date index.
cac1_df['First Order Difference'].plot(figsize=(12, 6))
plt.show()

### 2.2. Draw ACF&PACF Chart and Select Hyperparameter q&p

#### 2.2.1. Draw ACF Chart and Choose Hyperparameter q in MA Model

In [None]:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
import statsmodels.api as sm

# Two stacked panels: ACF on top, PACF below, both for the training Close
# series with 40 lags.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
fig = sm.graphics.tsa.plot_acf(training_data1, lags=40, ax=ax1)
fig = sm.graphics.tsa.plot_pacf(training_data1, lags=40, ax=ax2)

#### 2.2.2. Draw PACF Chart and Choose Hyperparameter p in AR Model

In [None]:
# Same ACF/PACF layout as the previous cell, but drawn for the full Open column.
# NOTE(review): the ARIMA model below is fitted on Close, not Open — confirm
# that Open is the intended series here.
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(cac1_df.Open, lags=40, ax=ax1) # autocorrelation, top panel
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(cac1_df.Open, lags=40, ax=ax2)# partial autocorrelation, bottom panel

### 2.3. Define then train the ARIMA Model

In [None]:
# statsmodels.tsa.arima_model.ARIMA has been removed from current statsmodels;
# statsmodels.tsa.arima.model.ARIMA is its replacement.
from statsmodels.tsa.arima.model import ARIMA
from pandas import DataFrame
# `from pandas import datetime` no longer works on current pandas; the same
# class comes from the standard library.
from datetime import datetime

# Fit one ARIMA(p=5, d=1, q=0) model on the whole Close series and print
# the fit summary.
series = cac1_df['Close']
model = ARIMA(series, order=(5, 1, 0))
# The replacement API has no `disp` argument; fit() prints nothing by default.
model_fit = model.fit()
print(model_fit.summary())


In [None]:
from pandas.plotting import autocorrelation_plot

# Create the figure first so the size and dpi apply to the autocorrelation
# plot. Creating it after plotting left the plot at the default size and
# added a second, empty figure.
plt.figure(figsize=(10, 7), dpi=80)
autocorrelation_plot(series)
plt.show()

### 2.4. Use ARIMA Model to Predict CAC 40 Stock Price After 2016 (Weekly Forecast)

#### 2.4.1. Use ARIMA Model to Predict

In [None]:
# statsmodels.tsa.arima_model.ARIMA has been removed from current statsmodels;
# statsmodels.tsa.arima.model.ARIMA is its replacement.
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Walk-forward validation: refit ARIMA(5, 1, 0) on everything seen so far,
# forecast one step ahead, then append the true observation to the history.
X = series.values
split_point = int(len(X) * 0.7)
train_data, test_data = X[:split_point], X[split_point:]
history = list(train_data)
predictions = []
for obs in test_data:
    model = ARIMA(history, order=(5, 1, 0))
    model_fit = model.fit()
    output = model_fit.forecast()
    yhat = output[0]  # the single one-step-ahead forecast
    predictions.append(yhat)
    history.append(obs)

# Store the forecasts by position on the test rows. Assigning a DataFrame
# with a fresh 0..n-1 index would be aligned against the Date index and
# leave the whole column NaN. Training rows have no forecast and stay NaN.
arima_column = np.full(len(cac1_df), np.nan)
arima_column[split_point:] = predictions
cac1_df['ARIMA'] = arima_column

#### 2.4.2. Evaluate the ARIMA predictions using RMSE

In [None]:
# mean_squared_error returns the MSE; take the square root so the printed
# value really is the RMSE that the label announces.
error = math.sqrt(mean_squared_error(test_data, predictions))
print('Test RMSE: %.3f' % error)

#### 2.4.3. Plot the Predicted Result

In [None]:
# Plot the ARIMA one-step-ahead forecasts against the real test prices.

plt.figure(figsize=(12, 6), dpi=100)
plt.plot(test_data, color='black', label='Real')
plt.plot(predictions, color='yellow', label='Predicted')
plt.xlabel('Days')
# The CAC 40 is an index quoted in points, not a USD price.
plt.ylabel('Index points')
plt.title('ARIMA model on CAC')
plt.legend()
plt.show()

In [None]:
# Inspect the first 8 rows, including the indicator and ARIMA columns.
cac1_df.head(8)

In [None]:
# Report the frame's row and column counts.
n_samples, n_features = cac1_df.shape[0], cac1_df.shape[1]
print(f'Total dataset has {n_samples} samples, and {n_features} features.')

---

## Part 3. CAC 40 Stock Price Forecast with LSTM [optional]

#### Instead of the ARIMA model, use an LSTM and follow the same steps as in the ARIMA part. You may also need to add a normalisation step to the preprocessing.

In [None]:
# Preview the frame that feeds the LSTM section.
cac1_df.head()

In [None]:
# Report the frame's row and column counts before building the LSTM inputs.
n_samples, n_features = cac1_df.shape[0], cac1_df.shape[1]
print(f'Total dataset has {n_samples} samples, and {n_features} features.')

**The following steps are performed:**

**1. Clean up the data: remove any NAs**

**2. Create train, validation and test sets**

**3. Create the training series for Open**

**4. Normalize the data**

**5. Create the feature and label sets**

**6. Train the model and check it against the validation set**

**7. Make a prediction**

**8. Based on this prediction, assess whether the LSTM's feature extraction works**



In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Flatten

In [None]:
# Create the train (60%), validate (20%) and test (20%) sets.
# The rows are split by position WITHOUT shuffling: the LSTM inputs are
# sliding windows over consecutive rows, so shuffling first would turn every
# window into a random, non-chronological set of prices.
n_rows = len(cac1_df)
train_end, validate_end = int(.6 * n_rows), int(.8 * n_rows)
train = cac1_df.iloc[:train_end]
validate = cac1_df.iloc[train_end:validate_end]
test = cac1_df.iloc[validate_end:]

**Split dataset into train, test and validate sets**

In [None]:
# Select the 'Open' column by name as an (n, 1) array. Positional column 1
# is only 'Open' if the CSV happens to be laid out that way, whereas the
# test inputs are built from train['Open'] / test['Open'] by name.
open_training = train[['Open']].values

**Normalize data:**
**The data is not normalized and the range of each column varies, especially Volume. Normalizing helps the algorithm converge, i.e. find a local/global minimum efficiently. I will use MinMaxScaler from scikit-learn, with a fixed feature range so that all values stay on a similar scale.**

**Windowing: for a series of about 2000 observations, keep the window bounds between 50 and 500. Since our series is slightly longer than 2000, I'll use 60 to 450 (a 60-step look-back, with samples built from row 60 up to row 449).**

In [None]:
# Normalise the training column to the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (0, 1))
open_training = scaler.fit_transform(open_training)

# Convert to supervised samples: each feature row holds the previous
# `window` scaled values and the label is the value that follows them.
window = 60
# Samples are built up to row 450 as before, but never past the end of the
# training data, so a shorter training split no longer raises IndexError.
last_row = min(450, len(open_training))
features_set_1 = [open_training[i - window:i, 0] for i in range(window, last_row)]
labels_1 = [open_training[i, 0] for i in range(window, last_row)]
    


In [None]:
# Turn the Python lists into arrays, then add a trailing axis of size 1 so
# the features have the 3-D (samples, time steps, 1) layout used as LSTM input.
features_set_1 = np.array(features_set_1)
labels_1 = np.array(labels_1)
n_samples, n_steps = features_set_1.shape[0], features_set_1.shape[1]
features_set_1 = np.reshape(features_set_1, (n_samples, n_steps, 1))

In [None]:
# Build and train the single-feature LSTM: two stacked 50-unit LSTM layers
# with dropout, flattened into one linear output unit.
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(features_set_1.shape[1],1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(units = 1))
model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
# Validate on samples the optimiser never trains on. Passing the training
# arrays themselves as validation_data made the validation metrics a copy of
# the training metrics. validation_split holds out the last 20% of samples.
model.fit(features_set_1, labels_1, epochs = 100, batch_size = 32, validation_split = 0.2)


In [None]:
#TESTING THE MODEL
# Actual 'Open' values of the test split, selected by name so they match the
# train['Open'] / test['Open'] series that the test inputs are built from
# (positional column 1 is only 'Open' for one particular CSV layout).
open_testing_processed = test[['Open']].values

In [None]:
# Rebuild the full 'Open' series in split order (train, validate, test).
# The validate rows sit between train and test, so they must be included:
# otherwise the 60-row look-back taken just before the test rows would come
# from the end of the training split and skip a whole segment.
open_total = pd.concat((train['Open'], validate['Open'], test['Open']), axis=0)

**Start predictions: reshape, scale and then predict with the model**

In [None]:
# Take the last len(test) + 60 values of open_total: the test rows plus the
# 60 values preceding them, which the first test window needs.
test_inputs = open_total[len(open_total) - len(test) - 60:].values

In [None]:
# Reshape to a single column and scale with the scaler that was fitted on
# the training column (transform only, no refitting).
test_inputs = scaler.transform(test_inputs.reshape(-1, 1))

In [None]:
# One 60-step window per test row. The upper bound follows the data length
# rather than a fixed 151, which always produced exactly 91 windows no
# matter how many test rows exist and so did not line up with the actual
# test values plotted later.
test_features = [test_inputs[i - 60:i, 0] for i in range(60, len(test_inputs))]

In [None]:
# Stack the windows into an array and add a trailing axis of size 1, giving
# the same 3-D layout as the training features.
test_features = np.array(test_features)
n_windows, n_steps = test_features.shape[0], test_features.shape[1]
test_features = np.reshape(test_features, (n_windows, n_steps, 1))

In [None]:
# Run the trained model on the test windows; the outputs are still in the scaled range.
predictions = model.predict(test_features)

In [None]:
# Map the scaled predictions back to the original value range with the fitted scaler.
predictions = scaler.inverse_transform(predictions)

**Plot the prediction model for the number of test days and train days**

In [None]:
# Overlay the actual test values and the model's de-normalised predictions.
# Both are plotted against their position in the array, not against calendar dates.
plt.figure(figsize=(10,6))
plt.plot(open_testing_processed, color='pink', label='Actual Stock Price')
plt.plot(predictions , color='yellow', label='Predicted Stock Price')
plt.title('Actual Value vs Predicted')
plt.xlabel('Date')
plt.ylabel('Predicted Price')
plt.legend()
plt.show()


>**This wasn't a great result with one feature so let's try using more features and then train them on LSTM model**

**USING 5 FEATURES :**

In [None]:
# Keep only the five raw columns used by the second LSTM experiment.
dataset = cac1_df[['Open','High','Low','Close','Turnover']]
dataset.head()

In [None]:
# FUNCTION TO CREATE 1D DATA INTO TIME SERIES DATASET
def new_dataset(dataset, step_size):
    """Turn column 0 of a 2-D array into supervised (window, next value) pairs.

    For each start row ``i`` in ``range(len(dataset) - step_size - 1)`` the
    feature is ``dataset[i:i + step_size, 0]`` and the target is
    ``dataset[i + step_size, 0]``.

    Returns a tuple ``(data_X, data_Y)`` of NumPy arrays.
    """
    n_windows = len(dataset) - step_size - 1
    starts = range(n_windows)
    data_X = [dataset[start:start + step_size, 0] for start in starts]
    data_Y = [dataset[start + step_size, 0] for start in starts]
    return np.array(data_X), np.array(data_Y)

In [None]:
# IMPORTING IMPORTANT LIBRARIES
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM

In [None]:
# FOR REPRODUCIBILITY
# Seeds NumPy's global random generator only.
# NOTE(review): the Keras backend presumably keeps its own random state, so
# training may still vary between runs — confirm whether it needs seeding too.
np.random.seed(7)

In [None]:
# PUT THE DATASET IN CHRONOLOGICAL ORDER (OLDEST FIRST)
# Sorting by the Date index gives oldest-first order whatever the order of
# the CSV. The earlier sections already treat the frame as chronological
# (first 70% = training), so an unconditional reversal would flip it and
# train on the newest rows.
dataset = dataset.sort_index()
# CREATING OWN INDEX FOR FLEXIBILITY
obs = np.arange(1, len(dataset) + 1, 1)
# TAKING DIFFERENT INDICATORS FOR PREDICTION
# Average the four price columns only. A mean over the whole frame would
# also include 'Turnover', which is not a price.
OHLC_avg = dataset[['Open', 'High', 'Low', 'Close']].mean(axis = 1)
HLC_avg = dataset[['High', 'Low', 'Close']].mean(axis = 1)
close_val = dataset[['Close']]

In [None]:
# PLOTTING THE OHLC AVERAGE AND THE CLOSING PRICE TOGETHER
plt.plot(OHLC_avg, 'yellow', label = 'OHLC avg')
plt.plot(close_val, 'blue', label = 'Closing price')
# The labels above are only shown once a legend is drawn.
plt.legend()
plt.xlabel('Days')
plt.ylabel('OHLC average')
plt.show()


In [None]:
# Plot the High/Low/Close average on its own.
plt.plot(HLC_avg, 'red', label = 'HLC avg')
plt.xlabel('Days')
plt.ylabel('HLC average')
plt.show()

In [None]:
# Plot the closing price on its own.
plt.plot(close_val, 'blue', label = 'Closing price')
plt.xlabel('Days')
plt.ylabel('Closing Values')
plt.show()

In [None]:
# PREPARATION OF TIME SERIES DATASET
# Look-back length. Defined before use so the windowing, the reshapes and
# the model's input_shape all read the same value.
step_size = 1
OHLC_avg = np.reshape(OHLC_avg.values, (len(OHLC_avg), 1))
# TRAIN-TEST SPLIT (first 75% of rows train, the rest test)
train_size = int(len(OHLC_avg) * 0.75)
# Fit the scaler on the training rows only, then apply it to the whole
# series. Fitting on the full series would leak the test set's min/max
# into training.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(OHLC_avg[:train_size])
OHLC_avg = scaler.transform(OHLC_avg)
train_OHLC, test_OHLC = OHLC_avg[:train_size, :], OHLC_avg[train_size:, :]
# TIME-SERIES DATASET (FOR TIME T, VALUES FOR TIME T+1)
trainX, trainY = new_dataset(train_OHLC, step_size)
testX, testY = new_dataset(test_OHLC, step_size)
# RESHAPING TRAIN AND TEST DATA to (samples, 1, step_size)
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))


In [None]:
# LSTM MODEL
# Two stacked LSTM layers (32 then 16 units) followed by one dense output
# unit with a linear activation.
model = Sequential([
    LSTM(32, input_shape=(1, step_size), return_sequences = True),
    LSTM(16),
    Dense(1),
    Activation('linear'),
])

In [None]:
# MODEL COMPILING AND TRAINING
# MSE loss with the Adagrad optimiser; MAE is tracked as an extra metric.
model.compile(loss='mean_squared_error', optimizer='adagrad',metrics = ['mae']) # Try mae, adam, adagrad and compare!!!
# batch_size=1 updates the weights after every sample; verbose=2 prints one line per epoch.
model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=2)

In [None]:
# The model was compiled with an extra 'mae' metric, so evaluate() returns
# [loss, mae]. Unpack it so the printed number is the MAE alone, not the
# whole list. testY is still scaled here, so the MAE is in scaled units.
test_loss, mae = model.evaluate(testX, testY, batch_size=16)
print('Mean Absolute Error for Y:', mae)

In [None]:
# PREDICTION
# Model outputs for both splits, still in the scaler's range.
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

In [None]:
# DE-NORMALIZING FOR PLOTTING
# Each variable is overwritten with its inverse-transformed version, so
# running this cell twice would apply the inverse transform twice.
trainPredict = scaler.inverse_transform(trainPredict)
# The targets are 1-D; wrapping them in a list gives a single-row 2-D input,
# which is why trainY[0] / testY[0] are used below.
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# TRAINING RMSE (square root of the MSE, in de-normalised units)
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train RMSE: %.2f' % (trainScore))

In [None]:
# TEST RMSE (square root of the MSE between de-normalised targets and predictions)
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test RMSE: %.2f' % (testScore))

In [None]:
# CREATING SIMILAR DATASET TO PLOT TRAINING PREDICTIONS
# NaN-filled array shaped like the full series; NaN entries are not drawn,
# so only the slice assigned below shows up in the plot.
trainPredictPlot = np.empty_like(OHLC_avg)
trainPredictPlot[:, :] = np.nan
# The first prediction targets the value step_size rows into the series.
trainPredictPlot[step_size:len(trainPredict)+step_size, :] = trainPredict
# CREATING SIMILAR DATASET TO PLOT TEST PREDICTIONS
testPredictPlot = np.empty_like(OHLC_avg)
testPredictPlot[:, :] = np.nan
# The start offset skips the training predictions plus the rows that
# new_dataset leaves out of each split (it stops step_size + 1 rows early).
# The slice length must equal len(testPredict) for the assignment to work.
testPredictPlot[len(trainPredict)+(step_size*2)+1:len(OHLC_avg)-1, :] = testPredict

In [None]:
# PLOT OF MAIN OHLC VALUES, TRAIN PREDICTIONS AND TEST PREDICTIONS
# Draw the actual series as well, so the predictions can be compared with
# it. OHLC_avg is still scaled at this point, so it is mapped back to the
# original units for the plot only (the variable itself is left untouched).
plt.plot(scaler.inverse_transform(OHLC_avg), 'g', label = 'original dataset')
plt.plot(trainPredictPlot, 'r', label = 'training set')
plt.plot(testPredictPlot, 'b', label = 'predicted stock price/test set')
plt.legend(loc = 'upper right')
plt.xlabel('Time in Days')
plt.ylabel('Trend of training and prediction data')
plt.show()


In [None]:
# PREDICT FUTURE VALUES
# testPredict has already been mapped back to original units, so its last
# value is re-scaled with the fitted scaler before being fed to the network.
# Dividing it by itself always produced the constant input 1.0.
last_val = testPredict[-1]
last_val_scaled = scaler.transform(np.reshape(last_val, (1, 1)))
# The (1, 1, 1) input shape assumes a look-back of one step, as before.
next_val = model.predict(np.reshape(last_val_scaled, (1, 1, 1)))
# Bring the network output back to original units.
next_val = scaler.inverse_transform(next_val)
# ndarray.item() replaces np.asscalar, which current NumPy no longer provides.
print("Last Day Value:", last_val.item())
print("Next Day Value:", next_val.item())
# print np.append(last_val, next_val)
