<a href="https://colab.research.google.com/github/rubin-r12/stock-price-prediction/blob/main/StockPrice_LSTM_ARIMA_Prophet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import the libraries
import warnings
warnings.filterwarnings("ignore")

import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.arima_model import ARIMA

from keras.models import Sequential
from keras.layers import LSTM, Dense
from prophet import Prophet

In [4]:
#!pip install --upgrade prophet

In [None]:
# Upload your kaggle.json API token to /content
!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download paultimothymooney/stock-market-data

# Unzip the dataset
!unzip stock-market-data.zip

### Data Extraction

In [23]:
# Path to the dataset
dataset_path = '/content/stock_market_data'

# Define the folders
folders = ['forbes2000','nasdaq', 'nyse', 'sp500']

# Print a menu to choose a folder
print("Choose a folder:")
for i, folder in enumerate(folders):
    print(f"{i+1}. {folder}")

# Get the user's choice
while True:
    choice = input("Enter the number of your choice: ")
    if choice.isdigit() and 1 <= int(choice) <= len(folders):
        break
    else:
        print("Invalid choice. Please try again.")

# Get the selected folder
selected_folder = folders[int(choice) - 1]

# Switch to the selected folder
os.chdir(os.path.join(dataset_path, selected_folder, 'csv'))

# Get a list of all CSV files in the folder
csv_files = glob.glob('*.csv')

# Combine the data from all files into a single DataFrame, handling errors and printing error file
combined_data_list = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
        combined_data_list.append(df)
    except pd.errors.ParserError as e:
        print(f"Error reading file {file} in folder {selected_folder}: {e}")  # Print the folder of the error file

if combined_data_list:
    combined_data = pd.concat(combined_data_list, ignore_index=True)
    print(combined_data.head())
else:
    print("No data could be read successfully.")

Choose a folder:
1. forbes2000
2. nasdaq
3. nyse
4. sp500
Enter the number of your choice: 3
         Date        Low       Open      Volume       High      Close  \
0  09-04-2014  16.320000  16.750000  24900260.0  17.850000  17.120001   
1  10-04-2014  17.000000  17.350000   2444485.0  17.450001  17.200001   
2  11-04-2014  16.719999  16.959999   1545854.0  17.090000  16.860001   
3  14-04-2014  16.500000  17.040001    627864.0  17.160000  16.650000   
4  15-04-2014  16.462999  16.650000    443692.0  16.930000  16.900000   

   Adjusted Close  
0       17.120001  
1       17.200001  
2       16.860001  
3       16.650000  
4       16.900000  


In [31]:
#Length of the dataframe
data = combined_data.copy()
len(data)

6994408

In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6994408 entries, 0 to 6994407
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Date            object 
 1   Low             float64
 2   Open            float64
 3   Volume          float64
 4   High            float64
 5   Close           float64
 6   Adjusted Close  float64
dtypes: float64(6), object(1)
memory usage: 373.5+ MB


In [33]:
print(df.describe())

               Low         Open        Volume         High        Close  \
count  2935.000000  2935.000000  2.935000e+03  2935.000000  2935.000000   
mean     87.621201    89.557918  6.976558e+04    91.212674    89.234923   
std     148.771078   151.859937  5.305401e+05   154.496477   151.362371   
min       0.000100     0.000100  0.000000e+00     0.000100     0.000100   
25%       0.003535     0.003700  5.690000e+02     0.004400     0.004000   
50%       0.350000     0.372000  2.367000e+03     0.410000     0.366440   
75%     122.369999   125.000000  1.290200e+04   125.750000   125.000000   
max     565.000000   600.000000  1.724188e+07   600.000000   575.000000   

       Adjusted Close  
count     2935.000000  
mean        88.925609  
std        150.658840  
min          0.000100  
25%          0.004000  
50%          0.366440  
75%        125.000000  
max        571.566833  


In [34]:
# Check for missing values
print(df.isnull().sum())

Date              0
Low               0
Open              0
Volume            0
High              0
Close             0
Adjusted Close    0
dtype: int64


## Data Preprocessing

In [35]:
#Convert the Date column to a datetime format
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y') # Specify the correct format

#Set the Date column as the index of the dataframe.
data.set_index('Date', inplace=True)

In [40]:
data.head()

Unnamed: 0_level_0,Low,Open,Volume,High,Close,Adjusted Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-04-09,16.32,16.75,24900260.0,17.85,17.120001,17.120001
2014-04-10,17.0,17.35,2444485.0,17.450001,17.200001,17.200001
2014-04-11,16.719999,16.959999,1545854.0,17.09,16.860001,16.860001
2014-04-14,16.5,17.040001,627864.0,17.16,16.65,16.65
2014-04-15,16.462999,16.65,443692.0,16.93,16.9,16.9


In [39]:
data.tail()

Unnamed: 0_level_0,Low,Open,Volume,High,Close,Adjusted Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-05,0.0004,0.0004,0.0,0.0004,0.0004,0.0004
2022-12-06,0.0004,0.0004,0.0,0.0004,0.0004,0.0004
2022-12-07,0.0004,0.0004,0.0,0.0004,0.0004,0.0004
2022-12-08,0.0004,0.0004,0.0,0.0004,0.0004,0.0004
2022-12-09,0.0004,0.0004,0.0,0.0004,0.0004,0.0004


In [None]:

# Calculate summary statistics
summary_stats = data.describe()
print(summary_stats)
# This cell calculates and prints summary statistics (mean, std, min, max, etc.) for the dataframe.

# Task 4: Model Implementation
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# This cell splits the data into training and testing sets (80% for training and 20% for testing).

# ARIMA Model
arima_model = ARIMA(train_data, order=(5,1,0))
arima_model_fit = arima_model.fit()
# This cell creates and fits an ARIMA model to the training data.

# LSTM Model
lstm_model = Sequential()
lstm_model.add(LSTM(50, input_shape=(train_data.shape[1], 1)))
lstm_model.add(Dense(1))
lstm_model.compile(loss='mean_squared_error', optimizer='adam')
lstm_model.fit(train_data, epochs=50, batch_size=32)
# This cell creates and trains an LSTM model on the training data.

# Prophet Model
prophet_model = Prophet()
prophet_model.fit(train_data)
# This cell creates and fits a Prophet model to the training data.

# Task 5: Model Evaluation
# Evaluate the models on the test data
arima_pred = arima_model_fit.predict(start=len(train_data), end=len(data)-1, typ='levels')
lstm_pred = lstm_model.predict(test_data)
prophet_pred = prophet_model.make_future_dataframe(periods=len(test_data))
prophet_pred = prophet_model.predict(prophet_pred)

# Calculate evaluation metrics
arima_mae = mean_absolute_error(test_data, arima_pred)
lstm_mae = mean_absolute_error(test_data, lstm_pred)
prophet_mae = mean_absolute_error(test_data, prophet_pred)

print('ARIMA MAE:', arima_mae)
print('LSTM MAE:', lstm_mae)
print('Prophet MAE:', prophet_mae)
# This cell evaluates the models on the test data and calculates the mean absolute error (MAE) for each model.

# Task 6: Reporting
# Print the key findings
print('Key Findings:')
print('The ARIMA model has an MAE of', arima_mae)
print('The LSTM model has an MAE of', lstm_mae)
print('The Prophet model has an MAE of', prophet_mae)
# This cell prints the key findings, including the MAE for each model.