# Nasdaq Stock Price Prediction

#### Importing Necessary Libraries

In [25]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#### Loading Data

In [9]:
import os

# Location of the training CSV. Set the NASDAQ_TRAIN_CSV environment variable
# to run the notebook on another machine; the original local path is kept as
# the fallback so existing runs behave exactly as before.
TRAIN_CSV_PATH = os.environ.get(
    'NASDAQ_TRAIN_CSV',
    'C:/Users/Sree/Downloads/DL-FInal/train.csv',
)

# Read the full training set and preview the first rows.
train_data = pd.read_csv(TRAIN_CSV_PATH)
train_data.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


#### Preprocessing 

checking for null values

In [10]:
# Count the missing values in every column of the raw training frame.
train_data.isna().sum()

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                 220
imbalance_buy_sell_flag          0
reference_price                220
matched_size                   220
far_price                  2894342
near_price                 2857180
bid_price                      220
bid_size                         0
ask_price                      220
ask_size                         0
wap                            220
target                          88
time_id                          0
row_id                           0
dtype: int64

In [12]:
# Plain assignment: df is a second name for the same DataFrame object as
# train_data (no copy is made), so changes made through df are visible
# through train_data as well.
df = train_data

Median imputation for `imbalance_size` and `reference_price`

In [13]:
# Median imputation for imbalance_size and reference_price.
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on df[col]: that chained in-place form operates on a
# temporary Series, triggers a chained-assignment warning in pandas 2.x and
# does not update df at all under Copy-on-Write.
for column in ('imbalance_size', 'reference_price'):
    df[column] = df[column].fillna(df[column].median())

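Forward fill for `far_price` and backward fill for `near_price`: for time-series variables like these we fill each gap from a neighbouring observation (the previous one for forward fill, the next one for backward fill) instead of a single global statistic, which matters especially because the missing values make up a large share of the dataset.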

In [14]:
# far_price: carry the last valid observation forward into each gap.
# near_price: pull the next valid observation backward into each gap.
# Uses .ffill()/.bfill() (fillna(method=...) is deprecated) and assigns the
# result back to df, because a chained fillna(..., inplace=True) on df[col]
# is not guaranteed to modify df.
# NOTE(review): both fills run over the whole frame in row order, not per
# stock or per date - confirm that crossing those boundaries is intended.
df['far_price'] = df['far_price'].ffill()
df['near_price'] = df['near_price'].bfill()

We still have null values in far_price where forward fill didn't work (rows at the start of the data that have no earlier value to carry forward), so we fill them with the column mean.



In [15]:
# Fill whatever is still missing in far_price with the column mean.
# Assigned back to df rather than using a chained fillna(..., inplace=True),
# which is not guaranteed to modify df.
df['far_price'] = df['far_price'].fillna(df['far_price'].mean())

We fill the remaining columns with their mean as well, so that the imputation affects the data distribution as little as possible.

In [16]:
# Mean imputation for the remaining columns that still contain nulls.
# Each filled column is assigned back to df; the chained
# df[col].fillna(..., inplace=True) form works on a temporary Series and is
# not guaranteed to modify df.
# NOTE(review): 'target' is the label that is later predicted; imputing it
# with the mean keeps those rows in the data - confirm this is preferred over
# dropping them.
for column in ('matched_size', 'bid_price', 'ask_price', 'wap', 'target'):
    df[column] = df[column].fillna(df[column].mean())

Checking null values again

In [17]:
# Re-count the missing values per column after imputation.
df.isna().sum()

stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
target                     0
time_id                    0
row_id                     0
dtype: int64

In [18]:
# Convert the categorical imbalance_buy_sell_flag column to one-hot encoding.
# The indicator columns get the 'imbalance_flag' prefix, and train_data is
# rebound to the new frame that get_dummies returns.
flag_column = 'imbalance_buy_sell_flag'
train_data = pd.get_dummies(
    data=train_data,
    columns=[flag_column],
    prefix='imbalance_flag',
)

In [21]:
# Normalize the price-related columns relative to the wap of the same row.
# 'wap' is itself one of the divided columns, so it is divided by itself.
price_columns = ['bid_price', 'ask_price', 'wap']
row_wap = train_data['wap'].values.reshape(-1, 1)  # column vector, one wap per row
train_data[price_columns] = train_data[price_columns] / row_wap

In [22]:
# Feature selection: the columns used as model inputs, grouped by kind and
# concatenated in a fixed order.
imbalance_flag_features = ['imbalance_flag_-1', 'imbalance_flag_0', 'imbalance_flag_1']
auction_features = ['reference_price', 'matched_size', 'far_price', 'near_price']
quote_features = ['bid_price', 'ask_price', 'wap']

selected_features = (
    ['imbalance_size']
    + imbalance_flag_features
    + auction_features
    + quote_features
    + ['seconds_in_bucket']
)

In [23]:
# Select features and target variable
X = train_data.loc[:, selected_features]  # model inputs, one column per selected feature
y = train_data.loc[:, 'target']           # value the models are trained to predict

#### Splitting the data 

In [27]:

# Training and testing sets.
# Split BEFORE scaling: fitting the scaler on all rows would let the mean and
# variance of the test rows leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same
# transformation to the test rows. X itself is left unscaled, so re-running
# this cell gives the same result.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (4190384, 12) (4190384,)
Testing set shape: (1047596, 12) (1047596,)


#### simple neural network using TensorFlow's Keras API

In [28]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Baseline network: two ReLU hidden layers (64 and 32 units) followed by a
# single output unit, assembled layer by layer.
n_features = X_train.shape[1]
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(n_features,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1))  # one unit, no activation: regression output

# Adam optimizer with mean squared error as the regression loss.
model.compile(loss='mse', optimizer='adam')

# Print the layer-by-layer summary.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                832       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#### Training and modifying the built models

In [30]:
from tensorflow.keras.callbacks import EarlyStopping

# Re-create the 64-32-1 dense network so that training starts from freshly
# initialised weights.
dense_stack = [
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
]
model = models.Sequential(dense_stack)

# Adam optimizer, mean squared error loss.
model.compile(loss='mse', optimizer='adam')

# Show the architecture.
model.summary()

# Stop once val_loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Fit for at most 50 epochs, holding out 20% of the training rows for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=128,
    callbacks=[early_stopping],
)

# Loss (MSE) on the held-out test set.
loss = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 64)                832       
                                                                 
 dense_8 (Dense)             (None, 32)                2080      
                                                                 
 dense_9 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch

#### Improved model with more layers 

In [31]:
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization

# Deeper dense network: 128 -> 64 -> 32 ReLU units, each followed by batch
# normalization, then a single output unit.
improved_model = models.Sequential()
improved_model.add(
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],))
)
improved_model.add(layers.BatchNormalization())
for units in (64, 32):
    improved_model.add(layers.Dense(units, activation='relu'))
    improved_model.add(layers.BatchNormalization())
improved_model.add(layers.Dense(1))

# Adam optimizer, mean squared error loss.
improved_model.compile(loss='mse', optimizer='adam')

# Show the architecture.
improved_model.summary()

# Train with the early_stopping callback defined in the earlier training
# cell, holding out 20% of the training rows for validation.
improved_history = improved_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=256,
    callbacks=[early_stopping],
)

# Loss (MSE) on the held-out test set.
improved_loss = improved_model.evaluate(X_test, y_test)
print(f"Improved Model Test Loss: {improved_loss}")


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 128)               1664      
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                                  
                                                                 
 dense_11 (Dense)            (None, 64)                8256      
                                                                 
 batch_normalization_1 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dense_12 (Dense)            (None, 32)                2080      
                                                                 
 batch_normalization_2 (Bat  (None, 32)               

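Modifying the model to incorporate LSTM layers, which are well-suited for sequential data like time series. Note that each sample is reshaped to a sequence of length 1 below, so the LSTM sees a single time step per row.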

In [32]:
from tensorflow.keras.layers import LSTM, Dense, BatchNormalization

# LSTM input is 3-D, so add a middle axis of length 1 to the 2-D feature
# arrays: (samples, features) -> (samples, 1, features).
X_train_lstm = X_train[:, np.newaxis, :]
X_test_lstm = X_test[:, np.newaxis, :]

# Two stacked 50-unit LSTM layers with batch normalization after each, then
# a single output unit. The first LSTM returns its full sequence so the
# second LSTM receives 3-D input.
timesteps, n_inputs = X_train_lstm.shape[1], X_train_lstm.shape[2]
lstm_model = models.Sequential()
lstm_model.add(
    LSTM(50, activation='relu', return_sequences=True, input_shape=(timesteps, n_inputs))
)
lstm_model.add(BatchNormalization())
lstm_model.add(LSTM(50, activation='relu'))
lstm_model.add(BatchNormalization())
lstm_model.add(Dense(1))

# Adam optimizer, mean squared error loss.
lstm_model.compile(loss='mse', optimizer='adam')

# Show the architecture.
lstm_model.summary()

# Train with the early_stopping callback defined in the earlier training
# cell, holding out 20% of the training rows for validation.
lstm_history = lstm_model.fit(
    X_train_lstm,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=256,
    callbacks=[early_stopping],
)

# Loss (MSE) on the held-out test set.
lstm_loss = lstm_model.evaluate(X_test_lstm, y_test)
print(f"LSTM Model Test Loss: {lstm_loss}")


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 50)             12600     
                                                                 
 batch_normalization_3 (Bat  (None, 1, 50)             200       
 chNormalization)                                                
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 batch_normalization_4 (Bat  (None, 50)                200       
 chNormalization)                                                
                                                                 
 dense_14 (Dense)            (None, 1)                 51        
                                                                 
Total params: 33251 (129.89 KB)
Trainable params: 3305

#### Evaluating the model 

In [35]:
from sklearn.metrics import accuracy_score

# Predict on the test set. The model ends in Dense(1), so the predictions
# come back with shape (n_samples, 1).
y_pred = improved_model.predict(X_test)

# Classification threshold used to binarize both predictions and targets.
# NOTE(review): 0.5 is applied to a continuous regression target; confirm it
# is the intended cut-off (e.g. 0 would compare the sign of the move instead).
threshold = 0.5

# Flatten the predictions to 1-D so they line up element-wise with the 1-D
# y_test labels, then binarize both sides with the same threshold.
y_pred_class = (y_pred.ravel() > threshold).astype(int)
y_test_class = (y_test > threshold).astype(int)

# One call over all samples: the size-weighted average of per-batch
# accuracies is the same quantity as the overall accuracy, so the manual
# batching loop is unnecessary.
accuracy = accuracy_score(y_test_class, y_pred_class)

# Print the accuracy
print(f"Accuracy: {accuracy}")


Accuracy: 0.5645573293521549


Accuracy is generally not the best metric for regression tasks, so we also calculate the mean squared error (MSE).

In [36]:
from sklearn.metrics import mean_squared_error

# Score the held-out test rows with the improved dense model.
y_pred = improved_model.predict(X_test)

# Mean of the squared differences between the true targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# Report the result.
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 86.6628070065235
