In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import ttest_ind, f_oneway
import os

In [4]:
# Define file path
file_path = "yahoo_stock.csv"
# Load dataset
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2015-11-23,2095.610107,2081.389893,2089.409912,2086.590088,3587980000.0,2086.590088
1,2015-11-24,2094.120117,2070.290039,2084.419922,2089.139893,3884930000.0,2089.139893
2,2015-11-25,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
3,2015-11-26,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
4,2015-11-27,2093.290039,2084.129883,2088.820068,2090.110107,1466840000.0,2090.110107


In [5]:
# Load dataset
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2015-11-23,2095.610107,2081.389893,2089.409912,2086.590088,3587980000.0,2086.590088
1,2015-11-24,2094.120117,2070.290039,2084.419922,2089.139893,3884930000.0,2089.139893
2,2015-11-25,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
3,2015-11-26,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
4,2015-11-27,2093.290039,2084.129883,2088.820068,2090.110107,1466840000.0,2090.110107


In [6]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])
# Set Date as index
df.set_index('Date', inplace=True)
# EDA: Check for missing values
print("Missing values:\n", df.isnull().sum())

Missing values:
 High         0
Low          0
Open         0
Close        0
Volume       0
Adj Close    0
dtype: int64


In [7]:
# EDA: Descriptive statistics
df.describe()

Unnamed: 0,High,Low,Open,Close,Volume,Adj Close
count,1825.0,1825.0,1825.0,1825.0,1825.0,1825.0
mean,2660.718673,2632.81758,2647.704751,2647.856284,3869627000.0,2647.856284
std,409.680853,404.310068,407.169994,407.301177,1087593000.0,407.301177
min,1847.0,1810.099976,1833.400024,1829.079956,1296540000.0,1829.079956
25%,2348.350098,2322.25,2341.97998,2328.949951,3257950000.0,2328.949951
50%,2696.25,2667.840088,2685.48999,2683.340088,3609740000.0,2683.340088
75%,2930.790039,2900.709961,2913.860107,2917.52002,4142850000.0,2917.52002
max,3645.98999,3600.159912,3612.090088,3626.909912,9044690000.0,3626.909912


In [15]:
# EDA: Composition
# Plotting closing price over time
fig = px.line(df, x=df.index, y='Close', title='Closing Price Over Time')
fig.update_layout(title_text='Closing Price Over Time', title_x=0.5, template='plotly_dark')
fig.show()

In [16]:
# EDA: Distribution
fig = make_subplots(rows=3, cols=2, subplot_titles=('Close', 'Volume', 'Open', 'High', 'Low', 'Adj Close'))
fig.add_trace(go.Histogram(x=df['Close'], name='Close'), row=1, col=1)
fig.add_trace(go.Histogram(x=df['Volume'], name='Volume'), row=1, col=2)
fig.add_trace(go.Histogram(x=df['Open'], name='Open'), row=2, col=1)
fig.add_trace(go.Histogram(x=df['High'], name='High'), row=2, col=2)
fig.add_trace(go.Histogram(x=df['Low'], name='Low'), row=3, col=1)
fig.add_trace(go.Histogram(x=df['Adj Close'], name='Adj Close'), row=3, col=2)
fig.update_layout(title_text='Distribution of Features', title_x=0.5, template='plotly_dark')
fig.show()

In [17]:
# EDA: Relationship
fig = px.scatter_matrix(df, dimensions=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], title='Scatter Matrix')
fig.update_layout(title_text='Scatter Matrix', title_x=0.5, template='plotly_dark')
fig.show()

In [18]:
# EDA: Comparison
# Correlation matrix
correlation = df.corr()
fig = px.imshow(correlation, text_auto=True, title='Correlation Matrix')
fig.update_layout(title_text='Correlation Matrix', title_x=0.5, template='plotly_dark')
fig.show()

In [19]:
# Statistical Tests
# T-test comparing 'High' and 'Low' prices
t_stat, p_value = ttest_ind(df['High'], df['Low'])
t_test_result = {
    'Statistic': [t_stat],
    'p-value': [p_value]
}
t_test_df = pd.DataFrame(t_test_result)
t_test_df

Unnamed: 0,Statistic,p-value
0,2.070802,0.038447


In [20]:
# Display T-test result
fig = px.bar(t_test_df, x=t_test_df.index, y=['Statistic', 'p-value'], barmode='group', title='T-test Result: High vs Low Prices')
fig.update_layout(title_text='T-test Result: High vs Low Prices', title_x=0.5, template='plotly_dark')
fig.show()

In [21]:
# ANOVA test for 'Open', 'High', 'Low', 'Close' prices
anova_stat, anova_p_value = f_oneway(df['Open'], df['High'], df['Low'], df['Close'])
anova_result = {
    'Statistic': [anova_stat],
    'p-value': [anova_p_value]
}
anova_df = pd.DataFrame(anova_result)
anova_df

Unnamed: 0,Statistic,p-value
0,1.432399,0.231203


In [22]:
# Display ANOVA result
fig = px.bar(anova_df, x=anova_df.index, y=['Statistic', 'p-value'], barmode='group', title='ANOVA Test Result: Open, High, Low, Close Prices')
fig.update_layout(title_text='ANOVA Test Result: Open, High, Low, Close Prices', title_x=0.5, template='plotly_dark')
fig.show()

In [23]:
# Data preprocessing: Feature scaling
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df['Close'].values.reshape(-1, 1))
# Prepare the data for LSTM
def create_dataset(data, time_step=1):
    X, y = [], []
    for i in range(len(data) - time_step - 1):
        a = data[i:(i + time_step), 0]
        X.append(a)
        y.append(data[i + time_step, 0])
    return np.array(X), np.array(y)

# Define time step
time_step = 60
# Create training and test sets
train_size = int(len(scaled_data) * 0.8)
test_size = len(scaled_data) - train_size
train_data, test_data = scaled_data[0:train_size, :], scaled_data[train_size:len(scaled_data), :]
# Create dataset for LSTM
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)
# Reshape input to be [samples, time steps, features] which is required for LSTM
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [24]:
# Build the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(time_step, 1)))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(25))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [25]:
# Train the model
history = model.fit(X_train, y_train, batch_size=1, epochs=10)


Epoch 1/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 41ms/step - loss: 0.0084
Epoch 2/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 39ms/step - loss: 0.0015
Epoch 3/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 40ms/step - loss: 0.0016
Epoch 4/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 50ms/step - loss: 0.0014
Epoch 5/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 33ms/step - loss: 0.0016
Epoch 6/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 32ms/step - loss: 0.0012
Epoch 7/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 33ms/step - loss: 0.0011
Epoch 8/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 32ms/step - loss: 9.6236e-04
Epoch 9/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 33ms/step - loss: 9.7255e-04
Epoch 10/10
[1m1399/1399[0m [32m━━━━━━━━━━━━

In [26]:
# Predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


In [27]:
# Inverse transform to get actual values
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)
y_train = scaler.inverse_transform([y_train])
y_test = scaler.inverse_transform([y_test])
# Calculate RMSE and MAE
train_rmse = np.sqrt(mean_squared_error(y_train[0], train_predict[:, 0]))
train_mae = mean_absolute_error(y_train[0], train_predict[:, 0])
test_rmse = np.sqrt(mean_squared_error(y_test[0], test_predict[:, 0]))
test_mae = mean_absolute_error(y_test[0], test_predict[:, 0])

print(f'Train RMSE: {train_rmse}, Train MAE: {train_mae}')
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Train RMSE: 37.10749726374919, Train MAE: 29.11610413688349
Test RMSE: 107.23636483265174, Test MAE: 95.74667358398438


In [28]:
# Plotting the results
train_data_len = len(train_data)

train_plot = np.empty_like(scaled_data)
train_plot[:, :] = np.nan
train_plot[time_step:len(train_predict)+time_step, :] = train_predict

test_plot = np.empty_like(scaled_data)
test_plot[:, :] = np.nan
test_plot[len(train_predict)+(time_step*2)+1:len(scaled_data)-1, :] = test_predict
# Create plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Actual Price', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=df.index, y=train_plot[:, 0], mode='lines', name='Train Predict', line=dict(color='green')))
fig.add_trace(go.Scatter(x=df.index, y=test_plot[:, 0], mode='lines', name='Test Predict', line=dict(color='red')))
fig.update_layout(title='Stock Price Prediction',
                   xaxis_title='Date',
                   yaxis_title='Stock Price',
                   template='plotly_dark')
fig.show()

In [29]:
def predict_future(model, data, scaler, time_step=60, future_steps=30):
    # Prepare the input data for future prediction
    last_data = data[-time_step:]
    last_data = last_data.reshape(1, time_step, 1)

    # Predict future prices
    future_predictions = []
    for _ in range(future_steps):
        next_pred = model.predict(last_data)
        future_predictions.append(next_pred[0, 0])
        last_data = np.append(last_data[:, 1:, :], [[[next_pred[0, 0]]]], axis=1)

    # Inverse transform the predictions to get actual values
    future_predictions = np.array(future_predictions).reshape(-1, 1)
    future_predictions = scaler.inverse_transform(future_predictions)

    return future_predictions

# Example usage
future_steps = 30
future_predictions = predict_future(model, scaled_data, scaler, time_step, future_steps)

# Create future dates for plotting
last_date = df.index[-1]
future_dates = pd.date_range(start=last_date, periods=future_steps + 1, inclusive='right')

# Plotting future predictions
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Actual Price', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=future_dates, y=future_predictions[:, 0], mode='lines', name='Future Predict', line=dict(color='orange')))
fig.update_layout(title='Future Stock Price Prediction',
                   xaxis_title='Date',
                   yaxis_title='Stock Price',
                   template='plotly_dark')
fig.show()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90

In [30]:
# Save output in /kaggle/working
output_dir = '/kaggle/working'
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, 'yahoo_stock_processed.csv'))
# Save the model
model.save(os.path.join(output_dir, 'stock_price_lstm_model.h5'))

