# Twitter stock Prediction

## About Dataset

The dataset is taken from Kaggle: https://www.kaggle.com/datasets/maharshipandya/twitter-stocks-dataset?select=twitter-stocks.csv
It contains Twitter stock prices over a span of 9 years, from November 2013 to October 2022. The data is in tabular CSV format and can be loaded quickly.

## Column Description
There are 7 columns in this dataset.

Note: The currency is in USD ($)

- **Date**: The date for which the stock data is considered.
- **Open**: The stock's opening price on that day.
- **High**: The stock's highest price on that day.
- **Low**: The stock's lowest price on that day.
- **Close**: The stock's closing price on that day. The close price is adjusted for splits.
- **Adj Close**: The closing price adjusted for splits and for dividend and/or capital gain distributions.
- **Volume**: Volume measures the number of shares traded in a stock or contracts traded in futures or options.

# Import the Modules

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(42)

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

# Load the Dataset

In [None]:
df = pd.read_csv('/kaggle/input/twitter-stocks-dataset/twitter-stocks.csv')

In [None]:
df.head()

# Explore Twitter Data

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.isnull().sum()

# Data Cleaning

In [None]:
df['Date']= pd.to_datetime(df['Date'])

In [None]:
df["year"] = df["Date"].dt.year

In [None]:
print(df.Date.dtype)

In [None]:
df = df.set_index("Date")

In [None]:
df.head()

In [None]:
# Setting the Frequency to Business Days

print(df.shape)

In [None]:
Data = df.asfreq("b")

In [None]:
print(Data.shape)

In [None]:
Data.head()

In [None]:
Data.isnull().sum()

In [None]:
#Getting the Dates of Missing Values
null_data = Data[Data.isnull().any(axis=1) ]

In [None]:
null_data.head()

In [None]:
null_dates = null_data.index.tolist()

In [None]:
null_dates

In [None]:
# Filtering out the Holidays
#
# Every business day in `null_dates` (days with missing rows) is classified as
# either a scheduled US stock-market holiday (collected in `holidays`) or an
# unexplained gap (collected in `non_holidays`).

import calendar
import datetime

# Juneteenth only became a US stock-market holiday in 2022.
JUNETEENTH_FIRST_YEAR = 2022


def _is_fixed_date_holiday(day, weekday, holiday_day):
    """Return True for a fixed-date holiday or its weekday observance.

    A holiday falling on a Sunday is observed on the following Monday and one
    falling on a Saturday is observed on the preceding Friday.
    """
    return (
        day == holiday_day
        or (day == holiday_day + 1 and weekday == calendar.MONDAY)
        or (day == holiday_day - 1 and weekday == calendar.FRIDAY)
    )


def _is_weekday_in_range(day, weekday, wanted_weekday, first_day, last_day):
    """Return True if the date is `wanted_weekday` within [first_day, last_day]."""
    return weekday == wanted_weekday and first_day <= day <= last_day


holidays = []

for date in null_dates:
    day, month, year = date.day, date.month, date.year
    # Compare integer weekdays rather than calendar.day_name strings, which
    # are locale-dependent.
    weekday = date.weekday()

    if month == 1:
        is_holiday = (
            # New Year's Day
            day == 1
            # Observed New Year's Day (1 January fell on a Sunday)
            or (day == 2 and weekday == calendar.MONDAY)
            # Martin Luther King, Jr. Day: third Monday of January
            or _is_weekday_in_range(day, weekday, calendar.MONDAY, 15, 21)
        )
    elif month == 2:
        # Washington's Birthday: third Monday of February
        is_holiday = _is_weekday_in_range(day, weekday, calendar.MONDAY, 15, 21)
    elif month == 5:
        # Memorial Day: last Monday of May
        is_holiday = _is_weekday_in_range(day, weekday, calendar.MONDAY, 25, 31)
    elif month == 6:
        # Juneteenth (19 June) and its observance, from 2022 onwards
        is_holiday = (
            year >= JUNETEENTH_FIRST_YEAR
            and _is_fixed_date_holiday(day, weekday, 19)
        )
    elif month == 7:
        # Independence Day (4 July) and its observance
        is_holiday = _is_fixed_date_holiday(day, weekday, 4)
    elif month == 9:
        # Labor Day: first Monday of September
        is_holiday = _is_weekday_in_range(day, weekday, calendar.MONDAY, 1, 7)
    elif month == 11:
        # Thanksgiving Day: fourth Thursday of November
        is_holiday = _is_weekday_in_range(day, weekday, calendar.THURSDAY, 22, 28)
    elif month == 12:
        # Christmas Day (25 December) and its observance
        is_holiday = _is_fixed_date_holiday(day, weekday, 25)
    else:
        is_holiday = False

    if is_holiday:
        holidays.append(date)

# Good Friday depends on the date of Easter, so it is listed explicitly
# for every year covered by the dataset.
good_fridays = [
    datetime.date(2014, 4, 18),
    datetime.date(2015, 4, 3),
    datetime.date(2016, 3, 25),
    datetime.date(2017, 4, 14),
    datetime.date(2018, 3, 30),
    datetime.date(2019, 4, 19),
    datetime.date(2020, 4, 10),
    datetime.date(2021, 4, 2),
    datetime.date(2022, 4, 15),
]
holidays = holidays + [pd.to_datetime(date) for date in good_fridays]

# Missing business days that are not explained by a scheduled holiday.
holiday_lookup = set(holidays)
non_holidays = [x for x in null_dates if x not in holiday_lookup]
print(non_holidays)

In [None]:
modified_df = Data.drop(holidays)
print(modified_df.shape)

In [None]:
print("Before filling missing values:\n", modified_df.isnull().sum())

In [None]:
modified_df = modified_df.bfill(axis ='rows')

In [None]:
print("\nAfter filling missing values:\n",modified_df.isna().sum())

In [None]:
modified_df.head()

# Data Visualizing

In [None]:
modified_df.hist(bins = 50, figsize = (15,10))

In [None]:
plt.figure(figsize=(13,8))
sns.heatmap(df.corr(),annot=True)

In [None]:
modified_df.drop('Adj Close',axis=1,inplace=True)

In [None]:
modified_df

In [None]:
def plotter():
    """Plot the closing prices and publish them as the global `closing_stock`.

    Reads the `Close` column of `modified_df`, converts it to a float32
    column vector of shape (n, 1), stores that array in the module-level
    variable `closing_stock`, and draws it as a blue line.
    """
    global closing_stock

    # Cast to float32 and reshape to a single column.
    closing_stock = modified_df.Close.values.astype('float32').reshape(-1, 1)

    plt.xlabel('Time')
    plt.ylabel("Twitter close stock prices")
    plt.title('prices Vs Time')
    plt.grid(True)
    plt.plot(closing_stock, 'b')
    plt.show()

plotter()

In [None]:
plt.figure(figsize=(12, 6))

plt.plot(
    modified_df.groupby("year")["High"].max(),
    color="green",
    linewidth=2,
    label="High",
)
plt.plot(
    modified_df.groupby("year")["Low"].min(),
    color="red",
    linewidth=2,
    label="Low",
)
plt.plot(
    modified_df.groupby("year")["Open"].max(),
    color="green",
    linestyle="--",
    linewidth=1,
    label="Open",
)
plt.plot(
    modified_df.groupby("year")["Close"].min(),
    color="red",
    linestyle="--",
    linewidth=1,
    label="Close",
)

plt.xlabel("Years")
plt.ylabel("Values in $")
plt.title("Stock Prices by Year")
plt.legend()
plt.grid(True)
plt.show()

### **Splitting the Data**

In [None]:
n_train = int(len(closing_stock) * 0.80)
n_remaining = len(closing_stock) - n_train

n_val = int(n_remaining*0.50)
n_test = n_remaining - n_val 
print("Train samples:",n_train, "Validation Samples:",n_val,"Test Samples:", n_test)

In [None]:
train_data = closing_stock[0:n_train]
print(train_data.shape)

In [None]:
val_data = closing_stock[n_train:n_train+n_val]
print(val_data.shape)

In [None]:
test_data = closing_stock[n_train+n_val:]

print(test_data.shape)

### Feature Scaling

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
train = scaler.fit_transform(train_data)

In [None]:
val = scaler.transform(val_data)

In [None]:
test = scaler.transform(test_data)

### Create the Datasets

In [None]:
train

In [None]:
def create_dataset(data , n_features):
    """Build sliding-window samples for one-step-ahead prediction.

    Each sample consists of `n_features` consecutive values from the first
    column of `data`; its target is the value immediately following the
    window.

    Args:
        data: 2-D array-like of shape (n_rows, n_columns); only column 0 is
            used.
        n_features: Number of consecutive past values in each input window.

    Returns:
        A tuple `(dataX, dataY)` where `dataX` has shape
        (n_samples, n_features) and `dataY` has shape (n_samples,), with
        n_samples = max(n_rows - n_features, 0).
    """
    series = np.asarray(data)[:, 0]

    # Every window that still has a following value is a valid sample, so
    # there are exactly len(series) - n_features of them (none if the series
    # is too short).
    n_samples = max(len(series) - n_features, 0)
    if n_samples == 0:
        # Keep the 2-D shape so callers can still read dataX.shape[1].
        return (
            np.empty((0, n_features), dtype=series.dtype),
            np.empty((0,), dtype=series.dtype),
        )

    dataX = np.array([series[i:i + n_features] for i in range(n_samples)])
    dataY = series[n_features:].copy()
    return dataX, dataY

In [None]:
n_features = 2
trainX, trainY = create_dataset(train, n_features)
valX, valY = create_dataset(val, n_features)
testX, testY = create_dataset(test, n_features)

In [None]:
print(trainX.shape , trainY.shape , valX.shape , valY.shape, testX.shape , testY.shape)

In [None]:
trainX = trainX.reshape(trainX.shape[0] , 1 ,trainX.shape[1])
valX = valX.reshape(valX.shape[0] , 1 ,valX.shape[1])
testX = testX.reshape(testX.shape[0] , 1 ,testX.shape[1])

In [None]:
print(trainX.shape , trainY.shape , valX.shape , valY.shape, testX.shape , testY.shape)

# Build the Model

In [None]:
model = keras.models.Sequential()

In [None]:
# Stacked GRU regressor: three recurrent layers, each followed by dropout,
# feeding a single linear output unit.

# First GRU layer: the only layer that declares the input shape
# (1 time step, n_features values per step).
model.add(layers.GRU(units=100, return_sequences=True, input_shape=(1,n_features), activation='tanh'))
model.add(layers.Dropout(0.2))

# Second GRU layer: its input shape is inferred from the previous layer.
# It still returns the full sequence so the next GRU receives 3-D input.
model.add(layers.GRU(units=150, return_sequences=True, activation='tanh'))
model.add(layers.Dropout(0.2))

# Third GRU layer: returns only the last output, collapsing the time axis.
model.add(layers.GRU(units=100, activation='tanh'))
model.add(layers.Dropout(0.2))

# The output layer: one linear unit producing the predicted (scaled) value.
model.add(layers.Dense(units=1, kernel_initializer='he_uniform', activation='linear'))

In [None]:
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.0001), metrics=['mean_squared_error'])

In [None]:
print(model.summary())

In [None]:
history = model.fit(trainX,trainY,epochs=100,batch_size=128, verbose=1, validation_data = (valX,valY))

# Evaluate the Model

In [None]:
import math

def model_score(model, X_train, y_train, X_val, y_val , X_test, y_test):
    """Print the MSE and RMSE of `model` on the train, validation and test splits.

    For each split, `model.evaluate` is called with `verbose=0`; the first
    element of its result is reported as the MSE and its square root as the
    RMSE.

    Args:
        model: Object exposing `evaluate(X, y, verbose=...)` that returns a
            sequence whose first element is the loss.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets.
        X_test, y_test: Test inputs and targets.
    """
    splits = (
        ('Train Score:', X_train, y_train),
        ('Validation Score:', X_val, y_val),
        ('Test Score:', X_test, y_test),
    )

    for heading, inputs, targets in splits:
        print(heading)
        mse = model.evaluate(inputs, targets, verbose=0)[0]
        print("MSE: {:.5f} , RMSE: {:.2f}".format(mse, math.sqrt(mse)))


model_score(model, trainX, trainY ,valX, valY , testX, testY)

# Visualizing Loss vs Epochs

In [None]:
print(history.history.keys())

In [None]:
plt.plot(history.history['loss'])  # plotting train loss
plt.plot(history.history['val_loss'])  # plotting validation loss

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
pred = model.predict(testX)
pred = scaler.inverse_transform(pred)
print(pred[:10])

In [None]:
testY_actual = testY.reshape(testY.shape[0] , 1)
testY_actual = scaler.inverse_transform(testY_actual)
print(testY_actual[:10])

In [None]:
plt.plot(testY_actual , 'b')
plt.plot(pred , 'r')

plt.xlabel('Time')
plt.ylabel('Stock Prices')
plt.title('Check the performance of the model with time')
plt.legend(['Actual', 'Predicted'], loc='upper left')

plt.grid(True)
plt.show()