In [37]:
!pip install streamlit



In [38]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import math
import time
import tensorflow as tf
from tensorflow.keras.layers import GRU, LSTM, Bidirectional, Dense, Flatten, Conv1D, BatchNormalization, LeakyReLU, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras.utils import plot_model
from pickle import load,dump
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import statsmodels.api as sm
from math import sqrt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata

import warnings
warnings.filterwarnings("ignore")

st.set_page_config(page_title="Stock Price Visualization")

st.title("Stock Closing Prices Over Time")

# file_path = os.path.join("..", "dataFiles", "stock_yfinance_data.csv")
file_path = "/content/stock_yfinance_data.csv"

df = pd.read_csv(file_path)
df["Date"] = pd.to_datetime(df["Date"]).dt.date
df = df.sort_values(by="Date")

st.subheader("Data Information")
buffer = st.expander("Show Data Info")
with buffer:
    df_info = pd.DataFrame({
        'Column': df.columns,
        'Non-Null Count': df.count()
    })
    df_info['Type'] = [str(dtype) for dtype in df.dtypes]
    st.dataframe(df_info)

st.subheader("Data Preview")
st.dataframe(df.head())

st.subheader("Stock Closing Prices")
fig, ax = plt.subplots(figsize=(10, 6))

for stock in df["Stock Name"].unique():
    stock_df = df[df["Stock Name"] == stock]
    ax.plot(stock_df["Date"], stock_df["Close"], label=stock)

ax.set_xlabel("Date")
ax.set_ylabel("Close Price")
ax.set_title("Stock Closing Prices Over Time")
ax.legend(loc="upper left")

st.pyplot(fig)

st.subheader("Stock Summary Statistics")
st.dataframe(df.groupby("Stock Name")["Close"].describe())



DeltaGenerator()

In [39]:
df.sample(3)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name
5983,2022-06-29,153.679993,156.149994,150.899994,154.949997,154.949997,1796800,ZS
5784,2022-09-14,21.469999,22.09,21.030001,21.940001,21.940001,47206700,NIO
2357,2022-02-07,122.559998,124.099998,121.879997,122.309998,120.404205,6562900,TSM


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6300 entries, 0 to 6299
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        6300 non-null   object 
 1   Open        6300 non-null   float64
 2   High        6300 non-null   float64
 3   Low         6300 non-null   float64
 4   Close       6300 non-null   float64
 5   Adj Close   6300 non-null   float64
 6   Volume      6300 non-null   int64  
 7   Stock Name  6300 non-null   object 
dtypes: float64(5), int64(1), object(2)
memory usage: 443.0+ KB


In [41]:
df2 = pd.read_csv("/content/stock_tweets.csv")
df2.sample(3)

Unnamed: 0,Date,Tweet,Stock Name,Company Name
10657,2022-05-23 19:39:00+00:00,$TSLA has surpassed 1 million option contracts...,TSLA,"Tesla, Inc."
4866,2022-08-02 23:21:54+00:00,"Hey @elonmusk, I need my @tesla to have a draf...",TSLA,"Tesla, Inc."
38929,2022-05-02 20:08:06+00:00,The U.S. Senate Budget Committee will hold a h...,MSFT,Microsoft Corporation


In [42]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80793 entries, 0 to 80792
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          80793 non-null  object
 1   Tweet         80793 non-null  object
 2   Stock Name    80793 non-null  object
 3   Company Name  80793 non-null  object
dtypes: object(4)
memory usage: 2.5+ MB


In [43]:
all_tweets = df2

In [44]:
print(all_tweets.shape)
all_tweets.head()

(80793, 4)


Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."


## Enter Stock name you want to analyze

In [45]:
df = all_tweets[all_tweets['Stock Name'].isin(['TSLA'])]

In [46]:
sent_df = df.copy()
sent_df["sentiment_score"] = ''
sent_df["Negative"] = ''
sent_df["Neutral"] = ''
sent_df["Positive"] = ''
sent_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name,sentiment_score,Negative,Neutral,Positive
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",,,,
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",,,,
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",,,,
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",,,,
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",,,,


#### To get sentiment (polarity) scores, we use VADER (Valence Aware Dictionary for Sentiment Reasoning) model. VADER is a model used for text sentiment analysis that is sensitive to both polarity (positive/negative) and intensity (strength) of emotion. It is available in the NLTK package and can be applied directly to unlabeled text data.

In [47]:
for indx, row in sent_df.T.items():
    print(f"{indx} {row}")
    break

0 Date                                       2022-09-29 23:41:16+00:00
Tweet              Mainstream media has done an amazing job at br...
Stock Name                                                      TSLA
Company Name                                             Tesla, Inc.
sentiment_score                                                     
Negative                                                            
Neutral                                                             
Positive                                                            
Name: 0, dtype: object


In [48]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [49]:
%%time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata

sentiment_analyzer = SentimentIntensityAnalyzer()

# Loop over rows, no need to transpose the DataFrame
for indx, row in sent_df.iterrows():
    try:
        sentence_i = unicodedata.normalize('NFKD', row['Tweet'])
        sentence_sentiment = sentiment_analyzer.polarity_scores(sentence_i)

        sent_df.at[indx, 'sentiment_score'] = sentence_sentiment['compound']
        sent_df.at[indx, 'Negative'] = sentence_sentiment['neg']
        sent_df.at[indx, 'Neutral'] = sentence_sentiment['neu']
        sent_df.at[indx, 'Positive'] = sentence_sentiment['pos']

    except TypeError:
        print(f"Invalid Tweet at index {indx}: {row['Tweet']}")
        break


CPU times: user 28.1 s, sys: 257 ms, total: 28.4 s
Wall time: 40.6 s


In [50]:
sent_df['Date'] = pd.to_datetime(sent_df['Date'])
sent_df['Date'] = sent_df['Date'].dt.date
sent_df = sent_df.drop(columns=['Stock Name', 'Company Name'],errors='ignore')


In [51]:
twitter_df = sent_df.groupby('Date')[['sentiment_score', 'Negative', 'Neutral', 'Positive']].mean()
print(twitter_df.shape)

(365, 4)


### Data Set for sentiment analysis

In [52]:
twitter_df.sample(3)

Unnamed: 0_level_0,sentiment_score,Negative,Neutral,Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03-10,0.148545,0.056971,0.850431,0.092608
2021-11-19,0.177351,0.043146,0.841797,0.115073
2021-12-10,0.220661,0.045651,0.85289,0.101459
