# Importing Libraries

In [1]:
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the tweet dataset; the relative path means the CSV must sit in the notebook's working directory
df1 = pd.read_csv("stock_tweets.csv")

In [3]:
# Preview the first five rows
df1.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."


In [4]:
# Column names available in the dataset
df1.columns

Index(['Date', 'Tweet', 'Stock Name', 'Company Name'], dtype='object')

In [5]:
# Number of rows that repeat an earlier row in every column
df1.duplicated().sum()

0

In [6]:
# Missing values per column
df1.isnull().sum()

Date            0
Tweet           0
Stock Name      0
Company Name    0
dtype: int64

In [7]:
# Row count, per-column non-null counts, dtypes and memory footprint
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80793 entries, 0 to 80792
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          80793 non-null  object
 1   Tweet         80793 non-null  object
 2   Stock Name    80793 non-null  object
 3   Company Name  80793 non-null  object
dtypes: object(4)
memory usage: 2.5+ MB


In [8]:
# Number of distinct values in each column
df1.nunique()

Date            64424
Tweet           64479
Stock Name         25
Company Name       25
dtype: int64

In [9]:
# Distinct values of the Stock Name column
df1["Stock Name"].unique()

array(['TSLA', 'MSFT', 'PG', 'META', 'AMZN', 'GOOG', 'AMD', 'AAPL',
       'NFLX', 'TSM', 'KO', 'F', 'COST', 'DIS', 'VZ', 'CRM', 'INTC', 'BA',
       'BX', 'NOC', 'PYPL', 'ENPH', 'NIO', 'ZS', 'XPEV'], dtype=object)

In [10]:
# Rows (tweets) per Stock Name, most frequent first
df1["Stock Name"].value_counts()

TSLA    37422
TSM     11034
AAPL     5056
PG       4089
AMZN     4089
MSFT     4089
NIO      3021
META     2751
AMD      2227
NFLX     1727
GOOG     1291
PYPL      843
DIS       635
BA        399
COST      393
INTC      315
KO        310
CRM       233
XPEV      225
ENPH      216
ZS        193
VZ        123
BX         50
NOC        31
F          31
Name: Stock Name, dtype: int64

In [48]:
# Rows (tweets) per Company Name, most frequent first
df1["Company Name"].value_counts()

Tesla, Inc.                                           37422
Taiwan Semiconductor Manufacturing Company Limited    11034
Apple Inc.                                             5056
Procter & Gamble Company                               4089
Amazon.com, Inc.                                       4089
Microsoft Corporation                                  4089
NIO Inc.                                               3021
Meta Platforms, Inc.                                   2751
Advanced Micro Devices, Inc.                           2227
Netflix, Inc.                                          1727
Alphabet Inc.                                          1291
PayPal Holdings, Inc.                                   843
The Walt Disney Company                                 635
The Boeing Company                                      399
Costco Wholesale Corporation                            393
Intel Corporation                                       315
The Coca-Cola Company                   

In [39]:
# Count tweets per ticker once and give both columns explicit names, so the
# axis labels do not depend on how pandas names the value_counts() result.
stock_counts = (
    df1['Stock Name']
    .value_counts()
    .rename_axis('Stock Name')
    .reset_index(name='Count')
)

# Bar chart of tweet counts per ticker (value_counts order: most frequent first)
fig = px.bar(stock_counts,
             x='Stock Name',
             y='Count',
             title='Countplot of Stock Names')

# Show the figure
fig.show()


In [49]:
# Aggregate before plotting: one row per company instead of one pie label per
# tweet, which keeps the figure payload small.
company_counts = (
    df1['Company Name']
    .value_counts()
    .rename_axis('Company Name')
    .reset_index(name='Tweets')
)

# Title and size are set once here rather than repeated in update_layout.
fig = px.pie(
    company_counts,
    names='Company Name',
    values='Tweets',
    title='Stock Name Distribution',
    height=800,
    width=1000,
)
fig.show()

<div class="alert alert-block alert-info">
<b>NOTE:</b> 
Based on the pie chart showing the distribution of tweets among different companies, several insights can be derived:

- **Dominance of Tesla Inc. (46.3%):** Tesla Inc. receives the highest proportion of tweets among all companies, indicating a significant level of public interest and attention towards Tesla's activities, products, and announcements. This suggests that Tesla is a highly discussed and influential company within the context of the stock market on social media platforms.

- **Taiwan Semiconductor Manufacturing Company Limited (13.7%):** Taiwan Semiconductor Manufacturing Company Limited is the second most talked-about company in the dataset. This could indicate a focus on semiconductor industry trends, given the company's significance in the global semiconductor manufacturing landscape.

- **Apple Inc. (6.26%):** Apple Inc. receives a notable proportion of tweets, reflecting the enduring popularity and market presence of the tech giant. Discussion around Apple may include product launches, financial performance, and industry developments.

- **Procter & Gamble Company, Amazon.com, and Microsoft Corporation (5.06% each):** These three companies receive a similar share of tweets, suggesting a balanced level of attention across different sectors. Procter & Gamble represents the consumer goods sector, Amazon.com is a major player in e-commerce and technology, and Microsoft Corporation is a leading software and cloud services provider. The consistent level of discussion around these companies may indicate ongoing market interest and activity in their respective industries.

- **Other Companies:** The remaining 19 companies collectively account for about 18.6% of the tweets. While they may not individually receive as much attention as the top companies, their presence highlights a diverse range of stocks being discussed in the dataset. Further analysis could explore the specific trends and factors driving discussions around these companies.
</div>

In [50]:
# Parse the raw timestamp strings into datetimes and store them back in the Date column
parsed_dates = pd.to_datetime(df1['Date'])
df1['Date'] = parsed_dates

In [51]:
# Work on a copy so that index/time-series manipulation leaves df1 untouched
df1_copy = df1.copy()
# Display the copy (pandas shows the first and last rows of a long frame)
df1_copy

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."
...,...,...,...,...
80788,2021-10-07 17:11:57+00:00,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.
80789,2021-10-04 17:05:59+00:00,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.
80790,2021-10-01 04:43:41+00:00,Our record delivery results are a testimony of...,XPEV,XPeng Inc.
80791,2021-10-01 00:03:32+00:00,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.


The timestamp format "2022-09-29 23:41:16+00:00" follows the ISO 8601 layout for a date and time with a time zone offset (written with a space rather than the standard `T` between the date and the time). Here's a breakdown of its components:

1. "2022": Denotes the year.
2. "09": Represents the month, specifically September.
3. "29": Indicates the day of the month.
4. "23": Specifies the hour in 24-hour format, which in this case, corresponds to 11:00 PM.
5. "41": Represents the minute.
6. "16": Indicates the second.
7. "+00:00": This segment denotes the time zone offset. In this instance, it signifies Coordinated Universal Time (UTC), with no deviation from UTC. Therefore, the provided timestamp corresponds to September 29, 2022, at 11:41:16 PM (23:41:16) in the UTC time zone.






In [52]:
# Build the Date-indexed frame from df1 without inplace mutation so this cell
# can be re-run safely: set_index('Date', inplace=True) raises KeyError on a
# second run because 'Date' is no longer a column. set_index returns a new
# frame, so df1 itself is left unchanged.
df1_copy = df1.set_index('Date')

In [53]:
# Earliest and latest timestamps in the Date index
start_date, end_date = df1_copy.index.min(), df1_copy.index.max()

In [54]:
# Report the time span covered by the dataset
for label, value in (("Start Date of Stocks:", start_date),
                     ("End Date of Stocks:", end_date)):
    print(label, value)

Start Date of Stocks: 2021-09-30 00:06:02+00:00
End Date of Stocks: 2022-09-29 23:41:16+00:00


In [55]:
# Module-level cursor and week length.
# NOTE(review): visualize_weekly_tweets computes its own week length and start
# date internally and does not read these; confirm whether any later cell does.
current_date = start_date
week_delta = pd.Timedelta(weeks=1)

In [58]:
def visualize_weekly_tweets(df):
    """Show one bar chart of daily row counts for every 7-day window in ``df``.

    Windows start at midnight of the day holding the earliest timestamp and
    advance in 7-day steps until the latest timestamp is covered. Each window
    is half-open, ``[start, start + 7 days)``, so it spans exactly seven
    calendar days and lines up with the daily bins of ``resample('D')``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data indexed by a ``DatetimeIndex``; each row is counted as one tweet.

    Returns
    -------
    None
        Figures are displayed with ``fig.show()``; nothing is returned.
    """
    # Nothing to plot, and min()/max() of an empty index are not real dates.
    if df.empty:
        return

    week_span = pd.Timedelta(days=7)
    one_day = pd.Timedelta(days=1)

    # Snap the first window to midnight. Starting at the raw earliest
    # timestamp (e.g. 00:06:02) made every window straddle eight calendar
    # days, so the boundary day's tweets were split across two figures.
    week_start = df.index.min().normalize()
    last_timestamp = df.index.max()

    while week_start <= last_timestamp:
        week_end = week_start + week_span  # exclusive upper bound

        # Rows falling inside the current window, counted per calendar day
        in_window = (df.index >= week_start) & (df.index < week_end)
        daily_counts = df[in_window].resample('D').size()

        # Label the window with its last included day, not the exclusive bound
        first_day = week_start.strftime("%Y-%m-%d")
        last_day = (week_end - one_day).strftime("%Y-%m-%d")

        fig = px.bar(daily_counts, x=daily_counts.index, y=daily_counts.values,
                     labels={'x': 'Date', 'y': 'Number of Tweets'},
                     title=f'Tweets for Week {first_day} to {last_day}')
        fig.update_layout(xaxis_tickangle=-45)
        fig.show()

        # Move to the next week
        week_start = week_end


In [59]:
# Renders one figure per 7-day window, so expect roughly one chart per week of the dataset's time span
visualize_weekly_tweets(df1_copy)

The `visualize_weekly_tweets` function takes a DataFrame containing tweet data with a datetime index as input. It iterates through each week within the dataset, filtering the tweet data for each week and computing the count of tweets per day using resampling. Subsequently, it generates a bar plot using Plotly Express to visualize the daily tweet counts for each week, with the x-axis representing dates and the y-axis representing the number of tweets. The function then moves on to the next week until it has covered the entire duration of the dataset, providing an overview of weekly tweet activity over time.

In [60]:
# Rows (tweets) per calendar day across the whole dataset
daily_counts = df1_copy.resample('D').size()

# Single sky-blue bar trace, one bar per day
fig = go.Figure(
    data=go.Bar(
        x=daily_counts.index,
        y=daily_counts.values,
        marker=dict(color='skyblue'),
    )
)
fig.update_layout(
    title='Number of Tweets Day-wise',
    xaxis=dict(title='Date', tickangle=-45),
    yaxis=dict(title='Number of Tweets'),
)
fig.show()