<a href="https://colab.research.google.com/github/samihahaha/CSIT696/blob/main/1_MainTeslaSentimentUsingTweetRoberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('tesla_sentiment.csv', encoding='utf-8')

In [None]:
df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch
import numpy as np


# Load model and tokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Function to perform sentiment analysis
def sentiment_score(Tweet):
    encoded_Tweet = tokenizer(Tweet, return_tensors='pt')
    output = model(**encoded_Tweet)
    scores = softmax(output.logits, dim=1).detach().numpy()[0]
    # Mapping scores to sentiment labels
    labels = ['negative', 'neutral', 'positive']
    sentiment = labels[np.argmax(scores)]
    return sentiment, scores[np.argmax(scores)]

# Apply sentiment analysis
df['sentiment'], df['sentiment_score'] = zip(*df['Tweet'].apply(sentiment_score))


In [None]:
df.head()


Unnamed: 0,Date,Tweet,Stock Name,Company Name,sentiment,sentiment_score
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",negative,0.528979
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",neutral,0.719146
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",neutral,0.838153
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",negative,0.913476
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",negative,0.971596


In [None]:
df.to_csv('processed_tesla_sentiment.csv', index=False)


In [None]:
# Converting date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Indexing sentiment scores by date
daily_sentiment = df.groupby('Date')['sentiment_score'].mean().reset_index()


In [None]:
daily_sentiment.head()

Unnamed: 0,Date,sentiment_score
0,2021-09-30 01:16:13+00:00,0.761609
1,2021-09-30 01:38:26+00:00,0.787107
2,2021-09-30 01:59:02+00:00,0.788975
3,2021-09-30 02:40:26+00:00,0.628954
4,2021-09-30 02:52:38+00:00,0.880523


In [None]:
daily_sentiment.tail()

Unnamed: 0,Date,sentiment_score
37265,2022-09-29 22:27:05+00:00,0.971596
37266,2022-09-29 22:40:07+00:00,0.913476
37267,2022-09-29 23:18:08+00:00,0.838153
37268,2022-09-29 23:24:43+00:00,0.719146
37269,2022-09-29 23:41:16+00:00,0.528979


In [None]:
import yfinance as yf


In [None]:
ticker_symbol = 'TSLA'

start_date = '2021-09-30'

end_date = '2022-09-29'

tesla_stock = yf.download(ticker_symbol, start= start_date, end= end_date)

print(tesla_stock.head())

[*********************100%%**********************]  1 of 1 completed

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2021-09-30  260.333344  263.043335  258.333344  258.493347  258.493347   
2021-10-01  259.466675  260.260010  254.529999  258.406677  258.406677   
2021-10-04  265.500000  268.989990  258.706665  260.510010  260.510010   
2021-10-05  261.600006  265.769989  258.066681  260.196655  260.196655   
2021-10-06  258.733337  262.220001  257.739990  260.916656  260.916656   

              Volume  
Date                  
2021-09-30  53868000  
2021-10-01  51094200  
2021-10-04  91449900  
2021-10-05  55297800  
2021-10-06  43898400  





In [None]:
tesla_stock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       251 non-null    datetime64[ns]
 1   Open       251 non-null    float64       
 2   High       251 non-null    float64       
 3   Low        251 non-null    float64       
 4   Close      251 non-null    float64       
 5   Adj Close  251 non-null    float64       
 6   Volume     251 non-null    int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 13.9 KB


In [None]:



tesla_stock.to_csv('tesla_stock_price_for_sentiment_alignment.csv', index=True)

In [None]:
tesla_stock.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000
2021-10-01,259.466675,260.26001,254.529999,258.406677,258.406677,51094200
2021-10-04,265.5,268.98999,258.706665,260.51001,260.51001,91449900
2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800
2021-10-06,258.733337,262.220001,257.73999,260.916656,260.916656,43898400


In [None]:
tesla_stock.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-09-22,299.859985,301.290009,285.820007,288.589996,288.589996,70545400
2022-09-23,283.089996,284.5,272.820007,275.329987,275.329987,63748400
2022-09-26,271.829987,284.089996,270.309998,276.01001,276.01001,58076900
2022-09-27,283.839996,288.670013,277.51001,282.940002,282.940002,61925200
2022-09-28,283.079987,289.0,277.570007,287.809998,287.809998,54664800


In [None]:
tesla_stock.reset_index(inplace=True)

In [None]:
tesla_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000
1,2021-10-01,259.466675,260.26001,254.529999,258.406677,258.406677,51094200
2,2021-10-04,265.5,268.98999,258.706665,260.51001,260.51001,91449900
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800
4,2021-10-06,258.733337,262.220001,257.73999,260.916656,260.916656,43898400


In [None]:
tesla_stock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       251 non-null    datetime64[ns]
 1   Open       251 non-null    float64       
 2   High       251 non-null    float64       
 3   Low        251 non-null    float64       
 4   Close      251 non-null    float64       
 5   Adj Close  251 non-null    float64       
 6   Volume     251 non-null    int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 13.9 KB


In [None]:
daily_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37270 entries, 0 to 37269
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   Date             37270 non-null  datetime64[ns, UTC]
 1   sentiment_score  37270 non-null  float32            
dtypes: datetime64[ns, UTC](1), float32(1)
memory usage: 436.9 KB


In [None]:
daily_sentiment['Date'] = pd.to_datetime(daily_sentiment['Date'])
aggregated_sentiment = daily_sentiment.groupby('Date')['sentiment_score'].mean().reset_index()

In [None]:
aggregated_sentiment.head()

NameError: name 'aggregated_sentiment' is not defined

In [None]:
tesla_stock['Date'] = pd.to_datetime(tesla_stock['Date'])


In [None]:
# Convert timezone-aware to timezone-naive by removing timezone
tesla_stock['Date'] = tesla_stock['Date'].dt.date
aggregated_sentiment['Date'] = aggregated_sentiment['Date'].dt.date




In [None]:
aggregated_sentiment.head()

Unnamed: 0,Date,sentiment_score
0,2021-09-30,0.761609
1,2021-09-30,0.787107
2,2021-09-30,0.788975
3,2021-09-30,0.628954
4,2021-09-30,0.880523


In [None]:
tesla_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000
1,2021-10-01,259.466675,260.26001,254.529999,258.406677,258.406677,51094200
2,2021-10-04,265.5,268.98999,258.706665,260.51001,260.51001,91449900
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800
4,2021-10-06,258.733337,262.220001,257.73999,260.916656,260.916656,43898400


In [None]:
merged_df = pd.merge(tesla_stock, aggregated_sentiment, on='Date', how='inner')


In [None]:
merged_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_score
0,2021-10-18,283.929993,291.753326,283.823334,290.036682,290.036682,72621600,0.874509
1,2021-10-25,316.843323,348.339996,314.733337,341.619995,341.619995,188556300,0.92377
2,2021-12-16,331.5,331.660004,307.283325,308.973328,308.973328,82771500,0.516029
3,2022-08-30,287.869995,288.480011,272.649994,277.700012,277.700012,50541800,0.956477


In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29796 entries, 0 to 29795
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             29796 non-null  object 
 1   Open             29796 non-null  float64
 2   High             29796 non-null  float64
 3   Low              29796 non-null  float64
 4   Close            29796 non-null  float64
 5   Adj Close        29796 non-null  float64
 6   Volume           29796 non-null  int64  
 7   sentiment_score  29796 non-null  float32
dtypes: float32(1), float64(5), int64(1), object(1)
memory usage: 1.7+ MB


In [None]:
merged_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_score
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,0.761609
1,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,0.787107
2,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,0.788975
3,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,0.628954
4,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,0.880523


In [None]:
merged_df.to_csv('merged_sentiment_stock.csv', index=True)

In [None]:
import pandas as pd


In [None]:
merged_df = pd.read_csv('merged_sentiment_stock.csv')

In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29796 entries, 0 to 29795
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       29796 non-null  int64  
 1   Date             29796 non-null  object 
 2   Open             29796 non-null  float64
 3   High             29796 non-null  float64
 4   Low              29796 non-null  float64
 5   Close            29796 non-null  float64
 6   Adj Close        29796 non-null  float64
 7   Volume           29796 non-null  int64  
 8   sentiment_score  29796 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 2.0+ MB


In [None]:
merged_df['Date'] = pd.to_datetime(merged_df['Date'])
merged_df.set_index('Date', inplace=True)