# DATASET PREPARATION

## **SCRAPING TWEET SENTIMENT**

### **I. IMPORT LIBRARIES**

In [None]:
!pip install snscrape
!pip install demoji
!pip install transformers
!pip install transformers[sentencepiece]

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import demoji
import re
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
import attr
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

### **II. SCRAPING**

In [None]:
# Scraper for the search query: #AAPL, lang:en, since:2022-01-01 until:2022-12-18
scraper = sntwitter.TwitterSearchScraper('#AAPL since:2022-01-01 until:2022-12-18 lang:en')

# One row per scraped tweet: author, timestamp, like count, client label and text
attributes_container = [
    [tweet.user.username, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.content]
    for tweet in scraper.get_items()
]

# Creating a dataframe to load the list
tweets_df_2022 = pd.DataFrame(
    attributes_container,
    columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"],
)

In [None]:
# Preview the scraped tweets.
# NOTE(review): the saved output below lists tweets dated 2018 while the query above
# requests 2022 — the output looks stale; re-run the notebook to refresh it.
tweets_df_2022

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet
0,in_tradingview,2018-12-31 22:20:00+00:00,0,erased1143779,"#AAPL - BUY IF BREAKS 159.33 , target 161.71 -..."
1,HotHardware,2018-12-31 21:27:10+00:00,0,Twitter Web Client,The Real Reason For Soft #Apple #iPhone Demand...
2,BBSBRICK,2018-12-31 21:16:29+00:00,0,Twitter for iPhone,$SPY $QQQ $DIA #GDP #economy #aapl #googl http...
3,elliottwaves,2018-12-31 19:23:15+00:00,0,StockTwits Web,Group 3 instruments #Stocks &amp; #ETF's hourl...
4,elliottwaves,2018-12-31 19:21:05+00:00,0,StockTwits Web,Group 2 instruments hourly charts are updated ...
...,...,...,...,...,...
13706,StockMarketVP,2018-01-01 00:39:38+00:00,1,Twitter Web Client,GE Close To a Reversal Upward?\nhttps://t.co/1...
13707,StockMarketVP,2018-01-01 00:27:22+00:00,0,Twitter Web Client,BABA Has Lots of Air Under It\nhttps://t.co/1E...
13708,StockMarketVP,2018-01-01 00:23:00+00:00,0,Twitter Web Client,IBM Beginning a Run?\nhttps://t.co/1EGcnMdurR ...
13709,StockMarketVP,2018-01-01 00:14:03+00:00,0,Twitter Web Client,BIDU Holding Steady At a Neutral Bias\nhttps:/...


In [None]:
# Count missing values per column before cleaning the text.
tweets_df_2022.isnull().sum()

User               0
Date Created       0
Number of Likes    0
Source of Tweet    0
Tweet              0
dtype: int64

### **III. NLP**

In [None]:
# Open Multilingual Wordnet data — presumably required by the WordNet lemmatizer
# in the installed NLTK version; TODO confirm it is still needed.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Tagger model used by `pos_tag` inside get_wordnet_pos.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# WordNet corpus used by `WordNetLemmatizer` and the `wordnet` POS constants.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Stop-word lists read by `stopwords.words('english')` in clean_text.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Helper used by clean_text to pick the WordNet part of speech for each token
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``pos_tag`` and only the first letter
    of the resulting tag is inspected: J -> adjective, N -> noun, V -> verb,
    R -> adverb. Any other tag falls back to noun.
    """
    first_letter = pos_tag([word])[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN

In [None]:
# Cleaning tweets

# Negation words are kept in the text: dropping them changes a tweet's meaning.
_NEGATION_WORDS = frozenset([
    "shan't", 'shouldn', "shouldn't", 'wasn', 'weren', 'won', 'wouldn', 'aren',
    'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn',
    'mustn', "mustn't", 'needn', "needn't", "wouldn't", "won't", "weren't",
    "wasn't", "couldn", "not", "nor", "no", "mightn't", "isn't", "haven't",
    "hadn't", "hasn't", "didn't", "doesn't", "aren't", "don't", "couldn't",
    "never",
])
# Extra words removed on top of NLTK's English stop-word list.
_EXTRA_STOP_WORDS = ("i'll", "i'm", "should", "could")
# A link is "http(s)://" followed by non-whitespace characters, so only the link
# itself is removed and any text after it on the same line is kept.
_URL_PATTERN = re.compile(r"https?://\S+")
# Filled on first use so the stop-word corpus is read once, not once per tweet.
_STOP_WORDS_CACHE = None


def _tweet_stop_words():
    """Return the stop-word set: NLTK English words plus extras, minus negations."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        candidates = set(stopwords.words('english')).union(_EXTRA_STOP_WORDS)
        _STOP_WORDS_CACHE = frozenset(candidates - _NEGATION_WORDS)
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Normalise one tweet into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase; remove links; replace ``#`` and ``$`` with
    spaces; replace emojis with their descriptions; delete digits and ``-->``;
    tokenise with ``TweetTokenizer`` (handles stripped, repeated characters
    reduced); drop stop words (negations are kept), punctuation, tokens of
    length <= 2 and tokens containing a dot; lemmatise the remaining tokens
    using the POS from ``get_wordnet_pos``.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. None or NaN) yields "".

    Returns
    -------
    str
        The cleaned tweet; may be empty when every token is filtered out.
    """
    if not isinstance(text, str):
        return ""

    # Initialization the twitter tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # Initialization the lemmatizer
    lemmatizer = WordNetLemmatizer()
    stop_words = _tweet_stop_words()

    # Lowering tweets
    tweet = text.lower()
    # Removing links first, while "#" inside a URL fragment is still attached to it
    tweet = _URL_PATTERN.sub(" ", tweet)
    # Removing hashtag and cashtag symbols
    tweet = re.sub(r"[#$]", " ", tweet)
    # Translating emojis into their descriptions
    tweet = demoji.replace_with_desc(tweet)
    # Removing numerical values
    tweet = re.sub(r"[0-9]|-->", "", tweet)
    # Tokenize the tweet with the twitter tokenizer
    tokens = tk.tokenize(tweet)
    # Keep words that are not stop words or punctuation, are longer than 2 characters
    # and contain no dot, then lemmatize them
    kept = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in tokens
        if word not in stop_words
        and word not in string.punctuation
        and len(word) > 2
        and "." not in word
    ]
    # Return the tokens as one sentence
    return " ".join(kept)

In [None]:
# Clean every tweet, then save the result as a CSV in the current folder
tweets_df_2022['cleaned'] = tweets_df_2022["Tweet"].apply(clean_text)
tweets_df_2022.to_csv("CleanedNTweets2022.csv", index=False)

In [None]:
# Read the cleaned tweets back from the CSV written by the previous cell
# NOTE(review): a `cleaned` value that was an empty string presumably loads as NaN
# under read_csv's default NA handling — confirm how downstream scoring treats it.
tweets2022 = pd.read_csv("CleanedNTweets2022.csv")
tweets2022.head()

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet,cleaned
0,in_tradingview,2018-12-31 22:20:00+00:00,0,erased1143779,"#AAPL - BUY IF BREAKS 159.33 , target 161.71 -...",aapl buy break target tradingview
1,HotHardware,2018-12-31 21:27:10+00:00,0,Twitter Web Client,The Real Reason For Soft #Apple #iPhone Demand...,real reason soft apple iphone demand painfully...
2,BBSBRICK,2018-12-31 21:16:29+00:00,0,Twitter for iPhone,$SPY $QQQ $DIA #GDP #economy #aapl #googl http...,spy qqq dia gdp economy aapl googl
3,elliottwaves,2018-12-31 19:23:15+00:00,0,StockTwits Web,Group 3 instruments #Stocks &amp; #ETF's hourl...,group instrument stock etf's hourly chart upda...
4,elliottwaves,2018-12-31 19:21:05+00:00,0,StockTwits Web,Group 2 instruments hourly charts are updated ...,group instrument hourly chart update member vi...


In [None]:
# Identifier of the pretrained sentiment checkpoint passed to from_pretrained below
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Tokenizer and config for the checkpoint; `config.id2label` is used by polarity()
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# TensorFlow version of the classifier
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# Save model and tokenizer to a local directory named after MODEL
# (the saved output below lists the files written under that path)
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

All the layers of TFXLMRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


('cardiffnlp/twitter-xlm-roberta-base-sentiment/tokenizer_config.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/special_tokens_map.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/sentencepiece.bpe.model',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/added_tokens.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/tokenizer.json')

In [None]:
def polarity(text):
    """Classify ``text`` with the sentiment model and return ``(label, polarity)``.

    Parameters
    ----------
    text : object
        Tweet text. Missing values (``None`` or a float NaN) are scored as an
        empty string instead of the literal text "None"/"nan"; every other
        value is converted with ``str``.

    Returns
    -------
    tuple
        ``(label, plrty)`` where ``label`` is the ``config.id2label`` entry
        with the highest score and ``plrty`` is -1 for "negative", 1 for
        "positive" and 0 for any other label.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    encoded_input = tokenizer(str(text), return_tensors='tf')
    output = model(encoded_input)
    scores = output[0][0].numpy()

    # Softmax is monotonic, so the highest raw score is also the most probable label.
    best = int(np.argmax(scores))
    label = config.id2label[best]

    if label == "negative":
        plrty = -1
    elif label == "positive":
        plrty = 1
    else:
        plrty = 0
    return (label, plrty)

In [None]:
# Score every cleaned tweet, split the (label, polarity) pairs into two columns,
# then save the result as a CSV in the current folder
tweets2022['label'], tweets2022['Polarity'] = zip(*tweets2022['cleaned'].apply(polarity))
tweets2022.to_csv("polarizedTweets2022.csv", index=False)

## **SCRAPING APPLE STOCK PRICE**

### **I. IMPORT LIBRARIES**

In [None]:
!pip install yfinance

In [3]:
import yfinance as yf

### **II. SCRAPING**

In [4]:
# Fetch daily ('1d') AAPL price data using the same 2022-01-01 / 2022-12-18 bounds
# as the tweet query, and save it as apple_stock.csv for the merge step below.
data_apple = yf.download(tickers="AAPL", start='2022-01-01', end='2022-12-18', interval='1d')
data_apple.to_csv('apple_stock.csv')

[*********************100%***********************]  1 of 1 completed


## **COMBINE APPLE STOCK PRICE WITH TWEET SENTIMENT**

In [5]:
# Load the scored tweets. The tweet pipeline above names the timestamp column
# "Date Created", while the cells below address it as "Date", so the name is
# normalised here. The rename does nothing when "Date Created" is absent.
ptweets_2022 = pd.read_csv("polarizedTweets2022.csv").rename(columns={"Date Created": "Date"})
ptweets_2022

Unnamed: 0.1,Unnamed: 0,Date,Tweet,cleaned,label,Polarity
0,0,2022-12-17 11:48:25+00:00,#aapl $AAPL #apple Trend trigger daily 0% long...,aapl aapl apple trend trigger daily long weekl...,neutral,0
1,1,2022-12-17 11:12:42+00:00,Seeing some bullish divergence on the 1hr/2hr ...,see bullish divergence timeframes watch see ma...,neutral,0
2,2,2022-12-17 10:48:21+00:00,I was expecting this to happen. But I thought ...,expect happen thought liquidity would take fir...,neutral,0
3,3,2022-12-17 10:47:21+00:00,$AAPL seeing an uptick in chatter on wallstree...,aapl see uptick chatter wallstreetbets last hour,neutral,0
4,4,2022-12-17 09:37:04+00:00,Hotel? B &amp; B? Guesthouse? Add your website...,hotel guesthouse add website,neutral,0
...,...,...,...,...,...,...
16236,16236,2022-01-01 05:27:32+00:00,Happy New Year! Stay safe everyone and let’s g...,happy new year stay safe everyone let get read...,positive,1
16237,16237,2022-01-01 03:46:41+00:00,#CashAppyNewYear $bobbybrackets have a wonderf...,cashappynewyear bobbybrackets wonderful cashap...,positive,1
16238,16238,2022-01-01 02:45:44+00:00,"As we wrap up 2021, I am so grateful for all t...",wrap grateful wonderful people platform contin...,positive,1
16239,16239,2022-01-01 01:04:39+00:00,#freestock #AAPL #Tesla\nLIMITED TIME OFFER: G...,freestock aapl tesla limited time offer get fr...,neutral,0


In [8]:
# Order the tweets chronologically and renumber the rows from 0.
# NOTE(review): `Date` is still text at this point (it is parsed in a later cell),
# so this relies on the timestamp strings sorting correctly — confirm the format is uniform.
ptweets_2022 = ptweets_2022.sort_values(by='Date').reset_index(drop=True)

In [9]:
# Number of tweets per polarity value (-1 negative, 0 neutral, 1 positive)
ptweets_2022['Polarity'].value_counts()

 0    12734
-1     2454
 1     1053
Name: Polarity, dtype: int64

In [10]:
# Keep only the two columns needed for the daily aggregation
ptweets_df = ptweets_2022.loc[:,["Date","Polarity"]]
ptweets_df.head()

Unnamed: 0,Date,Polarity
0,2022-01-01 00:43:05+00:00,1
1,2022-01-01 01:04:39+00:00,0
2,2022-01-01 02:45:44+00:00,1
3,2022-01-01 03:46:41+00:00,1
4,2022-01-01 05:27:32+00:00,1


In [11]:
# Check the last rows to confirm the sort put the latest tweets at the end
ptweets_df.tail()

Unnamed: 0,Date,Polarity
16236,2022-12-17 09:37:04+00:00,0
16237,2022-12-17 10:47:21+00:00,0
16238,2022-12-17 10:48:21+00:00,0
16239,2022-12-17 11:12:42+00:00,0
16240,2022-12-17 11:48:25+00:00,0


In [12]:
# Parse the `Date` strings into datetimes. `infer_datetime_format` is not passed:
# it is deprecated in pandas 2.x and only ever affected parsing speed, not the result.
ptweets_df['Date'] = pd.to_datetime(ptweets_df['Date'])

In [13]:
# Inspect the frame after `Date` has been parsed into datetimes
ptweets_df

Unnamed: 0,Date,Polarity
0,2022-01-01 00:43:05+00:00,1
1,2022-01-01 01:04:39+00:00,0
2,2022-01-01 02:45:44+00:00,1
3,2022-01-01 03:46:41+00:00,1
4,2022-01-01 05:27:32+00:00,1
...,...,...
16236,2022-12-17 09:37:04+00:00,0
16237,2022-12-17 10:47:21+00:00,0
16238,2022-12-17 10:48:21+00:00,0
16239,2022-12-17 11:12:42+00:00,0


In [14]:
# Reduce each timestamp to its calendar day: format as YYYY-MM-DD text,
# then parse that text back into a datetime
day_strings = ptweets_df['Date'].dt.strftime("%Y-%m-%d")
ptweets_df['Date'] = pd.to_datetime(day_strings)

In [15]:
# Inspect the frame after `Date` has been reduced to calendar days
ptweets_df

Unnamed: 0,Date,Polarity
0,2022-01-01,1
1,2022-01-01,0
2,2022-01-01,1
3,2022-01-01,1
4,2022-01-01,1
...,...,...
16236,2022-12-17,0
16237,2022-12-17,0
16238,2022-12-17,0
16239,2022-12-17,0


In [16]:
# Daily sentiment table, indexed by Date; starts with the mean polarity per day
daily_mean = ptweets_df.groupby('Date')['Polarity'].mean()
Pol_df = pd.DataFrame({"P_mean": daily_mean})

In [17]:
# Net sentiment per day: the sum of that day's polarity values
Pol_df['P_sum'] = ptweets_df.groupby('Date')['Polarity'].sum()

In [18]:
# Number of scored tweets per day
Pol_df['twt_count'] = ptweets_df.groupby('Date')['Polarity'].count()

In [19]:
# Inspect the daily sentiment table (mean, sum and tweet count per day)
Pol_df

Unnamed: 0_level_0,P_mean,P_sum,twt_count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,0.450000,9,20
2022-01-02,-0.090909,-1,11
2022-01-03,0.020619,2,97
2022-01-04,0.051282,4,78
2022-01-05,-0.023256,-1,43
...,...,...,...
2022-12-13,-0.039474,-3,76
2022-12-14,-0.046875,-3,64
2022-12-15,0.024390,2,82
2022-12-16,-0.125000,-11,88


In [20]:
# Read the Apple price data and reduce its `Date` column to timezone-free calendar
# days (via YYYY-MM-DD text) so the values line up with the daily index of Pol_df.
apple_df = pd.read_csv('apple_stock.csv')
trading_days = pd.to_datetime(apple_df['Date'])
apple_df['Date'] = pd.to_datetime(trading_days.dt.strftime("%Y-%m-%d"))

# Add the daily sentiment columns to the price data. `Date` stays a regular column:
# join(on='Date') matches it against Pol_df's index, and how="inner" keeps only the
# days present in both frames.
final_df = apple_df.join(Pol_df, on='Date', how="inner")
final_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
0,2022-01-03,177.830002,182.880005,177.710007,182.009995,180.959732,104487900,0.020619,2,97
1,2022-01-04,182.630005,182.940002,179.119995,179.699997,178.663071,99310400,0.051282,4,78
2,2022-01-05,179.610001,180.169998,174.639999,174.919998,173.91066,94537600,-0.023256,-1,43
3,2022-01-06,172.699997,175.300003,171.639999,172.0,171.007507,96904000,-0.0625,-3,48
4,2022-01-07,172.889999,174.139999,171.029999,172.169998,171.176544,86709100,0.05,2,40


In [21]:
# Check the last rows of the merged frame
final_df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
237,2022-12-12,142.699997,144.5,141.059998,144.490005,144.490005,70462700,-0.066667,-4,60
238,2022-12-13,149.5,149.970001,144.240005,145.470001,145.470001,93886200,-0.039474,-3,76
239,2022-12-14,145.350006,146.660004,141.160004,143.210007,143.210007,82291200,-0.046875,-3,64
240,2022-12-15,141.110001,141.800003,136.029999,136.5,136.5,98931900,0.02439,2,82
241,2022-12-16,136.690002,137.649994,133.729996,134.509995,134.509995,160080100,-0.125,-11,88


In [22]:
# Save the final CSV file containing the price data and the daily tweet polarity columns
final_df.to_csv("Final_apple_stock_2022.csv",index=False)