# DATASET PREPARATION

## **SCRAPING TWEET SENTIMENT**

### **I. IMPORT LIBRARIES**

In [None]:
!pip install snscrape
!pip install demoji
!pip install transformers
!pip install transformers[sentencepiece]

In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import demoji
import re
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
import attr
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

### **II. SCRAPING**

In [3]:
# Creating list to append tweet data to
attributes_container = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('#AAPL since:2021-01-01 until:2022-01-01 lang:en').get_items()):
    attributes_container.append([tweet.user.username, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.content])
    
# Creating a dataframe to load the list
tweets_df_2021 = pd.DataFrame(attributes_container, columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"])

In [14]:
tweets_df_2021

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet,cleaned
0,topstonks,2021-12-31 23:45:19+00:00,0,topstonks,$AAPL was the 11th most mentioned on wallstree...,aapl mention wallstreetbets last hour via
1,CrazySa27066746,2021-12-31 22:39:28+00:00,0,Twitter for iPhone,Happy new year everyone!! We will grow our kno...,happy new year everyone grow knowledge portfol...
2,AAPL_moves,2021-12-31 22:15:43+00:00,0,IFTTT,"Apple Inc price at close, 2021-12-31, is 177.5...",apple inc price close apple aapl
3,AAPLsilicon,2021-12-31 21:30:00+00:00,0,Twitter Web App,$AAPL closed today at $177.57.\n\nIf you bough...,aapl close today bought share aapl closing pri...
4,Buythediptrade,2021-12-31 20:51:59+00:00,1,Twitter for iPhone,If you have a fresh money which chart would yo...,fresh money chart would put money get well ret...
...,...,...,...,...,...,...
14693,apexgrowthpro,2021-01-01 20:19:48+00:00,2,Twitter Web App,"$AAL, $TSLA $AAPL $SHOP $SPY $QS $QQQ huge ord...",aal tsla aapl shop spy qqq huge order come fri...
14694,JasonSwitch7,2021-01-01 17:57:59+00:00,0,Twitter for Android,Join #Robinhood with my link and we'll both ge...,join robinhood link we'll get free stock
14695,KingYedidYahTRD,2021-01-01 15:38:28+00:00,1,Twitter for iPhone,Happy New Year !!! \n\nTrading #AAPL daily so ...,happy new year trading aapl daily get icar pro...
14696,RA_daniel_pu,2021-01-01 12:55:44+00:00,1,Twitter for iPhone,@NiemotkaMike @TraderNickyBAT Gains: #AAPL and...,gain aapl pltr loss amd sfix


In [5]:
tweets_df_2021.isnull().sum()

User               0
Date Created       0
Number of Likes    0
Source of Tweet    0
Tweet              0
dtype: int64

In [17]:
tweets_df_2021 = tweets_df_2021[['Date Created', 'Tweet']]

### **III. NLP**

In [6]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# This function is used to pass the POS tage for each word passed through clean_text function
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [11]:
# Cleaning tweets
def clean_text(text):
    # Initialization the twitter tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True) 
    # Initialization the lemmatizer
    lemmatizer = WordNetLemmatizer()  
    # Trying to avoid deleting the negative verbs as it affects the meaning of the tweets.
    stop_words = stopwords.words('english') + ["i'll","i'm", "should", "could"]
    negative_verbs = [ "shan't",'shouldn',"shouldn't",'wasn','weren','won','wouldn','aren','couldn','didn','doesn','hadn','hasn','haven','isn','ma','mightn','mustn',"mustn't",'needn',"needn't","wouldn't","won't","weren't","wasn't","couldn","not","nor","no","mightn't","isn't","haven't","hadn't","hasn't","didn't","doesn't","aren't","don't","couldn't","never"]
    stop_words =[word for word in stop_words if word not in negative_verbs ] 
    
    # Lowering tweets
    lower_tweet = text.lower() 
    # Removing hashtag and cashtag symbols
    tweet = re.sub(r"[#$]"," ",lower_tweet)
    # Removing links from tweets
    tweet = re.sub(r"https?:\/\/.*[\r\n]*"," ", tweet)
    # Translating emojies into thier descriptions
    tweet = demoji.replace_with_desc(tweet)
    # removing numerical values
    tweet = re.sub(r"[0-9]|-->","",tweet)
    # Tokenize the tweets by twitter tokenzier.
    tweet = tk.tokenize(tweet)
    # Choosing the words that don't exist in stopwords, thier lengths are more than 2 letters and then lemmatize them.
    tweet = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tweet if word not in stop_words and word not in string.punctuation and len(word)>2 and "." not in word]
    # return the tokens in one sentence 
    tweet = " ".join(tweet)
    
    return tweet

In [20]:
# Applying text cleaning and then downloading it on the current folder
tweets_df_2021['cleaned'] = tweets_df_2021["Tweet"].apply(lambda row:clean_text(row))
tweets_df_2021

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df_2021['cleaned'] = tweets_df_2021["Tweet"].apply(lambda row:clean_text(row))


Unnamed: 0,Date Created,Tweet,cleaned
0,2021-12-31 23:45:19+00:00,$AAPL was the 11th most mentioned on wallstree...,aapl mention wallstreetbets last hour via
1,2021-12-31 22:39:28+00:00,Happy new year everyone!! We will grow our kno...,happy new year everyone grow knowledge portfol...
2,2021-12-31 22:15:43+00:00,"Apple Inc price at close, 2021-12-31, is 177.5...",apple inc price close apple aapl
3,2021-12-31 21:30:00+00:00,$AAPL closed today at $177.57.\n\nIf you bough...,aapl close today bought share aapl closing pri...
4,2021-12-31 20:51:59+00:00,If you have a fresh money which chart would yo...,fresh money chart would put money get well ret...
...,...,...,...
14693,2021-01-01 20:19:48+00:00,"$AAL, $TSLA $AAPL $SHOP $SPY $QS $QQQ huge ord...",aal tsla aapl shop spy qqq huge order come fri...
14694,2021-01-01 17:57:59+00:00,Join #Robinhood with my link and we'll both ge...,join robinhood link we'll get free stock
14695,2021-01-01 15:38:28+00:00,Happy New Year !!! \n\nTrading #AAPL daily so ...,happy new year trading aapl daily get icar pro...
14696,2021-01-01 12:55:44+00:00,@NiemotkaMike @TraderNickyBAT Gains: #AAPL and...,gain aapl pltr loss amd sfix


In [21]:
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# TF
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

All the layers of TFXLMRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


('cardiffnlp/twitter-xlm-roberta-base-sentiment/tokenizer_config.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/special_tokens_map.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/sentencepiece.bpe.model',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/added_tokens.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/tokenizer.json')

In [22]:
def polarity(text):
    encoded_input = tokenizer(str(text), return_tensors='tf')
    output = model(encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)
    
    # Print labels and scores
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    l = config.id2label[ranking[0]]
    plrty = -1 if l == "negative" else 1 if l == "positive" else 0 
    s = np.round(float(scores[ranking[0]]), 4)
    return (l,plrty)

In [None]:
# downloading the file after applying sentiment analysis on the current folder
tweets2021['label'], tweets2021['Polarity'] = zip(*tweets2021['cleaned'].apply(lambda txt:polarity(txt)))
tweets2021.to_csv("polarizedTweets2021.csv",index=False)

## **SCRAPING APPLE STOCK PRICE**

### **I. IMPORT LIBRARIES**

In [None]:
!pip install yfinance

In [25]:
import yfinance as yf

### **II. SCRAPING**

In [26]:
data_apple = yf.download(tickers="AAPL", start='2021-01-01', end='2022-01-01', interval='1d')
data_apple.to_csv('apple_stock.csv')

[*********************100%***********************]  1 of 1 completed


## **COMBINE APPLE STOCK PRICE WITH TWEET SENTIMENT**

In [27]:
ptweets_2021 = pd.read_csv("polarizedTweets2021.csv")
ptweets_2021

Unnamed: 0,Date,Tweet,cleaned,label,Polarity
0,2021-12-31 23:45:19+00:00,$AAPL was the 11th most mentioned on wallstree...,aapl mention wallstreetbets last hour,neutral,0
1,2021-12-31 22:39:28+00:00,Happy new year everyone!! We will grow our kno...,happy new year everyone grow knowledge portfol...,positive,1
2,2021-12-31 22:15:43+00:00,"Apple Inc price at close, 2021-12-31, is 177.5...",apple inc price close apple aapl,neutral,0
3,2021-12-31 21:30:00+00:00,$AAPL closed today at $177.57.\n\nIf you bough...,aapl close today bought share aapl closing pri...,neutral,0
4,2021-12-31 20:51:59+00:00,If you have a fresh money which chart would yo...,fresh money chart would put money get well ret...,neutral,0
...,...,...,...,...,...
14698,2021-01-01 20:19:48+00:00,"$AAL, $TSLA $AAPL $SHOP $SPY $QS $QQQ huge ord...",aal tsla aapl shop spy qqq huge order come fri...,neutral,0
14699,2021-01-01 17:57:59+00:00,Join #Robinhood with my link and we'll both ge...,join robinhood link we'll get free stock,neutral,0
14700,2021-01-01 15:38:28+00:00,Happy New Year !!! \n\nTrading #AAPL daily so ...,happy new year trading aapl daily get icar pro...,neutral,0
14701,2021-01-01 12:55:44+00:00,@NiemotkaMike @TraderNickyBAT Gains: #AAPL and...,gain aapl pltr loss amd sfix,neutral,0


In [28]:
ptweets_2021.rename(columns={'Date Created':'Date'}, inplace=True)

In [29]:
ptweets_2021 = ptweets_2021.sort_values(by='Date').reset_index(drop=True)

In [30]:
ptweets_2021['Polarity'].value_counts()

 0    11707
-1     1664
 1     1332
Name: Polarity, dtype: int64

In [31]:
ptweets_df = ptweets_2021.loc[:,["Date","Polarity"]]
ptweets_df.head()

Unnamed: 0,Date,Polarity
0,2021-01-01 05:40:03+00:00,0
1,2021-01-01 12:55:44+00:00,0
2,2021-01-01 15:38:28+00:00,0
3,2021-01-01 17:57:59+00:00,0
4,2021-01-01 20:19:48+00:00,0


In [32]:
ptweets_df.tail()

Unnamed: 0,Date,Polarity
14698,2021-12-31 20:51:59+00:00,0
14699,2021-12-31 21:30:00+00:00,0
14700,2021-12-31 22:15:43+00:00,0
14701,2021-12-31 22:39:28+00:00,1
14702,2021-12-31 23:45:19+00:00,0


In [33]:
ptweets_df['Date'] = pd.to_datetime(ptweets_df['Date'],infer_datetime_format=True)

In [34]:
ptweets_df

Unnamed: 0,Date,Polarity
0,2021-01-01 05:40:03+00:00,0
1,2021-01-01 12:55:44+00:00,0
2,2021-01-01 15:38:28+00:00,0
3,2021-01-01 17:57:59+00:00,0
4,2021-01-01 20:19:48+00:00,0
...,...,...
14698,2021-12-31 20:51:59+00:00,0
14699,2021-12-31 21:30:00+00:00,0
14700,2021-12-31 22:15:43+00:00,0
14701,2021-12-31 22:39:28+00:00,1


In [35]:
ptweets_df['Date'] =pd.to_datetime(ptweets_df['Date'].dt.strftime("%Y-%m-%d"))

In [36]:
ptweets_df

Unnamed: 0,Date,Polarity
0,2021-01-01,0
1,2021-01-01,0
2,2021-01-01,0
3,2021-01-01,0
4,2021-01-01,0
...,...,...
14698,2021-12-31,0
14699,2021-12-31,0
14700,2021-12-31,0
14701,2021-12-31,1


In [37]:
Pol_df = pd.DataFrame(ptweets_df.groupby('Date')['Polarity'].mean())
Pol_df.rename(columns={"Polarity":"P_mean"},inplace=True)

In [38]:
Pol_df['P_sum'] = ptweets_df.groupby('Date')['Polarity'].sum()

In [39]:
Pol_df['twt_count'] = ptweets_df.groupby('Date')['Polarity'].count()

In [40]:
Pol_df

Unnamed: 0_level_0,P_mean,P_sum,twt_count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-01,0.000000,0,7
2021-01-02,0.333333,3,9
2021-01-03,0.000000,0,13
2021-01-04,0.113636,5,44
2021-01-05,0.121212,4,33
...,...,...,...
2021-12-27,0.047619,2,42
2021-12-28,0.000000,0,45
2021-12-29,0.000000,0,31
2021-12-30,0.000000,0,25


In [41]:
# Reading the apple finance data and preparing it to fit with the polarized values
apple_df = pd.read_csv('apple_stock.csv')
apple_df['Date'] = pd.to_datetime(apple_df['Date'],infer_datetime_format=True)
apple_df['Date'] = apple_df['Date'].dt.strftime("%Y-%m-%d")
apple_df['Date'] = pd.to_datetime(apple_df['Date'])
apple_df.set_index("Date")

# Adding the polarization column in the apple dataframe.
final_df = apple_df.join(Pol_df,on='Date',how="inner")
final_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
0,2021-01-04,133.520004,133.610001,126.760002,129.410004,127.874924,143301900,0.113636,5,44
1,2021-01-05,128.889999,131.740005,128.429993,131.009995,129.455963,97664900,0.121212,4,33
2,2021-01-06,127.720001,131.050003,126.379997,126.599998,125.098282,155088000,0.156863,8,51
3,2021-01-07,128.360001,131.630005,127.860001,130.919998,129.36702,109578200,0.157895,9,57
4,2021-01-08,132.429993,132.630005,130.229996,132.050003,130.483627,105158200,0.054545,3,55


In [42]:
final_df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
247,2021-12-27,177.089996,180.419998,177.070007,180.330002,179.289444,74919600,0.047619,2,42
248,2021-12-28,180.160004,181.330002,178.529999,179.289993,178.255447,79144300,0.0,0,45
249,2021-12-29,179.330002,180.630005,178.139999,179.380005,178.344925,62348900,0.0,0,31
250,2021-12-30,179.470001,180.570007,178.089996,178.199997,177.171738,59773000,0.0,0,25
251,2021-12-31,178.089996,179.229996,177.259995,177.570007,176.54538,64062300,0.052632,1,19


In [43]:
# Downloading the final CSV file that has the finance data and tweets polarizations
final_df.to_csv("Final_apple_stock_2021.csv",index=False)