# DATASET PREPARATION

## **SCRAPING TWEET SENTIMENT**

### **I. IMPORT LIBRARIES**

In [None]:
!pip install snscrape
!pip install demoji
!pip install transformers
!pip install transformers[sentencepiece]

In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import demoji
import re
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
import attr
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

### **II. SCRAPING**

In [3]:
# Creating list to append tweet data to
attributes_container = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('#AAPL since:2020-01-01 until:2021-01-01 lang:en').get_items()):
    attributes_container.append([tweet.user.username, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.content])
    
# Creating a dataframe to load the list
tweets_df_2020 = pd.DataFrame(attributes_container, columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"])

In [4]:
tweets_df_2020

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet
0,aapltrdng,2020-12-31 23:24:42+00:00,5,Twitter for iPhone,"LETS BEGIN, BE BOLD AND VENTURE TO BE WISE,\nH..."
1,XtradesOfficial,2020-12-31 21:07:15+00:00,0,Xtrades,"Bad trading month of December, New Years resol..."
2,KDRod24,2020-12-31 20:54:16+00:00,0,Twitter for iPhone,"$AAPL If it breaks above 133.70, it’s on! 📈🚀🚀🚀..."
3,dailytradingapp,2020-12-31 19:57:34+00:00,0,dlvr.it,#AAPL #ACB Benzinga's 2020 Year In Review: Cor...
4,elliottwaves,2020-12-31 19:24:06+00:00,0,Twitter Web App,#Free Trial available at https://t.co/x2kmaKA6...
...,...,...,...,...,...
14412,jddwriter,2020-01-01 01:30:11+00:00,0,Hootsuite Inc.,Made one big mistake Day Trading Apple (AAPL) ...
14413,elliottwaves,2020-01-01 01:26:34+00:00,1,StockTwits Web,4 &amp; 1 hour charts for all 78 instruments a...
14414,elliottwaves,2020-01-01 01:24:41+00:00,0,StockTwits Web,4 &amp; 1 hour charts for the Group 3 instrume...
14415,iotafmarkets,2020-01-01 00:56:57+00:00,1,Twitter for iPhone,Top 10 trades of the decade: Number 4: Going l...


In [5]:
tweets_df_2020.isnull().sum()

User               0
Date Created       0
Number of Likes    0
Source of Tweet    0
Tweet              0
dtype: int64

### **III. NLP**

In [6]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# This function is used to pass the POS tage for each word passed through clean_text function
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [11]:
# Cleaning tweets
def clean_text(text):
    # Initialization the twitter tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True) 
    # Initialization the lemmatizer
    lemmatizer = WordNetLemmatizer()  
    # Trying to avoid deleting the negative verbs as it affects the meaning of the tweets.
    stop_words = stopwords.words('english') + ["i'll","i'm", "should", "could"]
    negative_verbs = [ "shan't",'shouldn',"shouldn't",'wasn','weren','won','wouldn','aren','couldn','didn','doesn','hadn','hasn','haven','isn','ma','mightn','mustn',"mustn't",'needn',"needn't","wouldn't","won't","weren't","wasn't","couldn","not","nor","no","mightn't","isn't","haven't","hadn't","hasn't","didn't","doesn't","aren't","don't","couldn't","never"]
    stop_words =[word for word in stop_words if word not in negative_verbs ] 
    
    # Lowering tweets
    lower_tweet = text.lower() 
    # Removing hashtag and cashtag symbols
    tweet = re.sub(r"[#$]"," ",lower_tweet)
    # Removing links from tweets
    tweet = re.sub(r"https?:\/\/.*[\r\n]*"," ", tweet)
    # Translating emojies into thier descriptions
    tweet = demoji.replace_with_desc(tweet)
    # removing numerical values
    tweet = re.sub(r"[0-9]|-->","",tweet)
    # Tokenize the tweets by twitter tokenzier.
    tweet = tk.tokenize(tweet)
    # Choosing the words that don't exist in stopwords, thier lengths are more than 2 letters and then lemmatize them.
    tweet = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tweet if word not in stop_words and word not in string.punctuation and len(word)>2 and "." not in word]
    # return the tokens in one sentence 
    tweet = " ".join(tweet)
    
    return tweet

In [12]:
# Applying text cleaning and then downloading it on the current folder
tweets_df_2020['cleaned'] = tweets_df_2020["Tweet"].apply(lambda row:clean_text(row))
tweets_df_2020.to_csv("CleanedNTweets2020.csv",index=False)

In [13]:
# Read the cleanedtweets file
tweets2020 = pd.read_csv("CleanedNTweets2020.csv")
tweets2020.head()

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet,cleaned
0,aapltrdng,2020-12-31 23:24:42+00:00,5,Twitter for iPhone,"LETS BEGIN, BE BOLD AND VENTURE TO BE WISE,\nH...",let begin bold venture wise happy new year red...
1,XtradesOfficial,2020-12-31 21:07:15+00:00,0,Xtrades,"Bad trading month of December, New Years resol...",bad trading month december new year resolution...
2,KDRod24,2020-12-31 20:54:16+00:00,0,Twitter for iPhone,"$AAPL If it breaks above 133.70, it’s on! 📈🚀🚀🚀...",aapl break chart increase rocket rocket rocket...
3,dailytradingapp,2020-12-31 19:57:34+00:00,0,dlvr.it,#AAPL #ACB Benzinga's 2020 Year In Review: Cor...,aapl acb benzinga's year review coronavirus ma...
4,elliottwaves,2020-12-31 19:24:06+00:00,0,Twitter Web App,#Free Trial available at https://t.co/x2kmaKA6...,free trial available


In [14]:
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# TF
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

All the layers of TFXLMRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


('cardiffnlp/twitter-xlm-roberta-base-sentiment/tokenizer_config.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/special_tokens_map.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/sentencepiece.bpe.model',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/added_tokens.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/tokenizer.json')

In [15]:
def polarity(text):
    encoded_input = tokenizer(str(text), return_tensors='tf')
    output = model(encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)
    
    # Print labels and scores
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    l = config.id2label[ranking[0]]
    plrty = -1 if l == "negative" else 1 if l == "positive" else 0 
    s = np.round(float(scores[ranking[0]]), 4)
    return (l,plrty)

In [16]:
# downloading the file after applying sentiment analysis on the current folder
tweets2020['label'], tweets2020['Polarity'] = zip(*tweets2020['cleaned'].apply(lambda txt:polarity(txt)))
tweets2020.to_csv("polarizedTweets2020.csv",index=False)

## **SCRAPING APPLE STOCK PRICE**

### **I. IMPORT LIBRARIES**

In [None]:
!pip install yfinance

In [4]:
import yfinance as yf

### **II. SCRAPING**

In [5]:
data_apple = yf.download(tickers="AAPL", start='2020-01-01', end='2021-01-01', interval='1d')
data_apple.to_csv('apple_stock.csv')

[*********************100%***********************]  1 of 1 completed


## **COMBINE APPLE STOCK PRICE WITH TWEET SENTIMENT**

In [6]:
ptweets_2020 = pd.read_csv("polarizedTweets2020.csv")
ptweets_2020

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet,cleaned,label,Polarity
0,aapltrdng,2020-12-31 23:24:42+00:00,5,Twitter for iPhone,"LETS BEGIN, BE BOLD AND VENTURE TO BE WISE,\nH...",let begin bold venture wise happy new year red...,positive,1
1,XtradesOfficial,2020-12-31 21:07:15+00:00,0,Xtrades,"Bad trading month of December, New Years resol...",bad trading month december new year resolution...,negative,-1
2,KDRod24,2020-12-31 20:54:16+00:00,0,Twitter for iPhone,"$AAPL If it breaks above 133.70, it’s on! 📈🚀🚀🚀...",aapl break chart increase rocket rocket rocket...,neutral,0
3,dailytradingapp,2020-12-31 19:57:34+00:00,0,dlvr.it,#AAPL #ACB Benzinga's 2020 Year In Review: Cor...,aapl acb benzinga's year review coronavirus ma...,neutral,0
4,elliottwaves,2020-12-31 19:24:06+00:00,0,Twitter Web App,#Free Trial available at https://t.co/x2kmaKA6...,free trial available,neutral,0
...,...,...,...,...,...,...,...,...
14412,jddwriter,2020-01-01 01:30:11+00:00,0,Hootsuite Inc.,Made one big mistake Day Trading Apple (AAPL) ...,make one big mistake day trading apple aapl mo...,negative,-1
14413,elliottwaves,2020-01-01 01:26:34+00:00,1,StockTwits Web,4 &amp; 1 hour charts for all 78 instruments a...,hour chart instrument available member view,neutral,0
14414,elliottwaves,2020-01-01 01:24:41+00:00,0,StockTwits Web,4 &amp; 1 hour charts for the Group 3 instrume...,hour chart group instrument available member view,neutral,0
14415,iotafmarkets,2020-01-01 00:56:57+00:00,1,Twitter for iPhone,Top 10 trades of the decade: Number 4: Going l...,top trade decade number go long faang stock be...,neutral,0


In [7]:
ptweets_2020.rename(columns={'Date Created':'Date'}, inplace=True)

In [8]:
ptweets_2020 = ptweets_2020[['Date', 'Tweet', 'cleaned','label', 'Polarity']]

In [9]:
ptweets_2020 = ptweets_2020.sort_values(by='Date').reset_index(drop=True)

In [10]:
ptweets_2020['Polarity'].value_counts()

 0    11042
-1     1742
 1     1633
Name: Polarity, dtype: int64

In [11]:
ptweets_df = ptweets_2020.loc[:,["Date","Polarity"]]
ptweets_df.head()

Unnamed: 0,Date,Polarity
0,2020-01-01 00:24:13+00:00,0
1,2020-01-01 00:56:57+00:00,0
2,2020-01-01 01:24:41+00:00,0
3,2020-01-01 01:26:34+00:00,0
4,2020-01-01 01:30:11+00:00,-1


In [12]:
ptweets_df.tail()

Unnamed: 0,Date,Polarity
14412,2020-12-31 19:24:06+00:00,0
14413,2020-12-31 19:57:34+00:00,0
14414,2020-12-31 20:54:16+00:00,0
14415,2020-12-31 21:07:15+00:00,-1
14416,2020-12-31 23:24:42+00:00,1


In [13]:
ptweets_df['Date'] = pd.to_datetime(ptweets_df['Date'],infer_datetime_format=True)

In [14]:
ptweets_df

Unnamed: 0,Date,Polarity
0,2020-01-01 00:24:13+00:00,0
1,2020-01-01 00:56:57+00:00,0
2,2020-01-01 01:24:41+00:00,0
3,2020-01-01 01:26:34+00:00,0
4,2020-01-01 01:30:11+00:00,-1
...,...,...
14412,2020-12-31 19:24:06+00:00,0
14413,2020-12-31 19:57:34+00:00,0
14414,2020-12-31 20:54:16+00:00,0
14415,2020-12-31 21:07:15+00:00,-1


In [15]:
ptweets_df['Date'] =pd.to_datetime(ptweets_df['Date'].dt.strftime("%Y-%m-%d"))

In [16]:
ptweets_df

Unnamed: 0,Date,Polarity
0,2020-01-01,0
1,2020-01-01,0
2,2020-01-01,0
3,2020-01-01,0
4,2020-01-01,-1
...,...,...
14412,2020-12-31,0
14413,2020-12-31,0
14414,2020-12-31,0
14415,2020-12-31,-1


In [17]:
Pol_df = pd.DataFrame(ptweets_df.groupby('Date')['Polarity'].mean())
Pol_df.rename(columns={"Polarity":"P_mean"},inplace=True)

In [18]:
Pol_df['P_sum'] = ptweets_df.groupby('Date')['Polarity'].sum()

In [19]:
Pol_df['twt_count'] = ptweets_df.groupby('Date')['Polarity'].count()

In [20]:
Pol_df

Unnamed: 0_level_0,P_mean,P_sum,twt_count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-01,0.083333,1,12
2020-01-02,-0.046154,-3,65
2020-01-03,-0.173913,-4,23
2020-01-04,0.000000,0,10
2020-01-05,-0.285714,-2,7
...,...,...,...
2020-12-27,-0.187500,-3,16
2020-12-28,0.088235,3,34
2020-12-29,0.204545,9,44
2020-12-30,0.205882,7,34


In [21]:
Pol_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 366 entries, 2020-01-01 to 2020-12-31
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   P_mean     366 non-null    float64
 1   P_sum      366 non-null    int64  
 2   twt_count  366 non-null    int64  
dtypes: float64(1), int64(2)
memory usage: 11.4 KB


In [22]:
# Reading the apple finance data and preparing it to fit with the polarized values
apple_df = pd.read_csv('apple_stock.csv')
apple_df['Date'] = pd.to_datetime(apple_df['Date'],infer_datetime_format=True)
apple_df['Date'] = apple_df['Date'].dt.strftime("%Y-%m-%d")
apple_df['Date'] = pd.to_datetime(apple_df['Date'])
apple_df.set_index("Date")

# Adding the polarization column in the apple dataframe.
final_df = apple_df.join(Pol_df,on='Date',how="inner")
final_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
0,2020-01-02,74.059998,75.150002,73.797501,75.087502,73.561531,135480400,-0.046154,-3,65
1,2020-01-03,74.287498,75.144997,74.125,74.357498,72.846359,146322800,-0.173913,-4,23
2,2020-01-06,73.447502,74.989998,73.1875,74.949997,73.426834,118387200,-0.225806,-7,31
3,2020-01-07,74.959999,75.224998,74.370003,74.597504,73.081505,108872000,-0.029412,-1,34
4,2020-01-08,74.290001,76.110001,74.290001,75.797501,74.257103,132079200,0.0,0,31


In [23]:
final_df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
248,2020-12-24,131.320007,133.460007,131.100006,131.970001,130.404602,54930100,-0.032258,-1,31
249,2020-12-28,133.990005,137.339996,133.509995,136.690002,135.068588,124486200,0.088235,3,34
250,2020-12-29,138.050003,138.789993,134.339996,134.869995,133.270172,121047300,0.204545,9,44
251,2020-12-30,135.580002,135.990005,133.399994,133.720001,132.13385,96452100,0.205882,7,34
252,2020-12-31,134.080002,134.740005,131.720001,132.690002,131.116028,99116600,0.0,0,33


In [24]:
# Downloading the final CSV file that has the finance data and tweets polarizations
final_df.to_csv("Final_apple_stock_2020.csv",index=False)