# DATASET PREPARATION

## **SCRAPING TWEET SENTIMENT**

### **I. IMPORT LIBRARIES**

In [None]:
!pip install snscrape
!pip install demoji
!pip install transformers
!pip install transformers[sentencepiece]

In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import demoji
import re
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
import attr
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

### **II. SCRAPING**

In [3]:
# Creating list to append tweet data to
attributes_container = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('#AAPL since:2019-01-01 until:2020-01-01 lang:en').get_items()):
    attributes_container.append([tweet.user.username, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.content])
    
# Creating a dataframe to load the list
tweets_df_2019 = pd.DataFrame(attributes_container, columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"])

In [4]:
tweets_df_2019

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet
0,gothamtradx,2019-12-31 23:31:50+00:00,0,Twitter for iPhone,#AAPL wins for America https://t.co/Wkbh91401Q
1,aapltrdng,2019-12-31 23:14:57+00:00,6,Twitter for iPhone,HAPPY NEW YEAR FROM ALL @aapltrdng \n\n#aapl #...
2,ElliottForecast,2019-12-31 22:52:44+00:00,2,StockTwits Web,4 &amp; 1 hour charts for the Group 3 instrume...
3,AAPL_moves,2019-12-31 22:18:25+00:00,0,IFTTT,"Apple Inc price at close, 2019-12-31, is 293.6..."
4,ivan_labrie,2019-12-31 21:20:05+00:00,1,Twitter for Android,I covered shorts for now. Still in #AAPL short...
...,...,...,...,...,...
9902,Nancerelli2,2019-01-01 12:51:32+00:00,0,Twitter Web Client,#RT #ContestAlert #giveaway #contest #sweeps #...
9903,Nancerelli2,2019-01-01 12:51:16+00:00,0,Twitter Web Client,#RT #NewYearsEve #Giveaway #win #contest #Cont...
9904,craZ4apple,2019-01-01 05:56:25+00:00,0,Twitter for iPhone,"Been a tough year #Tim, hardware issues, softw..."
9905,elliottwaves,2019-01-01 01:33:59+00:00,0,StockTwits Web,Group 2 instruments 4 hour charts are updated ...


In [5]:
tweets_df_2019.isnull().sum()

User               0
Date Created       0
Number of Likes    0
Source of Tweet    0
Tweet              0
dtype: int64

### **III. NLP**

In [6]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# This function is used to pass the POS tage for each word passed through clean_text function
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [11]:
# Cleaning tweets
def clean_text(text):
    # Initialization the twitter tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True) 
    # Initialization the lemmatizer
    lemmatizer = WordNetLemmatizer()  
    # Trying to avoid deleting the negative verbs as it affects the meaning of the tweets.
    stop_words = stopwords.words('english') + ["i'll","i'm", "should", "could"]
    negative_verbs = [ "shan't",'shouldn',"shouldn't",'wasn','weren','won','wouldn','aren','couldn','didn','doesn','hadn','hasn','haven','isn','ma','mightn','mustn',"mustn't",'needn',"needn't","wouldn't","won't","weren't","wasn't","couldn","not","nor","no","mightn't","isn't","haven't","hadn't","hasn't","didn't","doesn't","aren't","don't","couldn't","never"]
    stop_words =[word for word in stop_words if word not in negative_verbs ] 
    
    # Lowering tweets
    lower_tweet = text.lower() 
    # Removing hashtag and cashtag symbols
    tweet = re.sub(r"[#$]"," ",lower_tweet)
    # Removing links from tweets
    tweet = re.sub(r"https?:\/\/.*[\r\n]*"," ", tweet)
    # Translating emojies into thier descriptions
    tweet = demoji.replace_with_desc(tweet)
    # removing numerical values
    tweet = re.sub(r"[0-9]|-->","",tweet)
    # Tokenize the tweets by twitter tokenzier.
    tweet = tk.tokenize(tweet)
    # Choosing the words that don't exist in stopwords, thier lengths are more than 2 letters and then lemmatize them.
    tweet = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tweet if word not in stop_words and word not in string.punctuation and len(word)>2 and "." not in word]
    # return the tokens in one sentence 
    tweet = " ".join(tweet)
    
    return tweet

In [12]:
# Applying text cleaning and then downloading it on the current folder
tweets_df_2019['cleaned'] = tweets_df_2019["Tweet"].apply(lambda row:clean_text(row))
tweets_df_2019.to_csv("CleanedNTweets2019.csv",index=False)

In [13]:
# Read the cleanedtweets file
tweets2019 = pd.read_csv("CleanedNTweets2019.csv")
tweets2019.head()

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet,cleaned
0,gothamtradx,2019-12-31 23:31:50+00:00,0,Twitter for iPhone,#AAPL wins for America https://t.co/Wkbh91401Q,aapl win america
1,aapltrdng,2019-12-31 23:14:57+00:00,6,Twitter for iPhone,HAPPY NEW YEAR FROM ALL @aapltrdng \n\n#aapl #...,happy new year aapl aapltrdng newyear newyear
2,ElliottForecast,2019-12-31 22:52:44+00:00,2,StockTwits Web,4 &amp; 1 hour charts for the Group 3 instrume...,hour chart group instrument available member view
3,AAPL_moves,2019-12-31 22:18:25+00:00,0,IFTTT,"Apple Inc price at close, 2019-12-31, is 293.6...",apple inc price close apple aapl
4,ivan_labrie,2019-12-31 21:20:05+00:00,1,Twitter for Android,I covered shorts for now. Still in #AAPL short...,cover short still aapl short hold long tsla ex...


In [14]:
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# TF
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

All the layers of TFXLMRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


('cardiffnlp/twitter-xlm-roberta-base-sentiment/tokenizer_config.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/special_tokens_map.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/sentencepiece.bpe.model',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/added_tokens.json',
 'cardiffnlp/twitter-xlm-roberta-base-sentiment/tokenizer.json')

In [15]:
def polarity(text):
    encoded_input = tokenizer(str(text), return_tensors='tf')
    output = model(encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)
    
    # Print labels and scores
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    l = config.id2label[ranking[0]]
    plrty = -1 if l == "negative" else 1 if l == "positive" else 0 
    s = np.round(float(scores[ranking[0]]), 4)
    return (l,plrty)

In [16]:
# downloading the file after applying sentiment analysis on the current folder
tweets2019['label'], tweets2019['Polarity'] = zip(*tweets2019['cleaned'].apply(lambda txt:polarity(txt)))
tweets2019.to_csv("polarizedTweets2019.csv",index=False)

## **SCRAPING APPLE STOCK PRICE**

### **I. IMPORT LIBRARIES**

In [None]:
!pip install yfinance

In [19]:
import yfinance as yf

### **II. SCRAPING**

In [20]:
data_apple = yf.download(tickers="AAPL", start='2019-01-01', end='2020-01-01', interval='1d')
data_apple.to_csv('apple_stock.csv')

[*********************100%***********************]  1 of 1 completed


## **COMBINE APPLE STOCK PRICE WITH TWEET SENTIMENT**

In [31]:
ptweets_2019 = pd.read_csv("polarizedTweets2019.csv")
ptweets_2019

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweet,cleaned,label,Polarity
0,gothamtradx,2019-12-31 23:31:50+00:00,0,Twitter for iPhone,#AAPL wins for America https://t.co/Wkbh91401Q,aapl win america,positive,1
1,aapltrdng,2019-12-31 23:14:57+00:00,6,Twitter for iPhone,HAPPY NEW YEAR FROM ALL @aapltrdng \n\n#aapl #...,happy new year aapl aapltrdng newyear newyear,positive,1
2,ElliottForecast,2019-12-31 22:52:44+00:00,2,StockTwits Web,4 &amp; 1 hour charts for the Group 3 instrume...,hour chart group instrument available member view,neutral,0
3,AAPL_moves,2019-12-31 22:18:25+00:00,0,IFTTT,"Apple Inc price at close, 2019-12-31, is 293.6...",apple inc price close apple aapl,neutral,0
4,ivan_labrie,2019-12-31 21:20:05+00:00,1,Twitter for Android,I covered shorts for now. Still in #AAPL short...,cover short still aapl short hold long tsla ex...,neutral,0
...,...,...,...,...,...,...,...,...
9902,Nancerelli2,2019-01-01 12:51:32+00:00,0,Twitter Web Client,#RT #ContestAlert #giveaway #contest #sweeps #...,contestalert giveaway contest sweep win apple ...,neutral,0
9903,Nancerelli2,2019-01-01 12:51:16+00:00,0,Twitter Web Client,#RT #NewYearsEve #Giveaway #win #contest #Cont...,newyearseve giveaway win contest contestalert ...,neutral,0
9904,craZ4apple,2019-01-01 05:56:25+00:00,0,Twitter for iPhone,"Been a tough year #Tim, hardware issues, softw...",tough year tim hardware issue software issue p...,negative,-1
9905,elliottwaves,2019-01-01 01:33:59+00:00,0,StockTwits Web,Group 2 instruments 4 hour charts are updated ...,group instrument hour chart update member view...,neutral,0


In [32]:
ptweets_2019.rename(columns={'Date Created':'Date'}, inplace=True)

In [33]:
ptweets_2019 = ptweets_2019[['Date', 'Tweet', 'cleaned','label', 'Polarity']]

In [34]:
ptweets_2019 = ptweets_2019.sort_values(by='Date').reset_index(drop=True)

In [35]:
ptweets_2019['Polarity'].value_counts()

 0    7912
-1    1338
 1     657
Name: Polarity, dtype: int64

In [36]:
ptweets_df = ptweets_2019.loc[:,["Date","Polarity"]]
ptweets_df.head()

Unnamed: 0,Date,Polarity
0,2019-01-01 01:12:28+00:00,0
1,2019-01-01 01:33:59+00:00,0
2,2019-01-01 05:56:25+00:00,-1
3,2019-01-01 12:51:16+00:00,0
4,2019-01-01 12:51:32+00:00,0


In [37]:
ptweets_df.tail()

Unnamed: 0,Date,Polarity
9902,2019-12-31 21:20:05+00:00,0
9903,2019-12-31 22:18:25+00:00,0
9904,2019-12-31 22:52:44+00:00,0
9905,2019-12-31 23:14:57+00:00,1
9906,2019-12-31 23:31:50+00:00,1


In [38]:
ptweets_df['Date'] = pd.to_datetime(ptweets_df['Date'],infer_datetime_format=True)

In [39]:
ptweets_df

Unnamed: 0,Date,Polarity
0,2019-01-01 01:12:28+00:00,0
1,2019-01-01 01:33:59+00:00,0
2,2019-01-01 05:56:25+00:00,-1
3,2019-01-01 12:51:16+00:00,0
4,2019-01-01 12:51:32+00:00,0
...,...,...
9902,2019-12-31 21:20:05+00:00,0
9903,2019-12-31 22:18:25+00:00,0
9904,2019-12-31 22:52:44+00:00,0
9905,2019-12-31 23:14:57+00:00,1


In [40]:
ptweets_df['Date'] =pd.to_datetime(ptweets_df['Date'].dt.strftime("%Y-%m-%d"))

In [41]:
ptweets_df

Unnamed: 0,Date,Polarity
0,2019-01-01,0
1,2019-01-01,0
2,2019-01-01,-1
3,2019-01-01,0
4,2019-01-01,0
...,...,...
9902,2019-12-31,0
9903,2019-12-31,0
9904,2019-12-31,0
9905,2019-12-31,1


In [42]:
Pol_df = pd.DataFrame(ptweets_df.groupby('Date')['Polarity'].mean())
Pol_df.rename(columns={"Polarity":"P_mean"},inplace=True)

In [43]:
Pol_df['P_sum'] = ptweets_df.groupby('Date')['Polarity'].sum()

In [44]:
Pol_df['twt_count'] = ptweets_df.groupby('Date')['Polarity'].count()

In [45]:
Pol_df

Unnamed: 0_level_0,P_mean,P_sum,twt_count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,-0.176471,-3,17
2019-01-02,-0.200000,-21,105
2019-01-03,-0.218354,-69,316
2019-01-04,-0.153846,-22,143
2019-01-05,-0.194444,-7,36
...,...,...,...
2019-12-27,-0.103448,-3,29
2019-12-28,0.000000,0,6
2019-12-29,0.000000,0,12
2019-12-30,-0.115385,-3,26


In [46]:
# Reading the apple finance data and preparing it to fit with the polarized values
apple_df = pd.read_csv('apple_stock.csv')
apple_df['Date'] = pd.to_datetime(apple_df['Date'],infer_datetime_format=True)
apple_df['Date'] = apple_df['Date'].dt.strftime("%Y-%m-%d")
apple_df['Date'] = pd.to_datetime(apple_df['Date'])
apple_df.set_index("Date")

# Adding the polarization column in the apple dataframe.
final_df = apple_df.join(Pol_df,on='Date',how="inner")
final_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
0,2019-01-02,38.7225,39.712502,38.557499,39.48,38.105133,148158800,-0.2,-21,105
1,2019-01-03,35.994999,36.43,35.5,35.547501,34.309586,365248800,-0.218354,-69,316
2,2019-01-04,36.1325,37.137501,35.950001,37.064999,35.774235,234428400,-0.153846,-22,143
3,2019-01-07,37.174999,37.2075,36.474998,36.982498,35.694611,219111200,-0.076923,-6,78
4,2019-01-08,37.389999,37.955002,37.130001,37.6875,36.375057,164101200,-0.085714,-6,70


In [47]:
final_df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,P_mean,P_sum,twt_count
247,2019-12-24,71.172501,71.222504,70.730003,71.067497,69.623215,48478800,0.055556,1,18
248,2019-12-26,71.205002,72.495003,71.175003,72.477501,71.004578,93121200,-0.083333,-2,24
249,2019-12-27,72.779999,73.4925,72.029999,72.449997,70.977623,146266000,-0.103448,-3,29
250,2019-12-30,72.364998,73.172501,71.305,72.879997,71.398895,144114400,-0.115385,-3,26
251,2019-12-31,72.482498,73.419998,72.379997,73.412498,71.920578,100805600,-0.108108,-4,37


In [48]:
# Downloading the final CSV file that has the finance data and tweets polarizations
final_df.to_csv("Final_apple_stock_2019.csv",index=False)