In [3]:
# Import packages needed for text cleaning
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [4]:
# Load data 
tweets = pd.read_csv("tweets_formated.csv",
            lineterminator='\n')

news = pd.read_csv("news_formated.csv",
            lineterminator='\n')

In [5]:
# If we check the two dataframes, we see that 'news' contains more companies
# Example: AMAZ = AMZN, APPL = AAPL
tweets.groupby(['company'])['company'].count() # 10 companies
news.groupby(['company'])['company'].count() # 18 companies -> using wrong tic

company
AAPL     3157
ADBE     1804
ALPH      165
AMAZ      315
AMD      2448
AMZN     3190
APPL      316
ASML      928
AVGO     1776
AZN      1985
BROA       28
CMCSA    2092
COST     2306
CSCO     1951
FACE      306
GOOG     3077
META     1983
MICR      301
MSFT     3093
NFLX     2613
NVDA     2757
NVID      101
PACC        4
PCAR      377
PEP      1877
TESL      309
TMUS     1529
TSLA     3140
TXN      1287
Name: company, dtype: int64

In [6]:
# Check if the news articles are indeed about Amazon
news.loc[news['company'] == 'AMAZ']
news.loc[news['company'] == 'ALPH']
news.loc[news['company'] == 'APPL']
news.loc[news['company'] == 'BROA']
news.loc[news['company'] == 'FACE']
news.loc[news['company'] == 'MICR']
news.loc[news['company'] == 'NVID']
news.loc[news['company'] == 'PACC']
news.loc[news['company'] == 'TESL']

Unnamed: 0,company,date,content
1101,TESL,2022-12-01,"Tesla To Fix Rear Lighting Issue on Over 435,0..."
1102,TESL,2022-12-01,Musk set to finally take wraps off Tesla truck...
1103,TESL,2022-12-01,Here are the biggest calls on Wall Street on
1104,TESL,2022-12-01,Morgan Stanley bullish on Tesla heading in to ...
1105,TESL,2022-12-01,"Tesla to recall 435,000 China-made Model 3, Mo..."
...,...,...,...
1405,TESL,2023-03-01,U.S. NTSB cites speeding in fatal Tesla 2021 F...
1406,TESL,2023-02-01,Tesla said to boost output at Shanghai plant o...
1407,TESL,2023-02-01,Tesla directors to testify in 'funding secured...
1408,TESL,2023-02-01,In meetings with Saudi Arabia’s Public Investm...


In [7]:
# Rename to correct company names
news = news.replace("AMAZ", "AMZN")
news = news.loc[news["company"] != "ALPH"]
news = news.replace("APPL", "AAPL")
news = news.replace("BROA", "AVGO")
news = news.replace("FACE", "META")
news = news.replace("MICR", "MSFT")
news = news.replace("NVID", "NVDA")
news = news.replace("PACC", "PCAR")
news = news.replace("TESL", "TSLA")

tweets['company'] = tweets['company'].str[1:]

In [10]:
# Print ticker, company name, and number of tweets and news for Table
print(tweets.groupby(['company'])['company'].count())
print(news.groupby(['company'])['company'].count())

company
AAPL     2424
ADBE     2424
AMD      2424
AMZN     2424
ASML     2396
AVGO     2418
AZN      2420
CMCSA    2424
COST     2424
CSCO     2424
GOOG     2424
META     2064
MSFT     2424
NFLX     2424
NVDA     2424
PCAR     2155
PEP      2424
TMUS     2412
TSLA     2424
TXN      2418
Name: company, dtype: int64
company
AAPL     3473
ADBE     1804
AMD      2448
AMZN     3505
ASML      928
AVGO     1804
AZN      1985
CMCSA    2092
COST     2306
CSCO     1951
GOOG     3077
META     2289
MSFT     3394
NFLX     2613
NVDA     2858
PCAR      381
PEP      1877
TMUS     1529
TSLA     3449
TXN      1287
Name: company, dtype: int64


In [33]:
# Function to clean text, more specifically tweets
def clean_text(text):
    if type(text) == np.float:
        return ""
    temp = text.lower()
    temp = re.sub("'", "", temp) # To avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp) # Removes @user
    temp = re.sub(r'http\S+', '', temp) # Removes http links
    temp = re.sub('[()!?]', ' ', temp) 
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9]"," ", temp)
    temp = temp.split() 
    text = text.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    temp = " ".join(word for word in temp)
    return temp

tweets['content'] = tweets['content'].map(lambda x: clean_text(x))
news['content'] = news['content'].map(lambda x: clean_text(x))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if type(text) == np.float:


In [34]:
# Tokenize data
def tokenization(text):
    text = re.split('\W+', text)
    return text

tweets['tokenized'] = tweets['content'].apply(lambda x: tokenization(x.lower()))
news['tokenized'] = news['content'].apply(lambda x: tokenization(x.lower()))

In [37]:
# Removes stopwords
#nltk.download()
from nltk.corpus import stopwords
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text
    
tweets['nonstop'] = tweets['tokenized'].apply(lambda x: remove_stopwords(x))
news['nonstop'] = news['tokenized'].apply(lambda x: remove_stopwords(x))

In [38]:
# Stemming the text
ps = nltk.PorterStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

tweets['stemmed'] = tweets['nonstop'].apply(lambda x: stemming(x))
news['stemmed'] = news['nonstop'].apply(lambda x: stemming(x))

In [41]:
# Loughram & McDonald sentiment analysis
import pysentiment2 as ps

lm = ps.LM()
tweets['polarity'] = tweets['stemmed'].apply(lm.get_score)
news['polarity'] = news['stemmed'].apply(lm.get_score)

In [42]:
# Change data structure
tweets = pd.concat(
    [tweets.drop(['polarity'], axis=1), 
     tweets['polarity'].apply(pd.Series)], axis=1)

news = pd.concat(
    [news.drop(['polarity'], axis=1), 
     news['polarity'].apply(pd.Series)], axis=1)

In [43]:
# Create new variable with sentiment "neutral," "positive" and "negative"
tweets['sentiment'] = tweets['Polarity'].apply(lambda x: 'positive' if x > 0 else 'neutral' if x == 0 else 'negative')
news['sentiment'] = news['Polarity'].apply(lambda x: 'positive' if x > 0 else 'neutral' if x == 0 else 'negative')

In [52]:
# Calculate sentiment mean for each day
tweets_polarity = tweets.groupby(["company", "date"])["Polarity"].mean()
news_ = news.groupby(["company", "date"])["Polarity"].mean()

In [53]:
# Change data stracture back 
tweets_sent = pd.DataFrame(tweets_polarity)
news_sent = pd.DataFrame(news_polarity)

tweets_sent = tweets_sent.reset_index()
news_sent = news_sent.reset_index()

In [54]:
# Save as CSV
tweets_sent.to_csv("tweets_sent.csv", index = False)
news_sent.to_csv("news_sent.csv", index = False)