In [2]:
import pandas as pd
from wordcloud import WordCloud,STOPWORDS
import spacy as sp
nlps = sp.load("en_core_web_sm")

In [16]:
# Load the data
dat = pd.read_csv("cleaned_data.csv")
dat.shape

(65666, 14)

In [17]:
dat.head()

Unnamed: 0.1,Unnamed: 0,TWEET,STOCK,DATE,LAST_PRICE,1_DAY_RETURN,2_DAY_RETURN,3_DAY_RETURN,7_DAY_RETURN,PX_VOLUME,VOLATILITY_10D,VOLATILITY_30D,LSTM_POLARITY,MENTION
0,0,RT @robertoglezcano: @amazon #Patents Show Fl...,Amazon,31/01/2017,823.48,0.008379,0.014924,0.014924,-0.001263,3137196.0,13.447,16.992,1.0,@amazon
1,1,@FAME95FM1 Jamaicans make money with @Payoneer...,PayPal,31/01/2017,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099,-1.0,@PayPal
2,2,@CBSi Jamaicans make money with @Payoneer @Pay...,PayPal,31/01/2017,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099,1.0,@PayPal
3,3,@Hitz92fm Jamaicans make money with @Payoneer ...,PayPal,31/01/2017,39.78,0.002011,0.012318,0.012318,0.054801,9100057.0,18.769,16.099,-1.0,@PayPal
4,4,RT @loadsofvans: Retweet this post &amp; follo...,Amazon,31/01/2017,823.48,0.008379,0.014924,0.014924,-0.001263,3137196.0,13.447,16.992,-1.0,@amazon


In [18]:
# For better illustration, change the name of column "LSTM_POLARITY" to "sentiments"
dat = dat.rename(columns={'LSTM_POLARITY': 'SENTIMENTS'})

**Create subsets of chosen companies: Amazon, Starbucks, Nike**

In [19]:
df_amazon = dat.loc[dat['STOCK'] == 'Amazon'].copy()
df_amazon = df_amazon[['TWEET', 'DATE', 'LAST_PRICE', 'VOLATILITY_10D', 'SENTIMENTS']]
df_amazon.shape

(4471, 5)

In [20]:
df_star = dat.loc[dat['STOCK'] == 'Starbucks'].copy()
df_star = df_star[['TWEET', 'DATE', 'LAST_PRICE', 'VOLATILITY_10D', 'SENTIMENTS']]
df_star.shape

(1413, 5)

In [21]:
df_nike = dat.loc[dat['STOCK'] == 'Nike'].copy()
df_nike = df_nike[['TWEET', 'DATE', 'LAST_PRICE', 'VOLATILITY_10D', 'SENTIMENTS']]
df_nike.shape

(7756, 5)

### Extract features from Tweets

In [None]:
!pip install transformers

In [32]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

**Create necessary functions**

In [39]:
# ESG features from FINBERT
def esg(df):
  finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
  tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
  nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
  esg_results = nlp(df["TWEET"].to_list())

  return esg_results

In [46]:
# FLS relavent fetures from FINBERT
def fls(df):
  finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)
  tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
  nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
  fls_results = nlp(df["TWEET"].to_list())

  return fls_results

In [52]:
def add_feature(df):
  # the length of tweets
  df['# Of Words'] = df["TWEET"].apply(lambda x: len(x.split(' ')))

  # The number of stopwords in tweets
  df['# Of StopWords'] = df["TWEET"].apply(lambda x: len([word for word in x.split(' ') if word in list(STOPWORDS)]))

  # The number of currencies mentioned in tweets
  df['# Of Times Currency Was Mentioned'] = df["TWEET"].apply(lambda x: len([tok for tok in nlps(x).ents if tok.label_ == 'MONEY' ]))

  # The number of organizations mentioned in tweets
  df['# Of Organizations Mentioned'] = df["TWEET"].apply(lambda x: len([tok for tok in nlps(x).ents if tok.label_ == 'ORG' ]))

  # The number of tweets per day
  num_tweets = df['DATE'].value_counts()
  num_tweets = pd.DataFrame(num_tweets)
  num_tweets.reset_index(inplace = True)
  num_tweets.rename(columns = {'index':'DATE', 'DATE': 'Tweet_num'}, inplace = True)
  df = pd.merge(df, num_tweets, on='DATE', how='left')

  return df

**Amazon**

In [44]:
results = esg(df_amazon)
results

[{'label': 'Social', 'score': 0.8539798259735107},
 {'label': 'Social', 'score': 0.886387288570404},
 {'label': 'Social', 'score': 0.7622109651565552},
 {'label': 'Social', 'score': 0.9324904084205627},
 {'label': 'None', 'score': 0.7722222208976746},
 {'label': 'None', 'score': 0.5718774795532227},
 {'label': 'Environmental', 'score': 0.48079708218574524},
 {'label': 'None', 'score': 0.724492073059082},
 {'label': 'Social', 'score': 0.558263897895813},
 {'label': 'None', 'score': 0.7258944511413574},
 {'label': 'Social', 'score': 0.5385288596153259},
 {'label': 'None', 'score': 0.7280553579330444},
 {'label': 'Social', 'score': 0.886387288570404},
 {'label': 'Social', 'score': 0.9296273589134216},
 {'label': 'Social', 'score': 0.8097642064094543},
 {'label': 'None', 'score': 0.6894599795341492},
 {'label': 'Social', 'score': 0.9034162163734436},
 {'label': 'Social', 'score': 0.6497367024421692},
 {'label': 'Environmental', 'score': 0.5630893111228943},
 {'label': 'None', 'score': 0.59

In [43]:
df_amazon['ESG'] = results
df_amazon['ESG'] = df_amazon['ESG'].apply(lambda x: x['label'])
label_map = {'None': 0, "Environmental": 1, "Social": 2, "Governance": 3}
df_amazon["ESG"] = df_amazon["ESG"].map(label_map)

In [48]:
results2 = fls(df_amazon)
results2

[{'label': 'Not FLS', 'score': 0.9523255228996277},
 {'label': 'Not FLS', 'score': 0.9160388112068176},
 {'label': 'Not FLS', 'score': 0.9457177519798279},
 {'label': 'Not FLS', 'score': 0.9626376032829285},
 {'label': 'Not FLS', 'score': 0.9386681914329529},
 {'label': 'Not FLS', 'score': 0.9368821978569031},
 {'label': 'Not FLS', 'score': 0.9183132648468018},
 {'label': 'Not FLS', 'score': 0.9475831985473633},
 {'label': 'Not FLS', 'score': 0.8807168006896973},
 {'label': 'Not FLS', 'score': 0.9511891007423401},
 {'label': 'Not FLS', 'score': 0.9746822714805603},
 {'label': 'Not FLS', 'score': 0.9385780692100525},
 {'label': 'Not FLS', 'score': 0.9160388112068176},
 {'label': 'Not FLS', 'score': 0.6937950849533081},
 {'label': 'Not FLS', 'score': 0.8050178289413452},
 {'label': 'Not FLS', 'score': 0.972472608089447},
 {'label': 'Not FLS', 'score': 0.9487292170524597},
 {'label': 'Not FLS', 'score': 0.9460198879241943},
 {'label': 'Not FLS', 'score': 0.9264700412750244},
 {'label': 'N

In [49]:
df_amazon['FLG'] = results2
df_amazon['FLG'] = df_amazon['FLG'].apply(lambda x: x['label'])
label_map = {'Not FLS': 0, "Specific FLS": 1, "Non-specific FLS": 2}
df_amazon["FLG"] = df_amazon["FLG"].map(label_map)

In [53]:
add_feature(df_amazon)

Unnamed: 0,TWEET,DATE,LAST_PRICE,VOLATILITY_10D,SENTIMENTS,ESG,FLG,# Of Words,# Of StopWords,# Of Times Currency Was Mentioned,# Of Organizations Mentioned,Tweet_num
0,RT @robertoglezcano: @amazon #Patents Show Fl...,31/01/2017,823.48,13.447,1.0,2,0,18,0,0,1,117
1,RT @loadsofvans: Retweet this post &amp; follo...,31/01/2017,823.48,13.447,-1.0,2,0,21,5,2,1,117
2,RT @jhill1105: @loadsofvans Retweet this post ...,31/01/2017,823.48,13.447,-1.0,2,0,21,5,2,2,117
3,@amazon has your back when it comes to food sh...,31/01/2017,823.48,13.447,-1.0,2,0,19,8,0,0,117
4,Wish List https://t.co/tTBDy3czdm via @amazon,31/01/2017,823.48,13.447,1.0,0,0,5,0,0,1,117
...,...,...,...,...,...,...,...,...,...,...,...,...
4466,I really can’t believe same day Amazon Prime s...,29/09/2018,2003.00,20.709,-1.0,2,0,17,3,0,0,55
4467,Check out this Amazon deal: Funko POP Animatio...,29/09/2018,2003.00,20.709,-1.0,2,0,17,4,0,1,55
4468,RT @Kenneth00542118: Just saw this on Amazon: ...,30/09/2018,2003.00,20.709,1.0,0,0,18,4,1,4,66
4469,RT @JanetCBrennan: OUT NOW!\r\r\r\r\r\r\nBarne...,30/09/2018,2003.00,20.709,1.0,0,0,17,0,0,2,66


**Starbucks**

In [54]:
results = esg(df_star)

In [55]:
df_star['ESG'] = results
df_star['ESG'] = df_star['ESG'].apply(lambda x: x['label'])
label_map = {'None': 0, "Environmental": 1, "Social": 2, "Governance": 3}
df_star["ESG"] = df_star["ESG"].map(label_map)

In [56]:
results2 = fls(df_star)

In [57]:
df_star['FLG'] = results2
df_star['FLG'] = df_star['FLG'].apply(lambda x: x['label'])
label_map = {'Not FLS': 0, "Specific FLS": 1, "Non-specific FLS": 2}
df_star["FLG"] = df_star["FLG"].map(label_map)

In [58]:
add_feature(df_star)

Unnamed: 0,TWEET,DATE,LAST_PRICE,VOLATILITY_10D,SENTIMENTS,ESG,FLG,# Of Words,# Of StopWords,# Of Times Currency Was Mentioned,# Of Organizations Mentioned,Tweet_num
0,"RT @nikitakhara: Thank you, @Starbucks CEO for...",31/01/2017,55.22,23.916,1.0,2,0,19,5,0,1,187
1,RT @cultcommoncore: Dumping @Starbucks\r\r\r\r...,31/01/2017,55.22,23.916,-1.0,0,0,19,4,0,0,187
2,"RT @nia4_trump: So instead of hiring 10,000 un...",31/01/2017,55.22,23.916,-1.0,2,1,22,5,0,1,187
3,RT @RealGreek4Trump: #BoycottStarbucks @Starbu...,31/01/2017,55.22,23.916,1.0,2,0,18,1,1,1,187
4,RT @Starbucksnews: Message from Howard Schultz...,31/01/2017,55.22,23.916,1.0,2,0,16,3,0,1,187
...,...,...,...,...,...,...,...,...,...,...,...,...
1408,@angrygirl65 @Starbucks Clearly Seattle is Sta...,26/09/2018,57.27,16.290,-1.0,2,0,20,8,0,1,36
1409,RT @Starbucks: @chocohyoo Hyungwon will always...,26/09/2018,57.27,16.290,1.0,2,1,11,2,0,0,36
1410,RT @Starbucks: @chocohyoo Hyungwon will always...,29/09/2018,56.84,17.035,1.0,2,1,11,2,0,0,24
1411,@KenDilanianNBC @Starbucks If all you’re after...,29/09/2018,56.84,17.035,-1.0,2,0,16,7,0,1,24


**Nike**

In [59]:
results = esg(df_nike)

In [60]:
df_nike['ESG'] = results
df_nike['ESG'] = df_nike['ESG'].apply(lambda x: x['label'])
label_map = {'None': 0, "Environmental": 1, "Social": 2, "Governance": 3}
df_nike["ESG"] = df_nike["ESG"].map(label_map)

In [61]:
results2 = fls(df_nike)

In [62]:
df_nike['FLG'] = results2
df_nike['FLG'] = df_nike['FLG'].apply(lambda x: x['label'])
label_map = {'Not FLS': 0, "Specific FLS": 1, "Non-specific FLS": 2}
df_nike["FLG"] = df_nike["FLG"].map(label_map)

In [63]:
add_feature(df_nike)

Unnamed: 0,TWEET,DATE,LAST_PRICE,VOLATILITY_10D,SENTIMENTS,ESG,FLG,# Of Words,# Of StopWords,# Of Times Currency Was Mentioned,# Of Organizations Mentioned,Tweet_num
0,RT @Nike: Greatest ever. @serenawilliams #just...,31/01/2017,52.90,8.853,1.0,0,0,7,0,0,0,15
1,Shoe Dog: A Memoir by the Creator of @Nike htt...,31/01/2017,52.90,8.853,-1.0,2,0,14,3,0,2,15
2,@colettey6 @Nike I don't do any of those thing...,31/01/2017,52.90,8.853,-1.0,2,0,25,10,0,0,15
3,The unisex @Nike Sock Dart knocks your socks o...,31/01/2017,52.90,8.853,-1.0,1,0,23,7,0,1,15
4,Nice sentiment from @Nike. They retain my cust...,31/01/2017,52.90,8.853,1.0,2,0,9,2,0,0,15
...,...,...,...,...,...,...,...,...,...,...,...,...
7751,@FoxBusiness @Nike should have done it. I wi...,27/09/2018,84.54,20.623,-1.0,2,0,23,3,1,1,19
7752,RT @Reuters: Sales of Nike gear have soared 61...,27/09/2018,84.54,20.623,1.0,2,0,23,7,0,2,19
7753,Check out Vintage Nike Air Flight Warm Up Full...,27/09/2018,84.54,20.623,-1.0,2,0,21,1,0,0,19
7754,Check out Nike USA World Cup Away Jersey Kid's...,29/09/2018,84.72,20.470,-1.0,2,0,19,1,0,3,12


### Predict closing price using linear regression

In [64]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

**Create feature combinations function**

In [70]:
import itertools

def create_feature_combinations(features):
    feature_combinations = []
    for i in range(1, len(features)+1):
        for subset in itertools.combinations(features, i):
            feature_combinations.append(list(subset))
    return feature_combinations

**Amazon**

In [67]:
print(df_amazon.columns)

Index(['TWEET', 'DATE', 'LAST_PRICE', 'VOLATILITY_10D', 'SENTIMENTS', 'ESG',
       'FLG', '# Of Words', '# Of StopWords',
       '# Of Times Currency Was Mentioned', '# Of Organizations Mentioned'],
      dtype='object')


In [73]:
X = df_amazon[['ESG','FLG','# Of Words','# Of StopWords','# Of Times Currency Was Mentioned','# Of Organizations Mentioned']]
y = df_amazon['LAST_PRICE']

features = X.columns.tolist()
feature_comb = create_feature_combinations(X)

In [77]:
# Train and evaluate models
r2_scores = {}
for combination in feature_comb:
    X_subset = X[list(combination)]
    model = LinearRegression()
    scores = cross_val_score(model, X_subset, y, cv=5, scoring='r2')
    r2_scores[tuple(combination)] = round(scores.mean(), 4)

r2_scores

{('ESG',): -90.863,
 ('FLG',): -90.8876,
 ('# Of Words',): -90.6135,
 ('# Of StopWords',): -90.7312,
 ('# Of Times Currency Was Mentioned',): -91.2584,
 ('# Of Organizations Mentioned',): -92.7716,
 ('ESG', 'FLG'): -90.9031,
 ('ESG', '# Of Words'): -90.6324,
 ('ESG', '# Of StopWords'): -90.7476,
 ('ESG', '# Of Times Currency Was Mentioned'): -91.2741,
 ('ESG', '# Of Organizations Mentioned'): -92.795,
 ('FLG', '# Of Words'): -90.6716,
 ('FLG', '# Of StopWords'): -90.7843,
 ('FLG', '# Of Times Currency Was Mentioned'): -91.305,
 ('FLG', '# Of Organizations Mentioned'): -92.8004,
 ('# Of Words', '# Of StopWords'): -90.6227,
 ('# Of Words', '# Of Times Currency Was Mentioned'): -91.0044,
 ('# Of Words', '# Of Organizations Mentioned'): -92.4013,
 ('# Of StopWords', '# Of Times Currency Was Mentioned'): -91.1448,
 ('# Of StopWords', '# Of Organizations Mentioned'): -92.7109,
 ('# Of Times Currency Was Mentioned',
  '# Of Organizations Mentioned'): -92.8871,
 ('ESG', 'FLG', '# Of Words'): -

In [80]:
# Choose the best model with the largest R-squared
best_features = max(r2_scores, key=r2_scores.get)
best_score = r2_scores[best_features]

print("The best model of Amazon should include these features:", best_features, "with R-Squared of", best_score)

The best model of Amazon should include these features: ('# Of Words',) with R-Squared of -90.6135


**Starbucks**

In [81]:
X = df_star[['ESG','FLG','# Of Words','# Of StopWords','# Of Times Currency Was Mentioned','# Of Organizations Mentioned']]
y = df_star['LAST_PRICE']

features = X.columns.tolist()
feature_comb = create_feature_combinations(X)

In [82]:
# Train and evaluate models
r2_scores = {}
for combination in feature_comb:
    X_subset = X[list(combination)]
    model = LinearRegression()
    scores = cross_val_score(model, X_subset, y, cv=5, scoring='r2')
    r2_scores[tuple(combination)] = round(scores.mean(), 4)

In [83]:
# Choose the best model with the largest R-squared
best_features = max(r2_scores, key=r2_scores.get)
best_score = r2_scores[best_features]

print("The best model of Starbucks should include these features:", best_features, "with R-Squared of", best_score)

The best model of Starbucks should include these features: ('# Of StopWords',) with R-Squared of -2.5647


**Nike**

In [87]:
X = df_nike[['ESG','FLG','# Of Words','# Of StopWords','# Of Times Currency Was Mentioned','# Of Organizations Mentioned']]
y = df_nike['LAST_PRICE']

features = X.columns.tolist()
feature_comb = create_feature_combinations(X)

In [88]:
# Train and evaluate models
r2_scores = {}
for combination in feature_comb:
    X_subset = X[list(combination)]
    model = LinearRegression()
    scores = cross_val_score(model, X_subset, y, cv=5, scoring='r2')
    r2_scores[tuple(combination)] = round(scores.mean(), 4)

In [89]:
# Choose the best model with the largest R-squared
best_features = max(r2_scores, key=r2_scores.get)
best_score = r2_scores[best_features]

print("The best model of Nike should include these features:", best_features, "with R-Squared of", best_score)

The best model of Nike should include these features: ('# Of Organizations Mentioned',) with R-Squared of -9.678377079221854e+24


### Predict 10-day volatility using linear regression

**Amazon**

In [90]:
X = df_amazon[['ESG','FLG','# Of Words','# Of StopWords','# Of Times Currency Was Mentioned','# Of Organizations Mentioned']]
y = df_amazon['VOLATILITY_10D']

features = X.columns.tolist()
feature_comb = create_feature_combinations(X)

In [91]:
# Train and evaluate models
r2_scores = {}
for combination in feature_comb:
    X_subset = X[list(combination)]
    model = LinearRegression()
    scores = cross_val_score(model, X_subset, y, cv=5, scoring='r2')
    r2_scores[tuple(combination)] = round(scores.mean(), 4)

In [92]:
# Choose the best model with the largest R-squared
best_features = max(r2_scores, key=r2_scores.get)
best_score = r2_scores[best_features]

print("The best model of Amazon should include these features:", best_features, "with R-Squared of", best_score)

The best model of Amazon should include these features: ('# Of Times Currency Was Mentioned',) with R-Squared of -3.0033


**Starbucks**

In [93]:
X = df_star[['ESG','FLG','# Of Words','# Of StopWords','# Of Times Currency Was Mentioned','# Of Organizations Mentioned']]
y = df_star['VOLATILITY_10D']

features = X.columns.tolist()
feature_comb = create_feature_combinations(X)

In [94]:
# Train and evaluate models
r2_scores = {}
for combination in feature_comb:
    X_subset = X[list(combination)]
    model = LinearRegression()
    scores = cross_val_score(model, X_subset, y, cv=5, scoring='r2')
    r2_scores[tuple(combination)] = round(scores.mean(), 4)

In [95]:
# Choose the best model with the largest R-squared
best_features = max(r2_scores, key=r2_scores.get)
best_score = r2_scores[best_features]

print("The best model of Starbucks should include these features:", best_features, "with R-Squared of", best_score)

The best model of Starbucks should include these features: ('FLG', '# Of Words', '# Of Organizations Mentioned') with R-Squared of -4.1167


**Nike**

In [96]:
X = df_nike[['ESG','FLG','# Of Words','# Of StopWords','# Of Times Currency Was Mentioned','# Of Organizations Mentioned']]
y = df_nike['VOLATILITY_10D']

features = X.columns.tolist()
feature_comb = create_feature_combinations(X)

In [97]:
# Train and evaluate models
r2_scores = {}
for combination in feature_comb:
    X_subset = X[list(combination)]
    model = LinearRegression()
    scores = cross_val_score(model, X_subset, y, cv=5, scoring='r2')
    r2_scores[tuple(combination)] = round(scores.mean(), 4)

In [98]:
# Choose the best model with the largest R-squared
best_features = max(r2_scores, key=r2_scores.get)
best_score = r2_scores[best_features]

print("The best model of Nike should include these features:", best_features, "with R-Squared of", best_score)

The best model of Nike should include these features: ('ESG', '# Of Words') with R-Squared of -1.2273
