## Extract Text Features

In [None]:
pip install transformers

In [None]:
import gc
import os
import csv
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
import sys

# Project folder. It is appended to sys.path, and later cells read it back via
# sys.path[-1] to build file paths, so it has to stay the last sys.path entry.
project_root = '/content/drive/MyDrive/Colab Notebooks/FML Project'
sys.path.append(project_root)

## Set Device

In [None]:
# Choose the compute device: CUDA when PyTorch reports a GPU, otherwise the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print("GPU is available." if has_cuda else "Using CPU.")

GPU is available.


## Load Text Data

In [None]:
# Load the table of tweets and labels (one row per stock / target date).
# The CSV's first column has no header and would otherwise be loaded as a stray
# 'Unnamed: 0' data column; index_col=0 uses it as the row index instead.
# NOTE(review): it looks like a saved row index -- confirm against the code
# that wrote df_final.csv.
# NOTE(review): sys.path[-1] is assumed to still be the project folder appended
# in the path-setup cell -- confirm nothing has appended to sys.path since.
df = pd.read_csv(sys.path[-1]+'/Data/df_final.csv', index_col=0)
df.head()

Unnamed: 0,stock,target_date,image_name,tweet,label
0,AAPL,2014-01-09,AAPL@2014-01-09#0.png,$ aapl i love my ipad b / c no virus and i am ...,0
1,AAPL,2014-01-10,AAPL@2014-01-10#0.png,$ aapl please help me understand the math.bill...,0
2,AAPL,2014-01-14,AAPL@2014-01-14#1.png,$ aapl what's behind the swift rise in apple s...,1
3,AAPL,2014-01-15,AAPL@2014-01-15#1.png,$ aapl what's behind the swift rise in apple s...,1
4,AAPL,2014-01-16,AAPL@2014-01-16#0.png,$ aapl expect solid results and guidance from ...,0


In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus.
nltk.download('stopwords')
# Stored as a set: the cleaning loop only ever tests tokens for membership, and
# a set makes each of those lookups constant-time instead of a list scan.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def clean_tweet_text(raw_tweets, stop_word_set, max_tokens=256):
    """Turn one '<sep>'-delimited tweet string into a cleaned, clipped document.

    Args:
        raw_tweets: All tweets of one row, separated by '<sep>'.
        stop_word_set: Tokens to discard; any container supporting ``in``.
        max_tokens: Scanning stops once this many tokens have been kept.

    Returns:
        The kept tokens joined by single spaces.
    """
    # The segment after the final '<sep>' is discarded.
    # NOTE(review): presumably every string ends with '<sep>', making that
    # segment empty -- otherwise the last tweet of each row is dropped; confirm.
    merged = ' '.join(raw_tweets.split('<sep>')[:-1])
    kept = []
    for token in merged.split(' '):
        # Keep purely alphabetic tokens that are neither the literal 'URL'
        # token nor a stop word.
        if token.isalpha() and token != 'URL' and token not in stop_word_set:
            kept.append(token)
        if len(kept) >= max_tokens:
            break
    return ' '.join(kept)


# Constant-time membership tests regardless of how stop_words is stored.
stop_word_set = set(stop_words)
# Iterate over the column directly rather than looking rows up one by one with
# df.iloc[i], which builds a full row object on every iteration.
text_data = [clean_tweet_text(raw, stop_word_set) for raw in tqdm(df['tweet'])]

100%|██████████| 18543/18543 [00:06<00:00, 2840.96it/s]


In [None]:
# text_data_concat = []
# for tweets in tqdm(text_data):
#     tweet_concat = ' '.join(tweets)
#     text_data_concat.append(tweet_concat)

In [None]:
# text_data_concat_clipped = []
# for tweets in tqdm(text_data_concat):
#     tweet_concat = ' '.join(tweets.split(' ')[:256])
#     text_data_concat_clipped.append(tweet_concat)

In [None]:
# Spot-check the first cleaned document.
text_data[0]

'aapl love ipad b c virus protected apple android open source full virus etc want car rt could interesting year technology long time aapl aapl video much app store mean apple aapl samsung report profit decline phone sales lag fslr aapl mu c yoku sd mrk v active equity options trading open full story aapl seeing potential hold higher low lod key trgt grind earnings aapl lows day rt apple reports billion worth apps sold aapl apple reports billion worth apps sold aapl crazy aapl apple announces app store sales topped billion stock stocks stockaction apple aapl rich fat happy ssnlf estimates decline profits sales amid stiff competition aapl know want hater iphone yousawitherefirst aapl rt samsung lowers guidance day micael bay walks stage ces promotional event need copy aapl choreography rt build mini portfolio shares aamc aapl amzn brk tprp wanna sell china better use chinese channels aapl opens store tmall rt us smpartphone penetration aapl august samsung share points end nov wait aapl a

In [None]:
# Number of cleaned documents (the cleaning loop appends one per row of df).
len(text_data)

18543

In [None]:
# Space-separated token count of the first document; the cleaning loop stops
# collecting once 256 tokens have been kept.
len(text_data[0].split(' '))

256

## Download Pre-trained Models

In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
# Single identifier shared by the tokenizer and the encoder weights.
finbert_checkpoint = "ProsusAI/finbert"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
# tokenizer.save_pretrained(sys.path[-1]+"/Models/finbert.tokenizer")

# Encoder, moved onto the device selected earlier.
finbert = AutoModel.from_pretrained(finbert_checkpoint).to(device)
# finbert.save_pretrained(sys.path[-1]+"/Models/finbert")

## Encode Text Data

In [None]:
# Tokenize every cleaned document in one call: pad to the longest sequence,
# truncate at 256 tokens, return PyTorch tensors and move them to the device.
# The tokenizer is called directly because `batch_encode_plus` is deprecated in
# favour of `__call__`, which accepts the same arguments. text_data is already
# a list, so no copy is made.
encoded_text = tokenizer(text_data,
                         padding=True,
                         truncation=True,
                         max_length=256,
                         return_tensors='pt').to(device)
print('Text data encoded!')

Text data encoded!


In [None]:
# Mini-batch size for the FinBERT forward passes.
batch_size = 16

# Pair token ids with their attention masks so each batch yields both.
model_inputs = (encoded_text['input_ids'], encoded_text['attention_mask'])
text_dataset = TensorDataset(*model_inputs)
# No shuffle argument is passed, so batches follow the dataset order.
text_loader = DataLoader(text_dataset, batch_size=batch_size)

In [None]:
# Run FinBERT over every batch and collect the token-level hidden states
# (last_hidden_state) on the CPU, one tensor per batch.
# NOTE(review): for the full dataset this list is large (18543 x 256 x 768
# values, roughly 14.6 GB if float32) -- confirm the runtime has enough RAM.
text_features = []

finbert.eval()  # evaluation mode (disables dropout)
with torch.no_grad():  # feature extraction only, no autograd graph needed
    for input_ids, attention_mask in tqdm(text_loader):
        # .to(device) is a no-op when the batch is already on the device, and
        # keeps the loop working if the encoded tensors are left on the CPU.
        hidden = finbert(input_ids=input_ids.to(device),
                         attention_mask=attention_mask.to(device)).last_hidden_state
        # Copy to host memory right away so the device buffer can be reused.
        text_features.append(hidden.cpu())
        del hidden

# Clean up once after the loop. Running gc.collect() and
# torch.cuda.empty_cache() on every batch only slows the loop down: the
# per-batch tensor is already released by `del`, and its memory is reused by
# PyTorch's caching allocator on the next iteration.
gc.collect()
torch.cuda.empty_cache()

100%|██████████| 1159/1159 [03:49<00:00,  5.06it/s]


In [None]:
# Join the per-batch outputs along the leading axis (torch.cat's default
# dim=0) into a single tensor.
features = torch.cat(text_features)
print('Text features shape: ', features.shape)

Text features shape:  torch.Size([18543, 256, 768])


In [None]:
# Persist the feature tensor under the project's Data/ folder.
# NOTE(review): relies on sys.path[-1] still being the project folder appended
# in the path-setup cell -- confirm nothing has appended to sys.path since.
features_path = sys.path[-1]+"/Data/text_features_256.pt"
torch.save(features, features_path)