<a href="https://colab.research.google.com/github/sanspareilsmyn/mldl_sandbox/blob/main/lstm%2Bbert_sentiment_stockwits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''
> ##### Sample input messages ######
> print(messages)
["$AMZN sick! they’re running a prime flash sale on shares too!", 
"$AAPL has a good Piotroski-F score of 7.00. This indicates a good health and profitability. https://www.chartmill.com/analyze.php?utm_source=stocktwits&amp;utm_medium=FA&amp;utm_content=PROFITABILITY&amp;utm_campaign=social_tracking#/AAPL?r=fa&amp;key=bb853040-a4ac-41c6-b549-d218d2f21b32", "$FB got rid of this trash today, 
i admit that bears were right", ...]
> print(sentiments)
[4, 2, 0, ...]
'''

In [2]:
# https://towardsdatascience.com/lstm-vs-bert-a-step-by-step-guide-for-tweet-sentiment-analysis-ced697948c47

In [3]:
#1. Preprocessing

In [None]:
import re

def preprocess(message):
  '''
  Normalize a raw message into lowercase, space-separated words.

  The following operations are applied in this order:
  - lowercase
  - remove URLs (http:// or https://)
  - remove ticker symbols ($ followed by letters/digits)
  - remove usernames (@ followed by letters/digits)
  - replace every character that is not an ASCII letter or an ASCII
    apostrophe with a space (so digits, punctuation and typographic
    quotes are dropped, but "don't" is kept intact)
  - remove any single character tokens

  Parameters
  ----------
    message : The text message to be preprocessed

  Returns
  ----------
    text : The preprocessed text, tokens joined by single spaces
  '''

  # Lowercase the message
  text = message.lower()
  # Replace URLs with a space. Raw strings are used for every pattern so that
  # regex escapes are not interpreted as (invalid) Python string escapes.
  text = re.sub(r'https?://[a-zA-Z0-9@:%._/+~#=?&;-]*', ' ', text)
  # Replace ticker symbols with a space. The ticker symbols are any stock symbol that starts with $.
  text = re.sub(r'\$[a-zA-Z0-9]*', ' ', text)
  # Replace StockTwits usernames with a space. The usernames are any word that starts with @.
  text = re.sub(r'@[a-zA-Z0-9]*', ' ', text)
  # Replace everything not a letter or apostrophe with a space
  text = re.sub(r"[^a-zA-Z']", ' ', text)
  # Remove single letter words; split()/join also collapses repeated whitespace
  text = ' '.join(w for w in text.split() if len(w) > 1)

  return text

# Clean every raw message in order (tqdm displays progress while iterating)
preprocessed = list(map(preprocess, tqdm(messages)))

In [None]:
'''
> ###### Input messages after preprocessing ######
> print(preprocessed)
["sick they re running prime flash sale on shares too", 
"has good piotroski score of this indicates good health and profitability", 
"got rid of this trash today admit that bears were right", ...]
'''

In [3]:
#2. Tokenize

In [None]:
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def tokenize_text(text, option):
  '''
  Tokenize the input text as per specified option
    1. Use python split() function
    2. Use regex to extract alphabetic words, keeping a trailing n't or 's attached
    3. Use NLTK word_tokenize(), keeping alphabetic tokens only
    4. Use NLTK word_tokenize(), keep alphabetic tokens, remove English
       stopwords and apply lemmatization (noun first, then verb)

  Parameters
  ----------
    text : The string to tokenize
    option : Integer between 1 and 4 selecting the strategy above

  Returns
  ----------
    A list of token strings. Any other option value logs a warning and
    returns an empty list.
  '''

  if option == 1:
    return text.split()
  elif option == 2:
    # Alternatives are tried left to right: "<letters>n't", "<letters>'s", plain letters.
    # The character classes contain letters only (no stray '[').
    return re.findall(r"\b([a-zA-Z]+n't|[a-zA-Z]+'s|[a-zA-Z]+)\b", text)
  elif option == 3:
    return [word for word in word_tokenize(text) if word.isalpha()]
  elif option == 4:
    words = [word for word in word_tokenize(text) if word.isalpha()]
    # Remove stop words
    stop = set(stopwords.words('english'))
    words = [word for word in words if word not in stop]
    # Lemmatize words (first noun, then verb)
    wnl = nltk.stem.WordNetLemmatizer()
    lemmatized = [wnl.lemmatize(wnl.lemmatize(word, 'n'), 'v') for word in words]
    return lemmatized
  else:
    # Logger.warn() is a deprecated alias; warning() is the supported method
    logger.warning("Please specify option value between 1 and 4")
    return []

In [None]:
#3. Corpus and Vocab

In [None]:
def create_vocab(messages, show_graph=False):
  '''
  Build a word -> integer id vocabulary from a collection of messages.

  Each message is tokenized with tokenize_text(message, 3) and all tokens are
  pooled into one corpus. Every distinct word is then given an id starting
  from 1, in the order the words are first encountered. Id 0 is never
  assigned, which leaves it available as the padding value that
  tokenizer_lstm writes into unused positions.

  Parameters
  ----------
    messages : Iterable of text messages
    show_graph : When True, additionally draw a word cloud and a bar chart
                 of the most frequent words

  Returns
  ----------
    vocab : dict mapping each word to its integer id (1-based)
  '''
  corpus = []
  for message in tqdm(messages, desc="Tokenizing"):
    corpus.extend(tokenize_text(message, 3))
  logger.info("The number of all words: {}".format(len(corpus)))

  # Create Counter
  counts = Counter(corpus)

  # Words ordered from most to least frequent (used for logging only)
  bow = sorted(counts, key=counts.get, reverse=True)
  logger.info("Top 40 frequent words: {}".format(bow[:40]))

  # Indexing vocab, starting from 1
  vocab = {word: ii for ii, word in enumerate(counts, 1)}

  if show_graph:
    from wordcloud import WordCloud, STOPWORDS
    # Generate Word Cloud image
    text = " ".join(corpus)
    # Named cloud_stopwords so it does not shadow the NLTK `stopwords` corpus
    cloud_stopwords = set(STOPWORDS)
    cloud_stopwords.update(["will", "report", "reporting", "market", "stock", "share"])

    wordcloud = WordCloud(stopwords=cloud_stopwords, max_font_size=50, max_words=100, background_color="white", collocations=False).generate(text)
    plt.figure(figsize=(15,7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    # Show most frequent words in a bar graph (stopwords are filtered after
    # taking the top 80, so fewer than 80 bars may be drawn)
    x, y = [], []
    for word, count in counts.most_common(80):
      if word not in cloud_stopwords:
        x.append(word)
        y.append(count)
    plt.figure(figsize=(12,10))
    sns.barplot(x=y, y=x)
    plt.show()

  return vocab

# Build the vocabulary from the cleaned messages and draw the frequency plots
vocab = create_vocab(preprocessed, show_graph=True)

In [None]:
#4. LSTM

In [None]:
from torch import nn

class LstmTextClassifier(nn.Module):
  '''
  Text classifier: embedding -> multi-layer LSTM -> dropout -> linear head -> log-softmax.

  The LSTM is built with batch_first=False and forward() reads the last step
  along dim 0 of the LSTM output, so token id tensors are expected with shape
  (seq_len, batch).
  '''
  def __init__(self, vocab_size, embed_size, lstm_size, dense_size, output_size, lstm_layers=2, dropout=0.1):
    '''
    Parameters
    ----------
      vocab_size : Number of rows in the embedding table
      embed_size : Embedding dimension (LSTM input size)
      lstm_size : LSTM hidden size
      dense_size : Width of the optional intermediate linear layer; 0 disables it
      output_size : Number of output classes
      lstm_layers : Number of stacked LSTM layers
      dropout : Dropout probability, used both between LSTM layers and on the
                final LSTM output
    '''
    super().__init__()
    self.vocab_size = vocab_size
    self.embed_size = embed_size
    self.lstm_size = lstm_size
    self.dense_size = dense_size
    self.output_size = output_size
    self.lstm_layers = lstm_layers

    self.embedding = nn.Embedding(vocab_size, embed_size)
    self.lstm = nn.LSTM(embed_size, lstm_size, lstm_layers, dropout=dropout, batch_first=False)
    # self.dropout is the Dropout module; the raw probability is not kept
    # as a separate attribute
    self.dropout = nn.Dropout(dropout)
    # Insert an additional fully connected layer when combining with other inputs
    if dense_size == 0:
      self.fc = nn.Linear(lstm_size, output_size)
    else:
      self.fc1 = nn.Linear(lstm_size, dense_size)
      self.fc2 = nn.Linear(dense_size, output_size)

    self.softmax = nn.LogSoftmax(dim=1)

  def init_hidden(self, batch_size):
    '''
    Create an all-zero (h_0, c_0) pair for the LSTM.

    Both tensors have shape (lstm_layers, batch_size, lstm_size) and share the
    dtype and device of the model's first parameter.
    '''
    weight = next(self.parameters())
    shape = (self.lstm_layers, batch_size, self.lstm_size)

    return (weight.new_zeros(shape), weight.new_zeros(shape))

  def forward(self, nn_input_text, hidden_state):
    '''
    Perform a forward pass of the model on nn_input_text.

    Parameters
    ----------
      nn_input_text : Token id tensor; dim 0 is treated as the sequence axis
      hidden_state : (h, c) tuple as produced by init_hidden()

    Returns
    ----------
      logps : Log-probabilities of shape (batch, output_size)
      hidden_state : The LSTM state after consuming the input
    '''
    nn_input_text = nn_input_text.long()
    embeds = self.embedding(nn_input_text)
    lstm_out, hidden_state = self.lstm(embeds, hidden_state)
    # Keep only the output of the last time step, then apply dropout
    lstm_out = lstm_out[-1, :, :]
    lstm_out = self.dropout(lstm_out)
    # Insert an additional fully connected layer when combining with other inputs
    if self.dense_size == 0:
      out = self.fc(lstm_out)
    else:
      dense_out = self.fc1(lstm_out)
      out = self.fc2(dense_out)

    logps = self.softmax(out)

    return logps, hidden_state

# Define LSTM Tokenizer
def tokenizer_lstm(X, vocab, seq_len, padding):
  '''
  Convert texts into a fixed-width tensor of vocabulary ids.

  Each text is tokenized with tokenize_text(text, 3) and mapped through vocab.
  Words missing from vocab are skipped rather than raising KeyError. Rows
  longer than seq_len keep their first seq_len ids; shorter rows are filled
  with 0 on the requested side.

  Parameters
  ----------
    X : Sequence of text strings
    vocab : dict mapping word -> integer id
    seq_len : Width of every output row
    padding : 'right' (ids first, zeros after) or 'left' (zeros first, ids after)

  Returns
  ----------
    torch.int64 tensor of shape (len(X), seq_len)

  Raises
  ----------
    ValueError : If padding is neither 'left' nor 'right'
  '''
  if padding not in ('left', 'right'):
    # Any other value used to return an all-zero tensor without warning
    raise ValueError("padding must be 'left' or 'right', got {!r}".format(padding))

  X_tmp = np.zeros((len(X), seq_len), dtype=np.int64)
  for i, text in enumerate(X):
    tokens = tokenize_text(text, 3)
    # Drop out-of-vocabulary words, then truncate to the row width
    token_ids = [vocab[word] for word in tokens if word in vocab][:seq_len]
    if not token_ids:
      continue
    if padding == 'right':
      X_tmp[i, :len(token_ids)] = token_ids
    else:
      X_tmp[i, seq_len - len(token_ids):] = token_ids

  return torch.tensor(X_tmp, dtype=torch.int64)

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# 5. BERT: pretrained 'bert-base-uncased' tokenizer (lower-casing enabled)
# and a sequence classifier configured for 5 labels
tokenizer_bert = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5,
)

In [None]:
#6. Dataset

In [None]:
from torch.utils.data import DataLoader, Dataset

# Define a DataSet Class which simply return (x, y) pair
class SimpleDataset(Dataset):
  '''
  Dataset that serves pre-built (x, y) pairs.

  Pairs are materialized once in the constructor by indexing x and y with
  0 .. len(y)-1, so the dataset length always equals len(y).
  '''
  def __init__(self, x, y):
    pairs = []
    for pos in range(len(y)):
      pairs.append((x[pos], y[pos]))
    self.datalist = pairs

  def __getitem__(self, idx):
    # Return the stored (x, y) tuple at position idx
    return self.datalist[idx]

  def __len__(self):
    return len(self.datalist)

# Data Loader
def create_data_loader(X, y, indices, batch_size, shuffle):
  '''
  Build a DataLoader over the subset of (X, y) selected by indices.

  X is converted to an object array and y to an integer array before the
  rows at `indices` are picked and wrapped in a SimpleDataset.
  '''
  features = np.array(X, dtype=object)
  labels = np.array(y)
  subset = SimpleDataset(features[indices], labels[indices].astype(int))
  return torch.utils.data.DataLoader(subset, batch_size=batch_size, shuffle=shuffle)

In [None]:
#7. Sampling Cycle

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
import torch.nn.functional as F
from torch.autograd import Variable

def train_cycles(X_all, y_all, vocab, num_samples, model_type, epochs, patience, batch_size, seq_len, lr, clip, log_level):
  '''
  Train a fresh model once per requested training-set size and collect metrics.

  For every n in num_samples a stratified split draws n training examples
  plus a validation set 20% of that size, a new model is created, and
  train_nn_model() is run on it. Only the train_nn_model() call is timed.

  Parameters
  ----------
    X_all, y_all : Full set of inputs and labels to sample from
    vocab : Vocabulary dict (its size determines the LSTM embedding table)
    num_samples : Iterable of training-set sizes to run
    model_type : 'LSTM' or 'BERT'
    epochs, patience, batch_size, seq_len, lr, clip, log_level :
        Passed through to train_nn_model() (batch_size is also used for the
        data loaders)

  Returns
  ----------
    result : DataFrame indexed by sample size with columns
             'Accuracy', 'F1(macro)', 'Total_Time' (seconds) and 'ms/text'
    model_trained : Model returned by the last training run, or None when
                    num_samples is empty

  Raises
  ----------
    ValueError : If model_type is not 'LSTM' or 'BERT'
  '''
  if model_type not in ('LSTM', 'BERT'):
    raise ValueError("model_type must be 'LSTM' or 'BERT', got {!r}".format(model_type))

  result = pd.DataFrame(columns=['Accuracy', 'F1(macro)', 'Total_Time', 'ms/text'], index=num_samples)
  # Defined up front so the return statement is valid even with no sample sizes
  model_trained = None

  for n in num_samples:
    print("")
    logger.info("############### Start training for %d samples ###############" %n)

    # Stratified sampling: both sizes are expressed as fractions of the full data
    train_size = n / len(y_all)
    sss = StratifiedShuffleSplit(n_splits=1, train_size=train_size, test_size=train_size*0.2, random_state=rand_seed)
    train_indices, valid_indices = next(sss.split(X_all, y_all))

    # Sample input data
    train_loader = create_data_loader(X_all, y_all, train_indices, batch_size, True)
    valid_loader = create_data_loader(X_all, y_all, valid_indices, batch_size, False)

    if model_type == 'LSTM':
      # len(vocab)+1 rows: vocabulary ids start at 1 and id 0 is the padding value
      model = LstmTextClassifier(len(vocab)+1, embed_size=512, lstm_size=1024, dense_size=0, output_size=5, lstm_layers=4, dropout=0.2)
      model.embedding.weight.data.uniform_(-1, 1)
    else:
      model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

    start_time = time.perf_counter()
    acc, f1, model_trained = train_nn_model(model, model_type, train_loader, valid_loader, vocab, epochs, patience, batch_size, seq_len, lr, clip, log_level)
    end_time = time.perf_counter()
    duration = end_time - start_time
    logger.info("Process Time (sec): {}".format(duration))
    result.loc[n] = (round(acc,4), round(f1,4), duration, duration/n*1000)

  return result, model_trained

# Define metrics
from sklearn.metrics import accuracy_score, f1_score

def metric(y_true, y_pred):
  '''
  Score predictions against ground-truth labels.

  Returns
  ----------
    (acc, f1) : accuracy_score and macro-averaged f1_score of the predictions
  '''
  return (
    accuracy_score(y_true, y_pred),
    f1_score(y_true, y_pred, average='macro'),
  )

In [None]:
#8. Training the Neural Net Model

In [None]:
from transformers import AdamW as AdamW_HF, get_linear_schedule_with_warmup

