In [21]:
%%capture
#@title **Mount Google Drive**

from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [39]:
#@title **Import Statements**
import io
import numpy
import tensorflow
import pandas
import sklearn
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

In [23]:
#@title **Locate GPU**

device_name = tensorflow.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print("No GPU located.")
  raise SystemError('Halting execution.')

In [24]:
#@title **Import Dataset**
all_data = pandas.read_csv("/content/drive/MyDrive/DapperData/GoEmotionsComplete.csv")

emotion_labels = all_data.columns[1:29].tolist()
sentiment_labels = all_data.columns[29:].tolist()

In [25]:
#@title **Pre-Process Text**

all_data["text"] = all_data["text"].apply(lambda line : re.sub('[^a-zA-Z,\.\'"\!\?\(\)\[\]\s]', ' ', line))
all_data["text"] = all_data["text"].apply(lambda line : re.sub('\s+', ' ', line))

In [26]:
#@title **Split Data**
all_text = all_data['text'].values.astype(str)
all_emotions = all_data.iloc[:, 1:29].values
all_markers = all_data.iloc[:, 29:].values

x_train, x_test, y_train, y_test, z_train, z_test = train_test_split(all_text, all_emotions, all_markers, test_size = 0.2)

In [27]:
#@title **Create Tokenizer**

word_tokenizer = tensorflow.keras.preprocessing.text.Tokenizer()
word_tokenizer.fit_on_texts(x_train)

number_of_words = 9000

word_tokenizer.word_index = {word : index for word, index in word_tokenizer.word_index.items() if index <= number_of_words}
word_tokenizer.word_index[word_tokenizer.oov_token] = number_of_words + 1

In [28]:
#@title **Tokenize Data**
x_train = word_tokenizer.texts_to_sequences(x_train)
x_test = word_tokenizer.texts_to_sequences(x_test)

vocabulary_size = len(word_tokenizer.word_index) + 1

maximum_sequence_length = 30

x_train = tensorflow.keras.preprocessing.sequence.pad_sequences(x_train, padding = 'post', maxlen = maximum_sequence_length)
x_test = tensorflow.keras.preprocessing.sequence.pad_sequences(x_test, padding = 'post', maxlen = maximum_sequence_length)

In [29]:
#@title **Import and Create Embedding Matrix**
def create_embedding_matrix(filepath, word_index, embedding_dim):
  vocab_size = len(word_index) + 1
  embedding_matrix = numpy.zeros((vocab_size, embedding_dim))

  with open(filepath,encoding='utf-8') as f:
    for line in f:
        word, *vector = line.split()
        if word in word_index:
            idx = word_index[word]
            embedding_matrix[idx] = numpy.array(vector, dtype=numpy.float32)[:embedding_dim]
    return embedding_matrix

vectorizer_file_path = "/content/drive/MyDrive/DapperData/glove.6B.300d.txt"

embedding_dimension = 300

embedding_matrix = create_embedding_matrix(vectorizer_file_path, word_tokenizer.word_index, embedding_dimension)

In [30]:
#@title **Define Metrics For Models**

metrics = [tensorflow.metrics.BinaryAccuracy(name='accuracy'), tensorflow.metrics.MeanSquaredError(name='loss')]

In [31]:
#@title **Initialize Attention Layer**

class Attention(tensorflow.keras.layers.Layer):
  def __init__(self, return_sequences=True):
    self.return_sequences = return_sequences
    super(Attention,self).__init__()
      
  def build(self, input_shape):
    self.W = self.add_weight(name="att_weight", shape=(input_shape[-1],1), initializer="normal")
    self.b = self.add_weight(name="att_bias", shape=(input_shape[1],1), initializer="zeros")
    super(Attention,self).build(input_shape)
  
  def call(self, x):
    e = tensorflow.keras.backend.tanh(tensorflow.keras.backend.dot(x,self.W)+self.b)
    a = tensorflow.keras.backend.softmax(e, axis=1)
    output = x*a
    
    if self.return_sequences:
      return output
    
    return tensorflow.keras.backend.sum(output, axis=1)

In [46]:
#@title **Make Bidirectional Long-Short Term Memory Network**
with tensorflow.device('/device:GPU:0'):
  lstm = tensorflow.keras.models.Sequential()
  lstm.add(tensorflow.keras.layers.Embedding(vocabulary_size, embedding_dimension, weights = [embedding_matrix], input_length = maximum_sequence_length, trainable = True))
  lstm.add(tensorflow.keras.layers.Bidirectional(tensorflow.keras.layers.LSTM(64, return_sequences = True)))
  lstm.add(tensorflow.keras.layers.Dropout(0.2))
  lstm.add(tensorflow.keras.layers.Bidirectional(tensorflow.keras.layers.LSTM(128, return_sequences = True)))
  lstm.add(tensorflow.keras.layers.Dropout(0.2))
  lstm.add(Attention(return_sequences=True))
  lstm.add(tensorflow.keras.layers.Bidirectional(tensorflow.keras.layers.LSTM(64, return_sequences = False)))
  lstm.add(tensorflow.keras.layers.Dense(3, activation='sigmoid'))
  lstm_optimizer = tensorflow.keras.optimizers.Adam(learning_rate = 0.0001)

  lstm.compile(loss = "binary_crossentropy", optimizer = lstm_optimizer, metrics = metrics)
  lstm.summary()

  lstm_callbacks = [tensorflow.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 5)]

  lstm.fit(x_train, z_train, epochs = 50, verbose = True, callbacks = lstm_callbacks, validation_data = (x_test, z_test), batch_size = 128)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 30, 300)           2700600   
                                                                 
 bidirectional_9 (Bidirectio  (None, 30, 128)          186880    
 nal)                                                            
                                                                 
 dropout_8 (Dropout)         (None, 30, 128)           0         
                                                                 
 bidirectional_10 (Bidirecti  (None, 30, 256)          263168    
 onal)                                                           
                                                                 
 dropout_9 (Dropout)         (None, 30, 256)           0         
                                                                 
 attention_3 (Attention)     (None, 30, 256)          

In [33]:
#@title **Make Convolutional Network**
with tensorflow.device('/device:GPU:0'):
  cnn = tensorflow.keras.models.Sequential()
  cnn.add(tensorflow.keras.layers.Embedding(vocabulary_size, embedding_dimension, weights = [embedding_matrix], input_length = maximum_sequence_length, trainable = True))
  cnn.add(tensorflow.keras.layers.Conv1D(256, 3, activation='relu'))
  cnn.add(tensorflow.keras.layers.Dropout(0.1))
  cnn.add(tensorflow.keras.layers.GlobalMaxPooling1D())
  cnn.add(tensorflow.keras.layers.Dense(28, activation='sigmoid'))
  cnn_optimizer = tensorflow.keras.optimizers.Adam(learning_rate = 0.0001)
  cnn.compile(optimizer = cnn_optimizer, loss = 'binary_crossentropy', metrics = metrics)
  cnn.summary()

  cnn_callbacks = [tensorflow.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 5)]
  cnn.fit(x_train, y_train, epochs = 25, verbose = True, callbacks = cnn_callbacks, validation_data = (x_test, y_test), batch_size = 10)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 30, 300)           2700600   
                                                                 
 conv1d_1 (Conv1D)           (None, 28, 256)           230656    
                                                                 
 dropout_7 (Dropout)         (None, 28, 256)           0         
                                                                 
 global_max_pooling1d_1 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 28)                7196      
                                                                 
Total params: 2,938,452
Trainable params: 2,938,452
Non-trainable params: 0
____________________________________________

In [40]:
cnn_predictions = cnn.predict(x_test)
cnn_predictions = (cnn_predictions > 0.25).astype(int)
cnn_f1_score = f1_score(y_test, cnn_predictions, average='micro')
cnn_precision = precision_score(y_test, cnn_predictions, average='micro')
cnn_recall = recall_score(y_test, cnn_predictions, average='micro')

print("CNN F1 Score:", cnn_f1_score)
print("CNN Precision:", cnn_precision)
print("CNN Recall:", cnn_recall)

# F1 Score Calculation for LSTM
lstm_predictions = lstm.predict(x_test)
lstm_predictions = (lstm_predictions > 0.425).astype(int)
lstm_f1_score = f1_score(z_test, lstm_predictions, average='micro')
lstm_precision = precision_score(z_test, lstm_predictions, average='micro')
lstm_recall = recall_score(z_test, lstm_predictions, average='micro')

print("LSTM F1 Score:", lstm_f1_score)
print("LSTM Precision:", lstm_precision)
print("LSTM Recall:", lstm_recall)

CNN F1 Score: 0.4307125816823981
LSTM F1 Score: 0.5295567103369196
CNN Precision: 0.3792295164309959
CNN Recall: 0.49836977176804753
LSTM Precision: 0.5213903101054502
LSTM Recall: 0.537982997496716


In [35]:
#@title **Prediction Function**
def most_common_emotions(emotions):
    all_emotions = [emotion for sublist in emotions for emotion in sublist]

    emotion_count = {}

    for emotion in all_emotions:
        if emotion in emotion_count:
            emotion_count[emotion] += 1
        else:
            emotion_count[emotion] = 1

    sorted_emotions = sorted(emotion_count, key = lambda x: emotion_count[x], reverse=True)

    return sorted_emotions[:3]

def model_predict(text):
  increment_value = 0.2
  processed_text = tensorflow.keras.preprocessing.sequence.pad_sequences(word_tokenizer.texts_to_sequences([text, ""]), padding = 'post', maxlen = maximum_sequence_length)

  emotion_prediction_values = cnn.predict(processed_text, verbose = 0)[0]
  sentiment_prediction_values = lstm.predict(processed_text, verbose = 0)[0]

  for i in range(3):
      if sentiment_prediction_values[i] >= 0.425:
        if i == 0: #positive
          emotion_prediction_values[0] += increment_value
          emotion_prediction_values[1] += increment_value
          emotion_prediction_values[4] += increment_value
          emotion_prediction_values[5] += increment_value
          emotion_prediction_values[8] += increment_value
          emotion_prediction_values[13] += increment_value
          emotion_prediction_values[15] += increment_value
          emotion_prediction_values[17] += increment_value
          emotion_prediction_values[18] += increment_value
          emotion_prediction_values[20] += increment_value
          emotion_prediction_values[21] += increment_value
          emotion_prediction_values[23] += increment_value
          
        elif i == 1: #ambiguous
          emotion_prediction_values[6] += increment_value / 2
          emotion_prediction_values[7] += increment_value / 2
          emotion_prediction_values[22] += increment_value / 2
          emotion_prediction_values[26] += increment_value / 2

        else: #negative
          emotion_prediction_values[2] += increment_value
          emotion_prediction_values[3] += increment_value
          emotion_prediction_values[9] += increment_value
          emotion_prediction_values[10] += increment_value
          emotion_prediction_values[11] += increment_value
          emotion_prediction_values[12] += increment_value
          emotion_prediction_values[14] += increment_value
          emotion_prediction_values[16] += increment_value
          emotion_prediction_values[19] += increment_value
          emotion_prediction_values[24] += increment_value
          emotion_prediction_values[25] += increment_value

  emotion_prediction = []

  for i in range(28):
      if emotion_prediction_values[i] >= 0.25:
        emotion_prediction.append(emotion_labels[i])

  return emotion_prediction

def model_predict_all(text):
  prediction_texts = list(filter(None, text.split(".")))

  all_predictions = []

  for prediction_text in prediction_texts:
    all_predictions.append(model_predict(prediction_text))

  return most_common_emotions(all_predictions)

In [36]:
#@title **Run Prediction Function On Trial Data**

user_data = pandas.read_csv("/content/drive/MyDrive/DapperData/userdata.csv", header = None, names=["text", "prediction #1", "prediction #2", "prediction #3"])

for index, row in user_data.iterrows():
  user_data.loc[index, "prediction #1"], user_data.loc[index, "prediction #2"], user_data.loc[index, "prediction #3"] = model_predict_all(row["text"])

In [37]:
user_data.to_csv("/content/drive/MyDrive/DapperData/processed_user_data.csv")