In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import math
import random
import pickle
import datetime
import os

from numpy.linalg import norm
from scipy.stats import rankdata

from tensorflow import keras
from tensorflow.keras import layers, optimizers
from tensorflow.keras.layers import Dense, Flatten, Dropout, BatchNormalization
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam, SGD
from keras.utils.vis_utils import plot_model

import matplotlib.pyplot as plt
from itertools import product

import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/W266 Final Project/master/helper_functions')
import similarity_score_dicts
import evaluation

# Load TensorBoard
%load_ext tensorboard

# Load The Embeddings Dictionary

In [3]:
# Load embeddings dictionary from file
bert_embeddings_from_file = open('/content/drive/MyDrive/Colab Notebooks/W266 Final Project/data/embeddings/bert_embeddings_textonly.pickle', 'rb')
loaded_embeddings = pickle.load(bert_embeddings_from_file)

In [4]:
print(len(loaded_embeddings))

90959


# Load The Word2Vec Embeddings Dictionary

In [5]:
#Load Word2Vec code embeddings dictionary from file
embeds_from_file = open('/content/drive/MyDrive/Colab Notebooks/W266 Final Project/data/embeddings/word2vec_embeddings_top50.pickle', 'rb')
word2vec_embeddings_loaded = pickle.load(embeds_from_file)

In [6]:
print("length of dictionary:", len(word2vec_embeddings_loaded.keys()))
print("length of each embedding:", len(word2vec_embeddings_loaded[55661532]))
print("shape of each embedding:", word2vec_embeddings_loaded[55661532].shape)
print(word2vec_embeddings_loaded[55661532])

length of dictionary: 90959
length of each embedding: 13
shape of each embedding: (13,)
[ 0.02465418  0.08412779 -0.03654063 -1.0725514  -1.7244375   0.07843328
  0.24755026  0.899994    0.2392888  -0.91360664 -2.1623673   0.08816142
  1.4228041 ]


# Load The Q-A Dictionary

In [7]:
# Load q_a_dictionaries from file
train_from_file = open('/content/drive/MyDrive/Colab Notebooks/W266 Final Project/data/q_a_dict_train.pickle', 'rb')
train_qa_dict = pickle.load(train_from_file)

# Load q_a_dictionaries from file
dev_from_file = open('/content/drive/MyDrive/Colab Notebooks/W266 Final Project/data/q_a_dict_dev.pickle', 'rb')
dev_qa_dict = pickle.load(dev_from_file)

# Load q_a_dictionaries from file
test_from_file = open('/content/drive/MyDrive/Colab Notebooks/W266 Final Project/data/q_a_dict_test.pickle', 'rb')
test_qa_dict = pickle.load(test_from_file)

# Concatenate Text and Code Embeddings for Questions and Answers

In [8]:
#Take question/answer text embedding and concatenate with question/answer code embedding
#loaded_embeddings dict = BERT embeddings
#word2vec_embeddings_loaded = code embeddings

combined_embeddings = {} #dictionary of concatenated text and code embeddings (868 dimension)

ids = list(loaded_embeddings.keys())

for id in ids:
  text_embedding = loaded_embeddings[id]
  code_embedding = word2vec_embeddings_loaded[id]
  concat = np.append(text_embedding, code_embedding)
  combined_embeddings[id] = concat


In [9]:
print(combined_embeddings[list(combined_embeddings.keys())[0]].shape)

(781,)


# Build train, dev, and test sets for concatenated question and answer embeddings

### Train

In [10]:
#Train set
#Concat question and answer

train_x = []   #list of concatenated embeddings (1736 dimension)
train_y = []   #list of answer labels (1 or 0's)

question_ids = list(train_qa_dict.keys())

for q_id in question_ids:
  for a_id in train_qa_dict[q_id]:
    concat = np.append(combined_embeddings[q_id], combined_embeddings[a_id[0]])
    train_x.append(concat)
    train_y.append(a_id[1])

print("length of train_x:", len(train_x))
print("length of each concatenated embedding:", len(train_x[0]))
print(train_x[0])
print(train_y)

train_x = np.array(train_x)
train_y = np.array(train_y)

length of train_x: 244980
length of each concatenated embedding: 1562
[0.38615552 0.01349972 0.20100826 ... 0.         0.         0.        ]
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

### Dev

In [11]:
#Dev set
#Concat question and answer (repeat for dev set)

dev_x = []   #list of concatenated embeddings (1736 dimension)
dev_y = []   #list of answer labels (1 or 0's)

question_ids = list(dev_qa_dict.keys())

for q_id in question_ids:
  for a_id in dev_qa_dict[q_id]:
    concat = np.append(combined_embeddings[q_id], combined_embeddings[a_id[0]])
    dev_x.append(concat)
    dev_y.append(a_id[1])

print("length of dev_x:", len(dev_x))
print("length of each concatenated embedding:", len(dev_x[0]))
print(dev_x[0])
print(dev_y)

dev_x = np.array(dev_x)
dev_y = np.array(dev_y)

length of dev_x: 30620
length of each concatenated embedding: 1562
[ 0.20946997 -0.07798999  0.24202554 ... -1.8828201  -1.2786789
 -0.98490626]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

### Test

In [12]:
#Test set
#Concat question and answer (repeat for test set)

test_x = []   #list of concatenated embeddings (1736 dimension)
test_y = []   #list of answer labels (1 or 0's)

question_ids = list(test_qa_dict.keys())

for q_id in question_ids:
  for a_id in test_qa_dict[q_id]:
    concat = np.append(combined_embeddings[q_id], combined_embeddings[a_id[0]])
    test_x.append(concat)
    test_y.append(a_id[1])

print("length of dev_x:", len(test_x))
print("length of each concatenated embedding:", len(test_x[0]))
print(test_x[0])
print(test_y)

test_x = np.array(test_x)
test_y = np.array(test_y)

length of dev_x: 30630
length of each concatenated embedding: 1562
[ 0.26147416  0.37381873  0.09622858 ... -0.43459567  1.0623089
  3.1512413 ]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# Build The Model

In [13]:
# Create model
def create_model(num_units):
  model = Sequential()
  model.add(Dense(num_units, input_dim = 1562, activation = 'relu'))
  model.add(Dense(2, activation = 'softmax'))
  return model

In [14]:
# Define hyperparameters
num_units = [1562, 768]
learning_rate = [0.0003, 0.0002, 0.0001]

hyperparams = list(product(num_units, learning_rate))
print(hyperparams)
print(len(hyperparams))

[(1562, 0.0003), (1562, 0.0002), (1562, 0.0001), (768, 0.0003), (768, 0.0002), (768, 0.0001)]
6


In [15]:
# Write hyperparams to a dataframe
hp = {
    'Number of Units': [i[0] for i in hyperparams],
    'Learning Rate': [i[1] for i in hyperparams]
}

hp_df = pd.DataFrame(hp, columns = ['Number of Units', 'Learning Rate'])
hp_df

Unnamed: 0,Number of Units,Learning Rate
0,1562,0.0003
1,1562,0.0002
2,1562,0.0001
3,768,0.0003
4,768,0.0002
5,768,0.0001


In [16]:
for u in num_units:
  model = create_model(u)
  # Plot the model; Pass in custom file path as a parameter
  plot_model(model, to_file = '/content/drive/MyDrive/Colab Notebooks/W266 Final Project/bert/model_bert_concatenated_v3_2/images/model_' + str(u) + '.png', show_shapes=True, show_layer_names=True)
  model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1562)              2441406   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 3126      
Total params: 2,444,532
Trainable params: 2,444,532
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 768)               1200384   
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 1538      
Total params: 1,201,922
Trainable params: 1,201,922
Non-trainable params: 0
_________________________________________________________________


# Train The Model

In [17]:
def train_model(num_units, learning_rate):
  model = create_model(num_units)
  opt = optimizers.Adam(learning_rate = learning_rate)
  model.compile(optimizer = opt, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

  # Modify to use your custom path for tensorboard
  logdir = os.path.join("/content/drive/MyDrive/Colab Notebooks/W266 Final Project/bert/model_bert_concatenated_v3_2/logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
  tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

  model.fit(train_x, train_y, validation_data=(dev_x, dev_y), epochs=1, batch_size=100, callbacks=[tensorboard_callback])

  return model

In [18]:
trained_models = []
count = 0

for params in hyperparams:
  count += 1
  print('Model ' + str(count) + ': [num_units = ' + str(params[0]) + ', learning_rate = ' + str(params[1]) + ']')
  model = train_model(params[0], params[1])
  trained_models.append((model, model.history))
  print()

Model 1: [num_units = 1562, learning_rate = 0.0003]

Model 2: [num_units = 1562, learning_rate = 0.0002]

Model 3: [num_units = 1562, learning_rate = 0.0001]

Model 4: [num_units = 768, learning_rate = 0.0003]

Model 5: [num_units = 768, learning_rate = 0.0002]

Model 6: [num_units = 768, learning_rate = 0.0001]



In [19]:
# Launch TensorBoard (Use your custom path)
%tensorboard --logdir '/content/drive/MyDrive/Colab Notebooks/W266 Final Project/bert/model_bert_concatenated_v3_2/logs/'

<IPython.core.display.Javascript object>

In [20]:
# Add training results into new columns in hyperparameter dataframe

num_epochs = 1 # modify this count to however many epochs you trained on

training_loss = []
training_accuracy = []
val_loss = []
val_accuracy = []

# Loop through the model histories
for hist in trained_models:
  training_loss.append(hist[1].history['loss'][num_epochs - 1])
  training_accuracy.append(hist[1].history['accuracy'][num_epochs - 1])
  val_loss.append(hist[1].history['val_loss'][num_epochs - 1])
  val_accuracy.append(hist[1].history['val_accuracy'][num_epochs - 1])

# Add new loss and accuracy columns to hyperparameter dataframe
hp_df['Training Loss'] = training_loss
hp_df['Training Accuracy'] = training_accuracy
hp_df['Validation Loss'] = val_loss
hp_df['Validation Accuracy'] = val_accuracy

In [22]:
hp_df

Unnamed: 0,Number of Units,Learning Rate,Training Loss,Training Accuracy,Validation Loss,Validation Accuracy
0,1562,0.0003,0.300422,0.899604,0.281182,0.9
1,1562,0.0002,0.301158,0.899514,0.28093,0.900359
2,1562,0.0001,0.30543,0.899486,0.289284,0.90049
3,768,0.0003,0.301359,0.899731,0.280399,0.900196
4,768,0.0002,0.301949,0.899845,0.285041,0.900229
5,768,0.0001,0.305826,0.899947,0.288659,0.900196


In [23]:
question_ids = list(dev_qa_dict.keys())
count = 0

val_mrr = []
val_precision = []
val_recall = []
val_accuracy_adjusted = []

# Loop through each model
for m in trained_models:
  count += 1
  print("Model " + str(count))
  print("----------")

  # Predict on dev data
  preds = m[0].predict(dev_x)

  #Creates a dictionary with keys that are question ids and values that are tuples of answer id and probability
  #{'question_id' => [10('answer_id', probability)]}
  predictions_dict = {}
  prediction_num = 0
  for q_id in question_ids:
    predictions_dict[q_id] = []
    for a_id in dev_qa_dict[q_id]:
      predictions_dict[q_id].append((a_id[0], preds[prediction_num][1]))
      prediction_num += 1
  
  # Compute rankings
  preds_ranking_dict = similarity_score_dicts.compute_rankings(predictions_dict)

  # Compute MRR
  mrr = evaluation.calculate_mrr(dev_qa_dict, preds_ranking_dict)

  # Compute precision, recall, and adjusted accuracy
  try:
    precision, recall, accuracy_adj = evaluation.calculate_metrics(dev_qa_dict, preds_ranking_dict)
  except ZeroDivisionError as e:
    print(e)
    precision = 0
    recall = 0
    accuracy_adj = 0

  val_mrr.append(mrr)
  val_precision.append(precision)
  val_recall.append(recall)
  val_accuracy_adjusted.append(accuracy_adj)
  print()

# Add new metric columns to hyperparameter dataframe
hp_df['Validation MRR'] = val_mrr
hp_df['Validation Precision'] = val_precision
hp_df['Validation Recall'] = val_recall
hp_df['Validation Accuracy Adjusted'] = val_accuracy_adjusted

Model 1
----------
MRR: 0.5500221610525324
TP: 1062
FP: 1995
TN: 25558
FN: 2005
Precision: 0.34739941118743867
Recall: 0.34626671014020216
Accuracy: 0.8693664271717831

Model 2
----------
MRR: 0.5483722642115835
TP: 1059
FP: 1999
TN: 25554
FN: 2008
Precision: 0.3463047743623283
Recall: 0.3452885555917835
Accuracy: 0.8691378184193338

Model 3
----------
MRR: 0.5412190393248507
TP: 1032
FP: 2027
TN: 25526
FN: 2035
Precision: 0.3373651520104609
Recall: 0.33648516465601563
Accuracy: 0.8673416067929458

Model 4
----------
MRR: 0.5459827584419349
TP: 1054
FP: 2004
TN: 25549
FN: 2013
Precision: 0.34466971877043817
Recall: 0.34365829801108577
Accuracy: 0.8688112344872633

Model 5
----------
MRR: 0.5412887624024127
TP: 1034
FP: 2025
TN: 25528
FN: 2033
Precision: 0.33801896044458973
Recall: 0.3371372676882948
Accuracy: 0.867472240365774

Model 6
----------
MRR: 0.5309998133806094
TP: 996
FP: 2062
TN: 25491
FN: 2071
Precision: 0.32570307390451275
Recall: 0.32474731007499186
Accuracy: 0.8650228608

In [24]:
hp_df

Unnamed: 0,Number of Units,Learning Rate,Training Loss,Training Accuracy,Validation Loss,Validation Accuracy,Validation MRR,Validation Precision,Validation Recall,Validation Accuracy Adjusted
0,1562,0.0003,0.300422,0.899604,0.281182,0.9,0.550022,0.347399,0.346267,0.869366
1,1562,0.0002,0.301158,0.899514,0.28093,0.900359,0.548372,0.346305,0.345289,0.869138
2,1562,0.0001,0.30543,0.899486,0.289284,0.90049,0.541219,0.337365,0.336485,0.867342
3,768,0.0003,0.301359,0.899731,0.280399,0.900196,0.545983,0.34467,0.343658,0.868811
4,768,0.0002,0.301949,0.899845,0.285041,0.900229,0.541289,0.338019,0.337137,0.867472
5,768,0.0001,0.305826,0.899947,0.288659,0.900196,0.531,0.325703,0.324747,0.865023


In [25]:
# Write hyperparamter dataframe to csv file
# Modify path to your custom path
hp_df.to_csv(r'/content/drive/MyDrive/Colab Notebooks/W266 Final Project/bert/model_bert_concatenated_v3_2/hp_results.csv', index = False, header = True)

# Save Models

In [26]:
count = 0

# Loop through each model and save using HDF5 format
# Modify path to your custom path
for m in trained_models:
  count += 1
  m[0].save('/content/drive/MyDrive/Colab Notebooks/W266 Final Project/bert/model_bert_concatenated_v3_2/models/model_' + str(count) + '.h5')

# Evaluate Best Model On Test Data



#### Evaluation Metrics

In [27]:
# Choose the best model from results table based on validation MRR, evaluate on test data, and compute metrics
# The second model has the highest MRR score: 0.663436

best_model = trained_models[0][0] # modify first index with the index of the best model in the results table
best_model.evaluate(test_x, test_y)



[0.28158432245254517, 0.8998693823814392]

In [28]:
preds = best_model.predict(test_x)
preds.shape

(30630, 2)

In [29]:
question_ids = list(test_qa_dict.keys())

#Creates a dictionary with keys that are question ids and values that are tuples of answer id and probability
#{'question_id' => [10('answer_id', probability)]}
predictions_dict = {}
prediction_num = 0
for q_id in question_ids:
  predictions_dict[q_id] = []
  for a_id in test_qa_dict[q_id]:
    predictions_dict[q_id].append((a_id[0], preds[prediction_num][1]))
    prediction_num += 1

In [30]:
preds_ranking_dict = similarity_score_dicts.compute_rankings(predictions_dict)

In [31]:
evaluation.calculate_mrr(test_qa_dict, preds_ranking_dict)

MRR: 0.5448380309790686


0.5448380309790686

In [32]:
evaluation.calculate_metrics(test_qa_dict, preds_ranking_dict)

TP: 1037
FP: 2024
TN: 25540
FN: 2029
Precision: 0.3387781770663182
Recall: 0.3382257012393999
Accuracy: 0.8676787463271303


(0.3387781770663182, 0.3382257012393999, 0.8676787463271303)