In [None]:
import keras
import nltk
import pandas as pd
import numpy as np
import re

In [None]:
# Load the labelled pairs table; later cells read its `question`, `answer` and `label` columns.
df = pd.read_csv("qa_paris.csv")

In [None]:
from bert_serving.client import BertClient
# Shared client object behind every `bc.encode(...)` call in this notebook (default arguments).
bc = BertClient()

In [None]:
# Embed both text columns with the BERT client, then report the resulting array shapes.
question_texts = df["question"].tolist()
answer_texts = df["answer"].tolist()
q_vecs = bc.encode(question_texts)
a_vecs = bc.encode(answer_texts)
for vecs in (q_vecs, a_vecs):
    print(vecs.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Target is the pair label; features are the question and answer embeddings side by side.
y = df["label"].values
x = np.concatenate((q_vecs, a_vecs), axis=1)
# Hold out 20% as a dev set, stratified on the label, with a fixed seed.
x_trn, x_dev, y_trn, y_dev = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=66
)
print(x_trn.shape)

In [228]:
# Baseline 1: random forest on the concatenated embeddings.
# random_state pins the forest's internal sampling so the dev-set metrics printed
# further down are reproducible from one run to the next.
rfc = RandomForestClassifier(n_estimators=40, random_state=66)
rfc.fit(x_trn, y_trn)
y_pred = rfc.predict(x_dev)

In [None]:
# Baseline 2: logistic regression (C=10) fitted on the training split, scored on the dev split.
lr = LogisticRegression(C=10).fit(x_trn, y_trn)
y_lr = lr.predict(x_dev)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
def get_metrics(y_test, y_predicted):
    """Return (accuracy, precision, recall, f1) for a set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'``, i.e. the
    per-class scores are averaged with weights proportional to class support,
    so they are not the plain positive-class figures.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_predicted : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` in that order.
    """
    weighted = dict(pos_label=None, average='weighted')
    precision, recall, f1 = [
        scorer(y_test, y_predicted, **weighted)
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # fraction of samples whose predicted label matches the true label
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

# Dev-set scores of the random forest (y_pred). The `_tfidf` suffix is only a name:
# the features used above are BERT embeddings.
accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_dev, y_pred)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_tfidf, precision_tfidf, 
                                                                       recall_tfidf, f1_tfidf))

In [None]:
# Dev-set scores of the logistic regression (y_lr); overwrites the variables that held
# the random-forest scores.
accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_dev, y_lr)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_tfidf, precision_tfidf, 
                                                                       recall_tfidf, f1_tfidf))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
y_predicted = lr.predict_proba(x_dev)
# predict_proba columns follow lr.classes_ (sorted labels). The ROC/AUC functions expect
# the score of the positive (greater) label, which is column 1, not column 0.
# Assumes the labels are 0/1 with 1 as the positive class — confirm against the CSV.
probs = y_predicted[:, 1]
# calculate AUC
auc = roc_auc_score(y_dev, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_dev, probs)
# plot no skill
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
# show the plot
plt.show()

In [None]:
# Class balance: number of rows per label value.
df.label.value_counts()

In [None]:
# NOTE: rebinds `df` to a different dataset; from here on `df` no longer refers to the
# CSV loaded at the top of the notebook.
df = pd.read_excel("ys.xlsx")
df.shape

In [None]:
# create negative samples
# Every row of `df` is a positive pair. For each question, draw up to 20 answers at
# random and keep those whose text differs from the question's own answer as negatives.
N_NEG_CANDIDATES = 20
# Dedicated seeded generator so the negative set is the same on every run.
neg_rng = np.random.RandomState(2019)

# .copy() so adding the label column modifies an independent frame rather than a
# slice of `df` (avoids pandas' SettingWithCopyWarning).
df_one = df[['question', 'answer']].copy()
df_one['label'] = 1

question_list = list(df.question)
answer_list = list(df.answer)
# Sampling without replacement cannot draw more items than exist.
n_candidates = min(N_NEG_CANDIDATES, len(answer_list))

zero_questions = []
zero_answers = []
for question, true_answer in zip(question_list, answer_list):
    sampled_answers = neg_rng.choice(answer_list, n_candidates, replace=False)
    for candidate in sampled_answers:
        if candidate != true_answer:
            zero_questions.append(question)
            zero_answers.append(candidate)

df_zero = pd.DataFrame({'question': zero_questions,
                       'answer': zero_answers,
                       'label': 0})

In [None]:
# Stack positives and sampled negatives into one frame with a fresh 0..n-1 index.
df_total = pd.concat([df_one, df_zero], ignore_index=True)
df_total.shape

In [None]:
# split some for testing
from sklearn.model_selection import train_test_split

# Passing the frame once is enough: train_test_split returns a (train, test) pair for
# each array it is given, so there is nothing to discard.
df_trn, df_te = train_test_split(
    df_total, stratify=df_total.label,
    test_size=0.15, random_state=99
)
print(f"there are {df_trn.shape[0]} of training samples")
print(f"there are {df_te.shape[0]} of testing samples")

In [None]:
# get the unique answers for prediction use
# (distinct answer texts across positives and negatives, in order of first appearance)
unique_answers = df_total.answer.unique()

In [109]:
# Number of distinct answers, i.e. the size of the candidate pool ranked at prediction time.
unique_answers.shape

(281,)

In [None]:
# over sample positive samples
# NOTE: this cell rebinds df_trn, so running it more than once keeps multiplying the
# positives; re-run the split cell first if it has to be repeated.
trn_ones = df_trn[df_trn['label'] == 1]
# add 6 extra copies of every positive training row
trn_oversample = pd.concat([trn_ones] * 6, axis=0, ignore_index=True)
df_trn = pd.concat([df_trn, trn_oversample], axis=0, ignore_index=True)
# after adding positive samples, shuffle with a fixed seed so the saved row order is reproducible
df_trn = df_trn.sample(frac=1.0, random_state=99)
print(f"there are {df_trn.shape[0]} of training samples")


In [None]:
# Persist the oversampled, shuffled training frame (the index is written as the first column).
df_trn.to_csv('trn_oversampled.csv')

In [None]:
# Persist the held-out test frame that pairs with the training file written above.
df_te.to_csv('te_corresponed.csv')

In [107]:
# Embed each distinct answer once; row i corresponds to unique_answers[i].
un_ans_embs = bc.encode(unique_answers.tolist())

In [None]:
# use bert to calculate question embeddings and answer embeddings
# The question embeddings must be computed here: later cells (cosine similarity,
# feature stacking) read trn_q_embs / te_q_embs, and with these lines commented out a
# fresh Restart & Run All fails with a NameError.
trn_q_embs = bc.encode(df_trn.question.tolist())
trn_a_embs = bc.encode(df_trn.answer.tolist())
te_q_embs = bc.encode(df_te.question.tolist())
te_a_embs = bc.encode(df_te.answer.tolist())

In [110]:
# Map each distinct answer text to its row number in the unique-answer embedding matrix.
_answer_texts = unique_answers.tolist()
ans_idx = dict(zip(_answer_texts, range(len(_answer_texts))))

In [114]:
import pickle
# Save the answer -> row-index mapping to disk.
with open("ans_to_idx.pkl", "wb") as f:
    pickle.dump(ans_idx, f)

In [240]:
# Sanity check: one embedding row per distinct answer.
len(un_ans_embs)

281

In [157]:
# Plain Python list of the distinct answers, in the same order as the rows of un_ans_embs.
ans_list = unique_answers.tolist()

In [115]:
# Save the distinct-answer embedding matrix to disk.
np.save("unique_ans_embs.npy", un_ans_embs)

In [118]:
# Build per-row answer embeddings by looking each answer up in the precomputed
# unique-answer matrix instead of re-encoding the text.
trn_rows = [ans_idx[answer] for answer in df_trn.answer]
te_rows = [ans_idx[answer] for answer in df_te.answer]
trn_ans_embs = np.take(un_ans_embs, trn_rows, axis=0)
te_ans_embs = np.take(un_ans_embs, te_rows, axis=0)
print(trn_ans_embs.shape)

(7185, 768)


In [122]:
def cos(a, b):
    """Cosine similarity between two 1-D vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of NaN so the value
        can be used directly as a model feature.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom
# One cosine similarity per (question, answer) row, for the training and test sets.
trn_qa_sim = np.array([cos(q_vec, a_vec) for q_vec, a_vec in zip(trn_q_embs, trn_ans_embs)])
te_qa_sim = np.array([cos(q_vec, a_vec) for q_vec, a_vec in zip(te_q_embs, te_ans_embs)])
print(trn_qa_sim.shape)

(7185,)


In [124]:
# Sanity check on the training question-embedding matrix (rows x embedding width).
trn_q_embs.shape

(7185, 768)

In [125]:
# Labels come straight from the frames the embeddings were built from.
trn_y = df_trn["label"].values
te_y = df_te["label"].values
# Feature layout per row: [question embedding | answer embedding | cosine similarity].
trn_x = np.column_stack([trn_q_embs, trn_ans_embs, trn_qa_sim])
print("training data shape: {}".format(str(trn_x.shape)))
te_x = np.column_stack([te_q_embs, te_ans_embs, te_qa_sim])

training data shape: (7185, 1537)


In [None]:
# Save the training and test feature matrices to disk.
np.save("trn_oversampled.npy", trn_x)
np.save("te_corresponed.npy", te_x)

In [None]:
# Save the matching training and test label vectors to disk.
np.save("y_oversampled.npy", trn_y)
np.save("y_corresponed.npy", te_y)

In [128]:
# create model and do the training
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Random forest on the [question | answer | similarity] features.
# random_state makes the fitted forest, and therefore every report derived from it,
# reproducible. NOTE: this rebinds `rfc` and `y_pred` from the earlier baseline cell.
rfc = RandomForestClassifier(n_estimators=100, random_state=99)
rfc.fit(trn_x, trn_y)
y_pred = rfc.predict(te_x)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score
# Per-class precision/recall/F1 of the random forest's hard predictions on the test set.
print(classification_report(te_y, y_pred))

In [185]:
# Class probabilities from the random forest on the test set; column 1 is thresholded in a later cell.
r_rfc_prob = rfc.predict_proba(te_x)

In [None]:
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix (rows = true labels, columns = predictions).
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        When True each row is divided by its sum, so cells show per-class rates.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap for the heat map.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can chain ``.show()``.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each cell's value on top of the heat map. The text colour flips at half
    # of the maximum so it stays readable on both light and dark cells.
    fmt = '.1f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    return plt

In [None]:
# Confusion matrix of the thresholded neural-network predictions on the test set.
# NOTE(review): y_nn_pred_c is only assigned in a later cell (the threshold cell), so this
# cell raises NameError on a fresh top-to-bottom run; it needs to move below that cell.
cm = confusion_matrix(te_y, y_nn_pred_c)
fig = plt.figure(figsize=(10, 10))
plot = plot_confusion_matrix(cm, classes=['Not_pairs','Pairs'], normalize=True, title='Confusion matrix')
plt.show()
print(cm)

In [147]:
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam

def get_model(input_dim=1537):
    """Build and compile the pair classifier: Dense(128, relu) -> Dropout(0.1) -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int, default 1537
        Number of input features per row. The default matches the feature matrix
        built in this notebook; pass another value if the feature layout changes.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        accuracy as the reported metric.
    """
    model = Sequential()
    model.add(Dense(128, input_shape=(input_dim,), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

In [242]:
# NOTE(review): nn_model is created in a later cell, so this cell only works after
# that one has been run.
nn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_29 (Dense)             (None, 128)               196864    
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 129       
Total params: 196,993
Trainable params: 196,993
Non-trainable params: 0
_________________________________________________________________


In [243]:
# Displays the network's raw test-set outputs.
# NOTE(review): y_nn_pred is assigned in a later cell, and this dumps the whole array.
y_nn_pred

array([[0.00000000e+00],
       [0.00000000e+00],
       [2.92363763e-03],
       [0.00000000e+00],
       [1.16229057e-04],
       [0.00000000e+00],
       [0.00000000e+00],
       [6.19620085e-04],
       [0.00000000e+00],
       [4.57183123e-02],
       [0.00000000e+00],
       [0.00000000e+00],
       [1.19209290e-07],
       [0.00000000e+00],
       [2.61962414e-05],
       [2.98768282e-04],
       [0.00000000e+00],
       [1.90168619e-04],
       [3.57627869e-07],
       [2.98023224e-07],
       [0.00000000e+00],
       [0.00000000e+00],
       [3.27825546e-07],
       [3.70368361e-03],
       [0.00000000e+00],
       [0.00000000e+00],
       [2.30967999e-05],
       [8.94069672e-08],
       [2.89275646e-02],
       [0.00000000e+00],
       [2.08616257e-07],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [8.94069672e-08],
       [3.27825546e-07],
       [3.27825546e-07],
       [0.00000000e+00],
       [3.77702713e-03],
       [0.00000000e+00],


In [None]:
# Build a fresh network and train it for 20 epochs in mini-batches of 32.
# NOTE(review): the test split is passed as validation data, so the per-epoch
# validation figures are test-set figures.
nn_model = get_model()
nn_model.fit(trn_x, trn_y, validation_data=(te_x, te_y), 
             batch_size=32, epochs=20)

In [None]:
# Turn both models' positive-class probabilities into hard labels at a low cut-off,
# then print one test-set report per model (network first, forest second).
threshold = 0.1
y_nn_pred_c = np.squeeze(y_nn_pred) > threshold
y_rfc_pred_c = r_rfc_prob[:, 1] > threshold
nn_te_report = classification_report(te_y, y_nn_pred_c)
rfc_te_report = classification_report(te_y, y_rfc_pred_c)
print(nn_te_report)
print("rfc")
print(rfc_te_report)

In [None]:
# Score the training set with both models, for comparison with the test reports.
# These two predictions must be computed here: with the lines commented out the cell
# only works while the variables happen to survive in the kernel.
y_nn_tr = nn_model.predict(trn_x)
y_tr_rfc = rfc.predict_proba(trn_x)

threshold = 0.1
y_nn_tr_c = y_nn_tr.squeeze() > threshold
y_rfc_tr_c = y_tr_rfc[:, 1] > threshold
print(classification_report(trn_y, y_nn_tr_c))
print("rfc")
print(classification_report(trn_y, y_rfc_tr_c))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Weighted precision/recall/F1 of the thresholded network predictions on the test set,
# obtained from the shared get_metrics helper (its accuracy entry is not printed here).
nn_te_scores = get_metrics(te_y, y_nn_pred_c)
print("precision:", nn_te_scores[1])
print("recall:", nn_te_scores[2])
print("f1:", nn_te_scores[3])

In [209]:
# Persist the network in two parts: architecture as JSON, weights as HDF5.
model_json = nn_model.to_json()
with open("nn_sim_1537.json", "w") as jf:
    jf.write(model_json)
nn_model.save_weights("nn_sim_1537.h5")

In [165]:
# Use predict(): Sequential.predict_proba is not available in newer Keras releases,
# and for this single sigmoid output predict() already returns the probabilities.
y_nn_pred = nn_model.predict(te_x)
y_nn_pred.shape

(986, 1)

In [218]:
def embed_sentence(sentence):
    """Return the BERT embedding of one sentence.

    The sentence is sent to the shared ``bc`` client as a one-element batch and
    the first (only) row of the result is returned. No text preprocessing is applied.
    """
    single_batch = [sentence]
    return bc.encode(single_batch)[0]

def prepare_pred_data(emb, ans_embs):
    """Build the model input for scoring one question against every candidate answer.

    Parameters
    ----------
    emb : numpy.ndarray, shape (d,)
        Embedding of the question.
    ans_embs : numpy.ndarray, shape (n, d)
        One embedding per candidate answer.

    Returns
    -------
    numpy.ndarray, shape (n, 2 * d + 1)
        Row i is ``[emb | ans_embs[i] | cos(emb, ans_embs[i])]``, the same layout as
        the training features. An empty ``ans_embs`` yields an empty (0-row) matrix
        instead of a stacking error.
    """
    length = ans_embs.shape[0]
    # Repeat the question embedding once per candidate without a Python-level copy loop.
    q_emb = np.tile(np.asarray(emb), (length, 1))
    # The similarity uses the same cos() helper as the training features so the
    # extra column is computed identically at train and prediction time.
    cos_sim = np.array([cos(emb, ans_embs[i]) for i in range(length)], dtype=float)
    return np.hstack([q_emb, ans_embs, cos_sim.reshape(-1, 1)])

def pred_prob_ranking(emb, ans_embs, model):
    """Score every candidate answer for one question and rank them.

    Parameters
    ----------
    emb : numpy.ndarray
        Question embedding.
    ans_embs : numpy.ndarray
        Candidate answer embeddings, one row per answer.
    model : object
        Fitted classifier. ``predict_proba`` is used when the model has it and
        ``predict`` otherwise. The output may be one column / 1-D (a single
        probability per row) or two columns (per-class probabilities, with
        column 1 taken as the positive class).

    Returns
    -------
    ids : numpy.ndarray
        Candidate indices ordered from highest to lowest score.
    preds : numpy.ndarray, shape (n,)
        Positive-class score of each candidate, in the original candidate order.

    Raises
    ------
    ValueError
        If the model output has a shape other than those described above.
    """
    pred_arr = prepare_pred_data(emb, ans_embs)
    predict = getattr(model, "predict_proba", None) or model.predict
    raw = np.asarray(predict(pred_arr))
    if raw.ndim == 2 and raw.shape[1] == 2:
        preds = raw[:, 1]
    elif raw.ndim == 1 or (raw.ndim == 2 and raw.shape[1] == 1):
        # reshape (not squeeze) keeps a 1-D result even with a single candidate
        preds = raw.reshape(-1)
    else:
        raise ValueError("unsupported prediction shape: %s" % (raw.shape,))
    ids = np.argsort(preds)[::-1]
    return ids, preds

def top_k(question, ans_emb, answers, model, k):
    """Return the ``k`` best-scoring answers for a question.

    Parameters
    ----------
    question : str
        Question text; it is embedded with ``embed_sentence``.
    ans_emb : numpy.ndarray
        Candidate answer embeddings, row-aligned with ``answers``.
    answers : sequence
        Candidate answer texts.
    model : object
        Fitted classifier passed on to ``pred_prob_ranking``.
    k : int
        Maximum number of results. If fewer candidates exist, all of them are
        returned instead of raising IndexError.

    Returns
    -------
    list of dict
        One ``{'rank': r, 'answer': text}`` entry per result, with rank starting
        at 1 for the best candidate. The model score is not included.
    """
    emb = embed_sentence(question)
    ids, preds = pred_prob_ranking(emb, ans_emb, model)
    results = []
    for rank, original_idx in enumerate(ids[:max(k, 0)], start=1):
        results.append({'rank': rank, 'answer': answers[original_idx]})
    return results

In [None]:
# NOTE(review): this encodes the same unique_answers list that un_ans_embs was built from
# earlier, and no visible cell reads bc_ans_embs.
u_answers = unique_answers.tolist()
bc_ans_embs = bc.encode(u_answers)

In [210]:
# Size of the source frame of positive pairs (rows, columns).
df.shape

(314, 3)

In [236]:
# Draw 20% of the distinct positive pairs for the top-k check below.
# NOTE(review): these rows come from df_trn, the frame the models were fitted on, so the
# success rate computed from them is a training-set figure, not a held-out one.
df_trn_one = df_trn[df_trn.label == 1].drop_duplicates()
# random_state fixes which pairs are drawn so the reported rate can be reproduced.
random_test = df_trn_one.sample(frac=0.2, random_state=99)

In [237]:
# Renumber rows 0..n-1 so the loop below can index by position; the old index is kept
# as an extra column. NOTE: re-running this cell adds another index column each time.
random_test = random_test.reset_index()
random_test.shape

(53, 4)

In [238]:
# Top-3 hit count: for each sampled pair, does the true answer appear among the three
# highest-ranked candidates returned by the network?
count = 0
for question, expected in zip(random_test.question, random_test.answer):
    top_3 = top_k(question, un_ans_embs, ans_list, nn_model, 3)
    if any(item['answer'] == expected for item in top_3):
        count += 1
print(count)
print(random_test.shape[0])

48
53


In [221]:
# Fraction of sampled questions whose true answer was in the top 3.
# NOTE: depends on `count` still holding the hit count from the loop above.
print("successful rate: ", count / random_test.shape[0])

successful rate:  0.7872340425531915


In [196]:
# NOTE(review): same statement as the previous cell, left over from an earlier run.
print("successful rate: ", count / random_test.shape[0])

In [198]:
# NOTE(review): q_total is not defined in any visible cell. It is rebuilt here in the same
# train-then-test order in which the question embeddings are stacked — confirm this
# matches the original intent.
q_total = df_trn.question.tolist() + df_te.question.tolist()
# Give every distinct question a compact id in order of first appearance. The running
# id is len(q_idx), so no separate counter overwrites the `count` used for the hit rate.
q_idx = {}
for q in q_total:
    if q not in q_idx:
        q_idx[q] = len(q_idx)
print(len(q_idx))

301


In [205]:
q_emb = np.vstack([trn_q_embs, te_q_embs])
# q_idx values are just 0..n-1, so using them directly as row numbers would pick the
# first n rows of q_emb rather than one row per distinct question. Instead, locate the
# first row at which each question occurs in the stacked matrix.
# Assumes row i of q_emb is the embedding of the i-th question of df_trn followed by
# df_te — TODO confirm against how the embeddings were computed.
stacked_questions = df_trn.question.tolist() + df_te.question.tolist()
first_row = {}
for row, question in enumerate(stacked_questions):
    first_row.setdefault(question, row)
# Order the rows by each question's compact id so that un_q_emb[q_idx[q]] is q's embedding.
# A question missing from the stacked list raises KeyError rather than mis-aligning silently.
q_emb_idx = np.array(
    [first_row[q] for q in sorted(q_idx, key=q_idx.get)], dtype=int
)
un_q_emb = q_emb[q_emb_idx]

In [207]:
# Save the distinct-question embedding matrix together with the question -> id mapping.
np.save("un_q_emb.npy", un_q_emb)
with open("q_to_idx.pkl", "wb") as f:
    pickle.dump(q_idx, f)

In [223]:
# Save the ordered list of distinct answers to disk.
with open("ans_list.pkl", "wb") as f:
    pickle.dump(ans_list, f)

In [224]:
# Shape of the training features from the first experiment at the top of the notebook
# (x_trn, not the trn_x matrix used for the later models).
x_trn.shape

(784, 1536)