
# Data Preprocessing

This notebook processes the Twitter customer support corpus from Kaggle.

First of all you need to [download the original corpus](https://www.kaggle.com/thoughtvector/customer-support-on-twitter/activity) and save the "twcs.csv" file into the data/twitter/ folder.


In [None]:
import pandas as pd
# Load Data
kaggle_df = pd.read_csv("../data/twitter/twcs.csv")

In [None]:
pd.options.display.max_colwidth = 500
kaggle_df.head()

In [None]:
len(kaggle_df)

For simplicity in the next steps, let's convert the CSV data into a dictionary that maps tweet ids to tweets.

In [None]:
from tqdm import tqdm_notebook as tqdm
def csv2dict(dataframe):
    """Index the rows of ``dataframe`` by tweet id.

    Every row is unpacked positionally into exactly seven values (tweet id,
    author id, inbound, created_at, text, response_tweet_id,
    in_response_tweet_id), so the frame must provide seven columns in that
    order.

    Returns a dict that maps each tweet id to a dict with the six remaining
    fields of its row.
    """
    id2tweet = {}
    rows = tqdm(dataframe.iterrows(), total=len(dataframe))
    for _, record in rows:
        (tweet_id, author_id, inbound, created_at, text,
         response_tweet_id, in_response_tweet_id) = record
        id2tweet[tweet_id] = dict(
            author_id=author_id,
            inbound=inbound,
            created_at=created_at,
            text=text,
            response_tweet_id=response_tweet_id,
            in_response_tweet_id=in_response_tweet_id,
        )
    return id2tweet

In [None]:
id2tweet = csv2dict(kaggle_df)

In [None]:
id2tweet[119237]

Following the preprocessing done by  [Hardalov et al. (2018)](https://arxiv.org/abs/1809.00303) we will select only the tweets related to the Apple support. 

In [None]:
def get_apple_tweets(id2tweet):
    """Collect every tweet whose ``author_id`` equals ``"AppleSupport"``.

    Each returned entry is a new dict holding the tweet id under
    ``"tweet_id"`` followed by all fields of the original tweet body.
    """
    return [
        {"tweet_id": tweet_id, **fields}
        for tweet_id, fields in tqdm(id2tweet.items())
        if fields["author_id"] == "AppleSupport"
    ]

In [None]:
apple_tweets = get_apple_tweets(id2tweet)

Each Apple tweet is a possible answer. We now only need to get the context that triggered that answer.

Note: We stop extending the context once it reaches 150 tokens (the last tweet added may push it past that limit).

In [None]:
from nltk.tokenize import TweetTokenizer
import numpy as np

def get_tweet_context(tweets, id2tweet):
    """Pair each tweet with the conversation that led to it.

    For every tweet in ``tweets`` the reply chain is followed backwards
    through ``in_response_tweet_id``. The text of each earlier tweet is
    tokenized and appended to the context, followed by an ``"eottoken"``
    marker, so the most recent tweet comes first in the context.

    Returns a list of dicts with the keys ``context`` (space-joined tokens),
    ``answer`` (text of the starting tweet), ``label`` (always 1) and
    ``created_at``. Tweets for which no context could be collected are left
    out.
    """
    tokenizer = TweetTokenizer()
    qa_pairs = []
    for answer_tweet in tqdm(tweets):
        history = []
        cursor = answer_tweet
        while True:
            # NOTE: 150 is a soft cap -- it is checked before a whole tweet is
            # appended, so the final context can be longer than 150 tokens.
            if len(history) >= 150:
                break
            # a NaN parent id marks the start of the conversation
            if np.isnan(cursor["in_response_tweet_id"]):
                break
            try:
                parent = id2tweet[cursor["in_response_tweet_id"]]
                parent_tokens = tokenizer.tokenize(parent["text"])
            except KeyError:
                # the referenced tweet cannot be looked up: stop walking
                break
            history.extend(parent_tokens)
            history.append("eottoken")
            cursor = parent
        # in this corpus we have some answers that have no context. We will not consider those
        if not history:
            continue
        qa_pairs.append({
            "context": ' '.join(history),
            "answer": answer_tweet["text"],
            "label": 1,
            "created_at": answer_tweet["created_at"],
        })
    return qa_pairs

In [None]:
tweet_pairs = get_tweet_context(apple_tweets, id2tweet)

In [None]:
len(tweet_pairs)

Several of these Apple tweets redirect the user to DMs and do not provide a clear answer. For this reason we will try to exclude them.

In [None]:
def filter_redirected_tweets(qa_pairs):
    """Keep only the pairs whose answer does not contain the substring ``"DM"``."""
    return [candidate for candidate in qa_pairs if "DM" not in candidate["answer"]]

def filter_nonenglish_tweets(qa_pairs):
    """Drop pairs that contain an "English-only support" notice.

    A pair is discarded when either of the two notice sentences occurs in its
    ``answer`` or in its ``context``; all other pairs are returned in their
    original order.
    """
    notices = (
        "We offer support via Twitter in English",
        "Twitter support is available in English",
    )
    kept = []
    for pair in qa_pairs:
        has_notice = any(
            notice in pair[field]
            for notice in notices
            for field in ("answer", "context")
        )
        if not has_notice:
            kept.append(pair)
    return kept

In [None]:
filtered_pairs = filter_nonenglish_tweets(filter_redirected_tweets(tweet_pairs))
len(filtered_pairs)

In [None]:
filtered_pairs[3:5]

Similar to what [Hardalov et al. (2018)](https://arxiv.org/abs/1809.00303) reported, we obtain 49k Context/Answer pairs!

With these pairs we can start preparing our data to feed our models!

# Text clean up

In [None]:
import re

def remove_URL(s):
    """Replace every URL in ``s`` with the placeholder ``toberepalcedwithurltoken``.

    A URL is matched as ``http://`` or ``https://``, a host, a ``/`` and an
    optional alphanumeric path segment. URLs without a ``/`` after the host
    are left untouched.
    """
    # Raw string: "\w" and "\d" are invalid escape sequences in a plain string
    # literal and trigger a warning on recent Python versions.
    return re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/[a-zA-Z0-9]*', 'toberepalcedwithurltoken', s)

def remove_user_id(s):
    """Swap every numeric user mention (``@`` followed by digits) for ``useridtoken``."""
    user_mention = re.compile('@[0-9]+')
    return user_mention.sub('useridtoken', s)

def remove_apple_id(s):
    """Swap every ``@AppleSupport`` mention for ``appleidtoken``."""
    return re.sub(pattern='@AppleSupport', repl='appleidtoken', string=s)

def normalize_string(s):
    """Mask URLs and mentions, lowercase the text and insert the special tags.

    URLs, numeric user mentions and ``@AppleSupport`` are first replaced by
    lowercase placeholder words so that they survive lowercasing; afterwards
    the placeholders (and ``eottoken``) are turned into the upper-case tags
    ``_URL_``, ``_USERID_``, ``_APPLE_`` and ``_EOT_``.
    """
    masked = remove_apple_id(remove_user_id(remove_URL(s)))
    text = masked.lower()
    placeholder_tags = (
        (r'toberepalcedwithurltoken', '_URL_'),
        (r'useridtoken', '_USERID_'),
        (r'appleidtoken', '_APPLE_'),
        (r'eottoken', '_EOT_'),
    )
    for placeholder, tag in placeholder_tags:
        text = re.sub(placeholder, tag, text)
    return text

In [None]:
for pair in tqdm(filtered_pairs):
    pair["context"] = normalize_string(pair["context"])
    pair["answer"] = normalize_string(pair["answer"])

In [None]:
filtered_pairs[3:5]

# Corpus analysis

In [None]:
def corpus_analysis(pairs):
    """Compute token-length statistics for the contexts and answers in ``pairs``.

    Args:
        pairs: sequence of dicts with a ``"context"`` and an ``"answer"`` string.

    Returns:
        A tuple ``(avg_context_len, avg_answer_len, max_context_len,
        max_answer_len, min_context_len, min_answer_len)``. For an empty
        ``pairs`` every statistic is 0.
    """
    if not pairs:
        # nothing to measure; also avoids dividing by zero below
        return 0.0, 0.0, 0, 0, 0, 0
    tokenizer = TweetTokenizer()
    # we only need to tokenize the answer because the context was already
    # tokenized and a simple string split will do the work
    question_lengths = [len(sample["context"].split()) for sample in pairs]
    answer_lengths = [len(tokenizer.tokenize(sample["answer"])) for sample in pairs]
    # min()/max() over the real lengths instead of fixed sentinels, so the
    # minimum is correct even when every length exceeds 1000
    return (
        sum(question_lengths) / len(pairs),
        sum(answer_lengths) / len(pairs),
        max(question_lengths),
        max(answer_lengths),
        min(question_lengths),
        min(answer_lengths),
    )

In [None]:
avg_q_words, avg_a_words, q_max, a_max, q_min, a_min = corpus_analysis(filtered_pairs)
avg_q_words, avg_a_words, q_max, a_max, q_min, a_min, len(filtered_pairs)

As you can see, the minimum length for the question is 0... We will filter out those questions because they lack context.

# Data Split

Let's split the QA pairs into train, validation and test sets. For the validation and test sets we will use tweets from the last 5 days of this dataset (Note: The dataset contains only tweets until 03 Dec 2017)


In [None]:
# Hold out the most recent tweets (all of December plus the November days
# after the 29th) for validation/test; everything else is training data.
train = []
test_pairs = []
for pair in filtered_pairs:
    # created_at is split on whitespace: field 1 is the month, field 2 the day
    date_fields = pair["created_at"].split()
    month = date_fields[1]
    # compare the day as a number so the check does not rely on zero-padding
    if month == "Dec" or (month == "Nov" and int(date_fields[2]) > 29):
        test_pairs.append(pair)
    else:
        train.append(pair)

# First half -> dev, remainder -> test. Slicing from dev_size (instead of
# -dev_size) keeps the middle sample when the held-out count is odd.
dev_size = int(0.5 * len(test_pairs))
dev = test_pairs[:dev_size]
test = test_pairs[dev_size:]

In [None]:
len(train), len(test_pairs), len(dev), len(test)

# Negative sampling
Now that we have our pairs split into different sets we can build negative examples to train the Dual Encoders.

Similar to the Pinterest data, in order to avoid correct answers with negative labels, we will compare the answers in a TF-IDF space when building the pairs.


Example of common answers:
- _USERID_ Here’s what you can do to work around the issue until it’s fixed in a future software update: _URL_
- _USERID_  An update has been released to assist with this issue. If you haven’t yet, please back up your device and update it to iOS 11.1.1. For more info, check out: _URL_
- _USERID_ iOS 11.1.1 was recently released and it includes a fix for autocorrect issues. Be sure to back up your device prior to updating, and let us know if the issue persists afterwards. How to back up: _URL_

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, strip_accents='ascii')

In [None]:
vectorizer.fit([sample["answer"] for sample in train])

In [None]:
np.random.seed(42)
def generate_encoders_data(pairs, vectorizer, racio=1):
    """Build labelled (context, answer) samples for the dual encoders.

    For every pair one positive sample (``label`` 1) is emitted, followed by
    ``racio`` negative samples (``label`` 0) that combine the same context
    with the answer of a randomly drawn pair. A random answer is accepted only
    when its cosine similarity to the true answer, in the space of
    ``vectorizer``, is below 0.85.

    Args:
        pairs: list of dicts with ``"context"`` and ``"answer"`` strings.
        vectorizer: fitted vectorizer exposing ``transform``.
        racio: number of negative samples per positive one.

    Returns:
        List of dicts with the keys ``label``, ``context`` and ``answer``.

    Note:
        Sampling retries until enough dissimilar answers are found, so the
        loop does not terminate if no answer in ``pairs`` is below the
        similarity threshold (e.g. a single pair).
    """
    encoders_data = []
    if not pairs:
        return encoders_data
    # Vectorize every answer once instead of twice per sampling attempt.
    answer_vecs = vectorizer.transform([pair["answer"] for pair in pairs])
    for i, pair in enumerate(tqdm(pairs)):
        encoders_data.append({"label": 1, "context": pair["context"], "answer": pair["answer"]})
        count = 0
        while count < racio:
            random_idx = np.random.randint(0, len(pairs))
            # one-row slices keep both operands two-dimensional
            similarity = cosine_similarity(
                answer_vecs[i:i + 1], answer_vecs[random_idx:random_idx + 1]
            )[0, 0]
            if similarity < 0.85:
                encoders_data.append(
                    {"label": 0, "context": pair["context"], "answer": pairs[random_idx]["answer"]}
                )
                count += 1
    return encoders_data

In [None]:
train_de_data = generate_encoders_data(train, vectorizer)
dev_de_data = generate_encoders_data(dev, vectorizer)
test_de_data = generate_encoders_data(test, vectorizer)

In [None]:
# Run this code if you want to save the data as JSON for inspection.
"""
import json
with open('../data/twitter/de-train.json', 'w') as outfile:
    json.dump(train_de_data, outfile)

with open('../data/twitter/de-dev.json', 'w') as outfile:
    json.dump(dev_de_data, outfile)

with open('../data/twitter/de-test.json', 'w') as outfile:
    json.dump(test_de_data, outfile)
"""

## Ranking Data
With the Dev and Test sets we will also create the data for the ranking task.

In [None]:
positive_samples = dev + test

In [None]:
len(positive_samples) # number of positive examples that will be used to create the ranking data.

In [None]:
# Every ranking batch holds the context, its true answer (candidate 0) and
# nine answers drawn at random from the positive samples.
ranking_data = []
pool_size = len(positive_samples)
for positive in positive_samples:
    context = positive["context"]
    candidates = [positive["answer"]]
    distractor_idxs = np.random.randint(0, pool_size, 9)
    candidates.extend(positive_samples[idx]["answer"] for idx in distractor_idxs)
    ranking_data.append({"context": context, "candidates": candidates})

In [None]:
ranking_data[2]["context"], ranking_data[2]["candidates"][0], ranking_data[2]["candidates"][1]

In [None]:
# json is imported here because, when the cells run in order, no earlier cell
# imports it (the only previous import sits inside a string literal).
import json

with open('../data/twitter/ranking.json', 'w') as outfile:
    json.dump(ranking_data, outfile)

### Checkpoint
Load all the data that was computed in the cells above.

In [None]:
import json
def _load_json(path):
    """Read one JSON checkpoint file and close the handle afterwards."""
    with open(path, 'r') as infile:
        return json.load(infile)


train_de_data = _load_json('../data/twitter/de-train.json')
dev_de_data = _load_json('../data/twitter/de-dev.json')
test_de_data = _load_json('../data/twitter/de-test.json')
ranking_data = _load_json('../data/twitter/ranking.json')

# Tokenization
Now that we have our QA pairs formed, filtered and split into different sets, we can start building our vocabulary and tokenize the documents.

In [None]:
import os
# exist_ok makes this a no-op when the folder is already there, without a
# separate existence check.
os.makedirs('../data/twitter/tmp/', exist_ok=True)

In [None]:
def tokenize_texts(pairs):
    """Split every sample into context tokens, answer tokens and its label.

    Contexts were already tokenized when they were built, so a whitespace
    split is enough for them; answers go through ``TweetTokenizer``.

    Returns ``(context_tokens, answer_tokens, labels)`` where the first two
    are lists of token lists and ``labels`` is a NumPy array.
    """
    tokenizer = TweetTokenizer()
    c_toks, a_toks, labels = [], [], []
    for sample in pairs:
        c_toks.append(sample["context"].split())
        a_toks.append(tokenizer.tokenize(sample["answer"]))
        labels.append(sample["label"])
    return c_toks, a_toks, np.array(labels)

In [None]:
trn_c_toks, trn_a_toks, trn_y = tokenize_texts(train_de_data)
dev_c_toks, dev_a_toks, dev_y = tokenize_texts(dev_de_data)
test_c_toks, test_a_toks, test_y = tokenize_texts(test_de_data)

In [None]:
print (trn_c_toks[0])

#### Vocabulary & Vectorization
Now that we have everything tokenized we can build our vocabulary and vectorize everything.

In [None]:
from collections import Counter
def build_vocabulary(tokenized_trn_docs, max_vocab = 60000, min_freq = 1):
    """Count tokens and derive a word -> index vocabulary.

    Only the ``max_vocab`` most common tokens are considered, and of those
    only the ones whose count is strictly greater than ``min_freq`` (the
    single-space token is always skipped). The special tokens ``_PAD_``,
    ``_UNK_``, ``_BOS_`` and ``_EOS_`` take the first four positions.

    Returns ``(freq, words_list, vocabulary)``: the token counter, the ordered
    word list and the dict mapping each word to its index.
    """
    freq = Counter()
    for doc in tokenized_trn_docs:
        for token in doc:
            freq[token] += 1
    special_tokens = ["_PAD_", "_UNK_", "_BOS_", "_EOS_"]
    # most_common() keeps the vocabulary ordered by frequency
    frequent_words = [
        word
        for word, count in freq.most_common(max_vocab)
        if count > min_freq and word != ' '
    ]
    words_list = special_tokens + frequent_words
    vocabulary = {}
    for word in words_list:
        vocabulary[word] = len(vocabulary)
    return freq, words_list, vocabulary
frequencies, words_list, vocabulary = build_vocabulary(trn_c_toks+trn_a_toks)

In [None]:
"vocabulary size: {}".format(len(vocabulary))

In [None]:
"Top 10 most common words:", frequencies.most_common(10)

In [None]:
print("From {} questions and {} answers ({} documents) the vocabulary size is {}".format(len(trn_c_toks), len(trn_a_toks), len(trn_c_toks+trn_a_toks), len(vocabulary)))

In [None]:
print (words_list[:10])

In [None]:
import pickle
# Persist the word -> index mapping; the context manager closes the file.
with open('../data/twitter/tmp/word2ix.pkl', 'wb') as vocab_file:
    pickle.dump(vocabulary, vocab_file)

# Vectorization
With our vocabulary and our documents split into tokens we can represent our documents as arrays in which each entry represents the index of a specific word in our vocabulary.

In [None]:
def vectorize(docs, vocab):
    """Map tokenized documents to lists of vocabulary indices.

    Args:
        docs: iterable of token lists.
        vocab: dict mapping tokens to indices; must contain ``"_UNK_"`` if any
            token is missing from it.

    Returns:
        A 1-D NumPy object array with one Python list of indices per
        document. Unknown tokens map to ``vocab["_UNK_"]`` and single-space
        tokens are skipped.
    """
    vec_docs = []
    for doc in docs:
        vec_doc = []
        for token in doc:
            if token == ' ':
                continue
            try:
                vec_doc.append(vocab[token])
            except KeyError:
                vec_doc.append(vocab["_UNK_"])
        vec_docs.append(vec_doc)
    # Fill an object array element by element: np.array() on lists of
    # different lengths raises on recent NumPy versions, and on equal lengths
    # it would silently produce a 2-D integer array instead of a list per doc.
    result = np.empty(len(vec_docs), dtype=object)
    for position, vec_doc in enumerate(vec_docs):
        result[position] = vec_doc
    return result

In [None]:
trn_c_vecs = vectorize(trn_c_toks, vocabulary) 
trn_a_vecs = vectorize(trn_a_toks, vocabulary)

In [None]:
dev_c_vecs = vectorize(dev_c_toks, vocabulary)
dev_a_vecs = vectorize(dev_a_toks, vocabulary)

In [None]:
test_c_vecs = vectorize(test_c_toks, vocabulary)
test_a_vecs = vectorize(test_a_toks, vocabulary)

In [None]:
print (trn_c_vecs[0])

#### Save Dual Encoders Data

In [None]:
# The vectors are already NumPy arrays, so they are pickled as they are; each
# file is closed as soon as its split has been written.
with open('../data/twitter/tmp/de_train.pkl', 'wb') as outfile:
    pickle.dump((trn_c_vecs, trn_a_vecs, trn_y), outfile)
with open('../data/twitter/tmp/de_dev.pkl', 'wb') as outfile:
    pickle.dump((dev_c_vecs, dev_a_vecs, dev_y), outfile)
with open('../data/twitter/tmp/de_test.pkl', 'wb') as outfile:
    pickle.dump((test_c_vecs, test_a_vecs, test_y), outfile)

### Prepare the ranking data.

In [None]:
def tokenize_ranking_data(data):
    """Tokenize the context and every candidate answer of each ranking sample.

    Contexts are split on whitespace; candidates go through ``TweetTokenizer``.
    Returns ``(q_toks, c_toks)`` where ``c_toks`` holds, per sample, one token
    list per candidate.
    """
    q_toks = []
    for sample in tqdm(data):
        q_toks.append(sample["context"].split())
    tokenizer = TweetTokenizer()
    c_toks = []
    for sample in tqdm(data):
        c_toks.append([tokenizer.tokenize(candidate) for candidate in sample["candidates"]])
    return q_toks, c_toks

In [None]:
q_toks, c_toks = tokenize_ranking_data(ranking_data)

In [None]:
context_vecs = vectorize(q_toks, vocabulary)
candidate_vecs = [vectorize(candidates, vocabulary) for candidates in c_toks]

In [None]:
e1_inputs = []
e2_inputs = []
for i in range(context_vecs.shape[0]):
    # Repeat the context once per candidate (10 per ranking batch here).
    # np.stack needs a real sequence: recent NumPy versions reject a generator.
    e1_inputs.append(np.stack([context_vecs[i]] * len(candidate_vecs[i])))
    e2_inputs.append(candidate_vecs[i])
print (len(e1_inputs), len(e2_inputs))

In [None]:
# Write the ranking inputs and close the file right afterwards.
with open('../data/twitter/tmp/ranking.pkl', 'wb') as outfile:
    pickle.dump((e1_inputs, e2_inputs), outfile)

## Sequence-to-sequence Model Data
For our sequence-to-sequence model we just need to select the positive samples from the train, dev and test sets and append the BOS and EOS tokens.

In [None]:
# Reload the dual-encoder training split; the file is closed after reading.
with open('../data/twitter/tmp/de_train.pkl', 'rb') as infile:
    trn_q_vecs, trn_a_vecs, trn_y = pickle.load(infile)

In [None]:
idxs = np.nonzero(trn_y)

In [None]:
idxs[0].shape

In [None]:
trn_src_seqs = trn_q_vecs[idxs[0]]
trn_tgt_seqs = trn_a_vecs[idxs[0]]

In [None]:
trn_src_seqs.shape, trn_tgt_seqs.shape

In [None]:
print (trn_src_seqs[0])
print (trn_tgt_seqs[0])

In [None]:
def add_seq2seq_tokens(sequences, bos_token, eos_token):
    """Wrap every sequence with ``bos_token`` in front and ``eos_token`` at the end.

    The sequences are modified in place and the same ``sequences`` container
    is returned, so calling this twice on the same data adds the tokens twice.
    """
    for tokens in sequences:
        tokens.append(eos_token)
        tokens.insert(0, bos_token)
    return sequences

In [None]:
trn_src_seqs = add_seq2seq_tokens(trn_src_seqs, vocabulary["_BOS_"], vocabulary["_EOS_"])
trn_tgt_seqs = add_seq2seq_tokens(trn_tgt_seqs, vocabulary["_BOS_"], vocabulary["_EOS_"])

In [None]:
print (trn_src_seqs[0])
print (trn_tgt_seqs[0])

In [None]:
# Let's do the same for dev: reload the split (closing the file afterwards),
# keep the positive samples and wrap them with the BOS/EOS tokens.
with open('../data/twitter/tmp/de_dev.pkl', 'rb') as infile:
    dev_q_vecs, dev_a_vecs, dev_y = pickle.load(infile)
idxs = np.nonzero(dev_y)
dev_src_seqs = dev_q_vecs[idxs[0]]
dev_tgt_seqs = dev_a_vecs[idxs[0]]
dev_src_seqs = add_seq2seq_tokens(dev_src_seqs, vocabulary["_BOS_"], vocabulary["_EOS_"])
dev_tgt_seqs = add_seq2seq_tokens(dev_tgt_seqs, vocabulary["_BOS_"], vocabulary["_EOS_"])

In [None]:
# Same steps for the test split: reload it (closing the file afterwards),
# keep the positive samples and wrap them with the BOS/EOS tokens.
with open('../data/twitter/tmp/de_test.pkl', 'rb') as infile:
    test_q_vecs, test_a_vecs, test_y = pickle.load(infile)
idxs = np.nonzero(test_y)
test_src_seqs = test_q_vecs[idxs[0]]
test_tgt_seqs = test_a_vecs[idxs[0]]
test_src_seqs = add_seq2seq_tokens(test_src_seqs, vocabulary["_BOS_"], vocabulary["_EOS_"])
test_tgt_seqs = add_seq2seq_tokens(test_tgt_seqs, vocabulary["_BOS_"], vocabulary["_EOS_"])

We can now safely save our inputs and output sequences to the tmp folder.

In [None]:
# Write each seq2seq split to its own pickle; every file is closed as soon as
# its split has been dumped.
with open('../data/twitter/tmp/seq2seq_train.pkl', 'wb') as outfile:
    pickle.dump((trn_src_seqs, trn_tgt_seqs), outfile)
with open('../data/twitter/tmp/seq2seq_dev.pkl', 'wb') as outfile:
    pickle.dump((dev_src_seqs, dev_tgt_seqs), outfile)
with open('../data/twitter/tmp/seq2seq_test.pkl', 'wb') as outfile:
    pickle.dump((test_src_seqs, test_tgt_seqs), outfile)

### The End