### Data Labelling
For the first-stage SVM, the Suspicious Conversations Identification (SCI), we need a label for each conversation stating whether or not it is suspicious. A suspicious conversation is defined as a conversation that contains at least one sexual predator. Since we are given the ids of all authors identified as sexual predators, we can write a new CSV file that records, for each conversation (identified by its id), whether it is suspicious (1) or not (0).

In [1]:
import xml.etree.ElementTree as ET
import csv

# Build the SCI (Suspicious Conversations Identification) labels for the
# training set: one CSV row per conversation, `<conversation id>,<label>`,
# where the label is 1 if any author of the conversation is a known predator
# and 0 otherwise.
train_data_path = '.../.../data/svm_training_data/'
training_xml = ET.parse(train_data_path + 'training_data.xml')
root = training_xml.getroot()

pred_id_file = '.../.../data/pan12-sexual-predator-identification-training-corpus-2012-05-01/'
# The predator file holds one author id per line. The context manager makes
# sure the handle is closed once the ids have been read.
with open(pred_id_file + 'pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt', 'r') as file:
    predators_id = file.read().splitlines()
# print(predators_id)

# Set copy of the ids for O(1) membership tests inside the loop below;
# `predators_id` itself is kept as a list in case other cells rely on it.
predator_lookup = set(predators_id)

csv_labels = []
for conversation in root:
    # Unique authors taking part in this conversation. Every child of a
    # conversation is expected to carry an <author> element; a child without
    # one raises AttributeError here.
    authors = {message.find('author').text for message in conversation}

    # Suspicious as soon as at least one participant is a known predator.
    suspicious = not predator_lookup.isdisjoint(authors)
    csv_labels.append([conversation.get('id'), 1 if suspicious else 0])

# print(csv_labels)
# newline='' lets the csv module control line endings itself.
with open(train_data_path + 'sci_labels.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(csv_labels)
print('Done!')

Done!


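Let us do the same thing for the test data, and also collect a few summary statistics along the way (number of conversations, suspicious conversations, authors and predators).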

In [2]:
# Build the SCI labels for the test set and report a few corpus statistics.
test_data_path = '.../.../data/svm_test_data/'
test_data_src = '.../.../data/pan12-sexual-predator-identification-test-corpus-2012-05-21/'
test_xml = ET.parse(test_data_src + 'pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
root = test_xml.getroot()

# Ground-truth predator ids: the first column of every row in the
# ground-truth file becomes a key (the value 1 is only a placeholder).
all_predators = {}
with open(test_data_src + 'pan12-sexual-predator-identification-groundtruth-problem1.txt', 'r') as f:
    for row in csv.reader(f):
        all_predators[row[0]] = 1

# metric
test_num_conv = len(root)
test_num_predators = len(all_predators)
test_num_suspicious_conv = 0
test_num_authors = 0

csv_labels = []
authors = {}  # every distinct author seen anywhere in the test corpus
for conversation in root:
    suspicious = False
    for message in conversation:
        author = message.find('author').text
        # Record the author once; later sightings leave the entry untouched.
        authors.setdefault(author, 1)
        # A single predator message is enough to flag the conversation.
        suspicious = suspicious or author in all_predators

    label = 1 if suspicious else 0
    csv_labels.append([conversation.get('id'), label])
    test_num_suspicious_conv += label

# print(csv_labels)
with open(test_data_path + 'sci_labels.csv', 'w', newline='') as f:
    csv.writer(f).writerows(csv_labels)

test_num_authors = len(authors)
summary_template = "Num of Conv: {}\r\nNum of suspicious conv: {}\r\nNum authors: {}\r\nNum predators: {}"
print(summary_template.format(
    test_num_conv,
    test_num_suspicious_conv,
    test_num_authors,
    test_num_predators,
))

Num of Conv: 155128
Num of suspicious conv: 3737
Num authors: 218702
Num predators: 254
