In [278]:
import os
import json
import re
import pandas as pd
from transformers import BertModel, BertTokenizerFast, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim

# Directories holding the raw tweet objects, stored as one `<tweet_id>.json` file per tweet.
train_data_path = './train_dev_data/train/'
dev_data_path = './train_dev_data/dev/'
# NOTE: unlike the two paths above, this one has no trailing slash.
test_data_path = './project-data/tweet-objects'

In [259]:
def _read_labels(path):
    """Return the label found on each line of `path`, in file order.

    Only the line terminator is removed, so a final line without a trailing
    newline keeps its last character (slicing with `[:-1]` would cut it off).
    Blank lines are kept as empty labels so that positions stay aligned with
    the corresponding lines of the event-id files.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\r\n") for line in f]


# Labels are positional: entry i belongs to line i of the matching *.data.txt file.
labels_train = _read_labels("./project-data/train.label.txt")
train_events = len(labels_train)
train_rumour = labels_train.count('rumour')
# Every label that is not exactly 'rumour' is counted as non-rumour.
train_non_rumour = train_events - train_rumour

labels_dev = _read_labels("./project-data/dev.label.txt")
dev_events = len(labels_dev)
dev_rumour = labels_dev.count('rumour')
dev_non_rumour = dev_events - dev_rumour

print("There are", train_events, "events in the training set (", train_rumour, "rumour", ",", train_non_rumour, " non_rumour )")
print("There are", dev_events, "events in the development set (", dev_rumour, "rumour", ",", dev_non_rumour, " non_rumour )")

There are 1895 events in the training set ( 420 rumour , 1475  non_rumour )
There are 632 events in the development set ( 139 rumour , 493  non_rumour )


In [282]:
def _read_event_ids(path):
    """Return one list of tweet ids per line of the comma-separated file `path`.

    Only the line terminator is removed before splitting, so the last id of a
    file that does not end with a newline is not truncated (slicing with
    `[:-1]` would drop its final digit).
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\r\n").split(",") for line in f]


# events = lists of all events; within each event the first id is the source id
events_train = _read_event_ids("./project-data/train.data.txt")
events_dev = _read_event_ids("./project-data/dev.data.txt")
events_test = _read_event_ids("./project-data/test.data.txt")

In [261]:
# One record per event: its position, its tweet ids and the label found at the
# same position in the label list.
train_tweet_events = [
    {'index': position, 'ids': tweet_ids, 'is_rumour': labels_train[position]}
    for position, tweet_ids in enumerate(events_train)
]

# Same layout for the development split.
dev_tweet_events = [
    {'index': position, 'ids': tweet_ids, 'is_rumour': labels_dev[position]}
    for position, tweet_ids in enumerate(events_dev)
]

In [264]:
def feature_extraction(tweet):
    """Return the cleaned, lower-cased text of a tweet object.

    Cleaning steps, in order: remove @mentions, lower-case, remove
    http/https URLs, turn line breaks into spaces, delete every character
    that is neither a word character nor whitespace, and trim both ends.
    Only the text is used; no other tweet field is read.

    Args:
        tweet: mapping with a 'text' entry holding the tweet body.

    Returns:
        The cleaned text. It is an empty string when the tweet contained
        nothing but mentions, URLs, punctuation or whitespace, so callers
        can skip such tweets with a plain truthiness check.

    Raises:
        KeyError: if `tweet` has no 'text' entry.
    """
    text = tweet['text']
    # Raw-string patterns: '\S' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'@\S+', '', text).lower()  # remove @mention
    text = re.sub(r'https?://\S+', '', text)  # remove url
    text = re.sub(r'\n+', ' ', text)          # remove \n
    text = re.sub(r'\r+', ' ', text)          # remove \r
    text = re.sub(r'[^\w\s]', '', text)       # remove punctuation / symbols

    # Trim last: deleting punctuation can expose leading/trailing blanks
    # (e.g. "wow !" -> "wow "), and punctuation-only text must come out empty.
    return text.strip()

In [266]:
# Build one "[CLS] text [SEP] text [SEP] ..." string per training event.
train_invalid_events = []  # positions of events for which no tweet object could be loaded
train_events_data = []     # (event text, label) pairs

for index, event in enumerate(train_tweet_events):
    tweet_objects = []

    for tweet_id in event['ids']:
        tweet_path = os.path.join(train_data_path, tweet_id + '.json')
        try:
            # Explicit encoding: with a platform default other than UTF-8, a decode
            # error would otherwise silently drop the tweet.
            with open(tweet_path, encoding='utf-8') as tweet_file:
                for line in tweet_file:
                    tweet_objects.append(json.loads(line))
        except (OSError, ValueError):
            # Best effort: skip tweets whose file is missing, unreadable or not valid
            # JSON (json.JSONDecodeError and UnicodeDecodeError are ValueErrors).
            # Anything else, e.g. KeyboardInterrupt, is no longer swallowed.
            continue

    if not tweet_objects:
        train_invalid_events.append(index)
        continue

    event_list = "[CLS] "
    for tweet in tweet_objects:
        data = feature_extraction(tweet)
        if data:  # tweets with no text left after cleaning add nothing
            event_list += data + " [SEP] "

    train_events_data.append((event_list, event['is_rumour']))

print(len(train_events_data), "events written to training input file, (", len(train_invalid_events), "invalid events )")

1806 events written to training input file, ( 89 invalid events )


In [267]:
# Build one "[CLS] text [SEP] text [SEP] ..." string per development event.
dev_invalid_events = []  # positions of events for which no tweet object could be loaded
dev_events_data = []     # (event text, label) pairs

for index, event in enumerate(dev_tweet_events):
    tweet_objects = []

    for tweet_id in event['ids']:
        tweet_path = os.path.join(dev_data_path, tweet_id + '.json')
        try:
            # Explicit encoding: with a platform default other than UTF-8, a decode
            # error would otherwise silently drop the tweet.
            with open(tweet_path, encoding='utf-8') as tweet_file:
                for line in tweet_file:
                    tweet_objects.append(json.loads(line))
        except (OSError, ValueError):
            # Best effort: skip tweets whose file is missing, unreadable or not valid
            # JSON (json.JSONDecodeError and UnicodeDecodeError are ValueErrors).
            # Anything else, e.g. KeyboardInterrupt, is no longer swallowed.
            continue

    if not tweet_objects:
        dev_invalid_events.append(index)
        continue

    event_list = "[CLS] "
    for tweet in tweet_objects:
        data = feature_extraction(tweet)
        if data:  # tweets with no text left after cleaning add nothing
            event_list += data + " [SEP] "

    dev_events_data.append((event_list, event['is_rumour']))

print(len(dev_events_data), " events written to development input file, (", len(dev_invalid_events), "invalid events )")

594  events written to development input file, ( 38 invalid events )


In [268]:
# Persist the assembled (event text, label) pairs of both splits as CSV files.
header = ["Event Tweets", "Label"]

train_csv = pd.DataFrame(train_events_data, columns=header)
dev_csv = pd.DataFrame(dev_events_data, columns=header)

# index=False keeps the DataFrame row index out of the written files.
train_csv.to_csv('./train_dev_data/train.csv', index=False)
dev_csv.to_csv('./train_dev_data/dev.csv', index=False)

In [328]:
# Build one "[CLS] text [SEP] text [SEP] ..." string per test event. Every event
# gets a row (keyed by its source tweet id); unusable events are written as 'invalid'.
test_invalid_events = []  # positions of events with no loadable tweet or no text at all
test_events_data = []     # (source tweet id, event text or 'invalid') pairs

for index, event in enumerate(events_test):
    source_tweet = event[0]  # the first id of an event is the source tweet
    tweet_objects = []

    for tweet_id in event:
        tweet_path = os.path.join(test_data_path, tweet_id + '.json')
        try:
            # Explicit encoding: with a platform default other than UTF-8, a decode
            # error would otherwise silently drop the tweet.
            with open(tweet_path, encoding='utf-8') as tweet_file:
                for line in tweet_file:
                    tweet_objects.append(json.loads(line))
        except (OSError, ValueError):
            # Best effort: skip tweets whose file is missing, unreadable or not valid
            # JSON (json.JSONDecodeError and UnicodeDecodeError are ValueErrors).
            # Anything else, e.g. KeyboardInterrupt, is no longer swallowed.
            continue

    event_list = "[CLS] "
    for tweet in tweet_objects:
        data = feature_extraction(tweet)
        if data:
            event_list += data + " [SEP] "

    # Still the bare prefix: either no tweet object was loaded or none had any
    # text left after cleaning. Both cases count as invalid.
    if event_list == "[CLS] ":
        test_invalid_events.append(index)
        test_events_data.append((source_tweet, 'invalid'))
    else:
        test_events_data.append((source_tweet, event_list))

print(len(test_events_data) - len(test_invalid_events), "events written to test input file, (", len(test_invalid_events), "invalid events )")

555 events written to test input file, ( 3 invalid events )


In [327]:
# Persist the (source tweet id, event text) pairs of the test split as a CSV file.
header = ["Source_Tweet", "Event Tweets"]

# to_csv does not create missing parent directories, and nothing else in the
# notebook creates './test_data', so make sure it exists before writing.
os.makedirs('./test_data', exist_ok=True)

test_csv = pd.DataFrame(columns = header, data = test_events_data)
test_csv.to_csv('./test_data/test.csv', index = False)

In [210]:
# Obtain the features (fields) present in the tweet objects and count how often each occurs.
# (The training set has a total of 21775 tweet objects; note that the loop in this cell scans the dev set.)
def get_features(tweet, features):
    """Update `features` in place with the fields present in one tweet object.

    Counting rules:
      * 'context_annotations': only the first annotation is inspected. Each
        of its dict-valued fields contributes one count per nested key under
        features['context_annotations'][field][key]; any other field is
        counted under features['context_annotations'][field].
      * any other dict-valued field: one count per key under
        features[field][key].
      * everything else: one count under features[field].

    Missing intermediate counters are created on demand, so `features` may
    start out as an empty dict. An empty 'context_annotations' list
    contributes nothing.

    Args:
        tweet: mapping of field name to value for one tweet.
        features: nested dict of counters, modified in place.

    Returns:
        None.
    """
    for name, value in tweet.items():
        if name == 'context_annotations':
            if not value:
                # Nothing to inspect; avoids indexing into an empty list.
                continue
            first_annotation = value[0]
            for group, payload in first_annotation.items():
                if isinstance(payload, dict):
                    for key in payload:
                        # Counters are created only once there is a key to count,
                        # so an empty nested dict leaves `features` untouched.
                        counter = features.setdefault(name, {}).setdefault(group, {})
                        counter[key] = counter.get(key, 0) + 1
                else:
                    counter = features.setdefault(name, {})
                    counter[group] = counter.get(group, 0) + 1
        elif isinstance(value, dict):
            for key in value:
                counter = features.setdefault(name, {})
                counter[key] = counter.get(key, 0) + 1
        else:
            features[name] = features.get(name, 0) + 1
                
# Counter filled in place by get_features; the 'context_annotations'
# sub-counter is created up front.
features = dict()
features['context_annotations'] = dict()

for event in dev_tweet_events:
    tweet_objects = []

    for tweet_id in event['ids']:
        tweet_path = os.path.join(dev_data_path, tweet_id + '.json')
        try:
            # Explicit encoding: with a platform default other than UTF-8, a decode
            # error would otherwise silently drop the tweet.
            with open(tweet_path, encoding='utf-8') as tweet_file:
                for line in tweet_file:
                    tweet_objects.append(json.loads(line))
        except (OSError, ValueError):
            # Best effort: skip tweets whose file is missing, unreadable or not valid
            # JSON (json.JSONDecodeError and UnicodeDecodeError are ValueErrors).
            # Anything else, e.g. KeyboardInterrupt, is no longer swallowed.
            continue

    # Counting happens outside the try block so that errors raised while
    # counting are not mistaken for unreadable files.
    for tweet in tweet_objects:
        get_features(tweet, features)

features

{'context_annotations': {'domain': {'id': 1561,
   'name': 1561,
   'description': 1561},
  'entity': {'id': 1561, 'name': 1561, 'description': 437}},
 'reply_settings': 7576,
 'public_metrics': {'retweet_count': 7576,
  'reply_count': 7576,
  'like_count': 7576,
  'quote_count': 7576},
 'attachments': {'media_keys': 1474, 'poll_ids': 10},
 'entities': {'urls': 2315,
  'hashtags': 1250,
  'mentions': 6201,
  'annotations': 2256},
 'created_at': 7576,
 'author_id': 7576,
 'lang': 7576,
 'conversation_id': 7576,
 'text': 7576,
 'id': 7576,
 'possibly_sensitive': 7576,
 'source': 7569,
 'in_reply_to_user_id': 7141,
 'referenced_tweets': 7150,
 'geo': {'place_id': 263, 'coordinates': 144},
 'withheld': {'copyright': 2, 'country_codes': 2}}