### Reading the Dataset

In [2]:
from os.path import join
import pandas as pd
import csv

# Index the users by screen name (column 2 of the users file). Each entry keeps
# the raw CSV row, the list of the user's tweets (filled in below) and a slot
# for the scores computed later in the notebook.
# newline='' lets the csv module handle line endings inside quoted fields.
with open(join("data", "iranian_users.csv"), newline='') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    users = {
        row[2]: {'row': row, 'tweets': [], 'scores': None}
        for row in csvreader
        if row[0] != 'userid'  # skip the header row
    }

# Attach every tweet to its author; remember authors absent from the users file.
tweet_counter = 0
missing_users = set()
with open(join("data", "iranian_tweets.csv"), newline='') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    for tweet in csvreader:
        screen_name = tweet[3]
        if screen_name in users:
            users[screen_name]['tweets'].append(tweet)
            tweet_counter += 1
        else:
            missing_users.add(screen_name)

# The header row of the tweets file lands in missing_users; discard() (unlike
# remove()) does not raise if the file happens to have no header.
missing_users.discard('user_screen_name')
if missing_users:
    # Report the tweet authors that have no profile row (sorted for stable output).
    print("Missing users with tweets: " + ', '.join(sorted(missing_users)))
    print()

print("Loaded {tn} tweets from {un} users".format(tn=tweet_counter, un=len(users)))

Loaded 1122936 tweets from 770 users


### Utility Functions

In [12]:
from datetime import datetime, timedelta
import json
from urllib.parse import urlparse


# Reference date from which account ages are measured.
DATASET_DATE = datetime(year=2018, month=9, day=1)

# Media Bias/Fact Check database, parsed from its JSON dump.
with open(join('data', 'sources.json')) as sources_file:
    sources = json.load(sources_file)

# Factuality labels that count as half non-factual / as fully factual.
FACTUAL_HALF = {'MIXED'}
FACTUAL_COMPLETELY = {'HIGH', 'VERY HIGH'}

# A failed attempt to write a function to decode the reported_location field
# import pycountry
# _countries_cache = {}
# def get_country(name):
#     c = _countries_cache.get(name, None)
#     if name not in _countries_cache:
#         c = pycountry.countries.get(name=name)
#         if not c and len(name) == 2:
#             c = pycountry.countries.get(alpha_2=name.upper())
#         if not c and len(name) == 3:
#             c = pycountry.countries.get(alpha_3=name.upper())
#         if not c:
#             c = pycountry.countries.get(official_name=name)
#         _countries_cache[name] = c
#         if not c:
#             print("failed to find country for " + name)
#     return c

### Calculating the Scores for each User

In [17]:
# Compute a four-component score vector for each user:
#   [account age, language mismatch, working-hours concentration, non-factual links]
# The account age is stored raw first and min-max normalised in a second pass.

# Number of users (with at least one tweet) to score before stopping.
# Set to None to score every user.
SCORED_USER_LIMIT = 13
# Number of scored users whose score vector is printed as a preview.
# Set to None to print all of them.
PREVIEW_USER_LIMIT = 13

not_in_sources = set()
dataset_sources_factuality = {}

users_with_no_tweet = set()
max_account_age = None
min_account_age = None

# Start from a clean slate so that re-running this cell never re-normalises
# scores left over from a previous run.
for profile in users.values():
    profile['scores'] = None

scored_users = 0
for screen_name, profile in users.items():
    n = len(profile['tweets'])

    interface_language = profile['row'][9].lower()[:2]
    account_age = (DATASET_DATE - datetime.strptime(profile['row'][8], '%Y-%m-%d')).days
    # Compare against None explicitly: an age of 0 days is a valid minimum.
    if max_account_age is None or max_account_age < account_age:
        max_account_age = account_age
    if min_account_age is None or min_account_age > account_age:
        min_account_age = account_age
#     reported_location = profile['row'][3]
#     if reported_location:
#         reported_location = reported_location.split(',')[-1].strip()
#         country = get_country(reported_location)

    if n == 0:
        users_with_no_tweet.add(screen_name)
        continue

    language_score_unmatched = 0
    tweet_time_bin = [0] * 24  # tweets per hour of the day
    linked_to_not_factual_domain = 0
    linked_to_known_domain = 0
    for tweet in profile['tweets']:
        if tweet[11][:2].lower() != interface_language:
            language_score_unmatched += 1

        tweet_time = datetime.strptime(tweet[13], '%Y-%m-%d %H:%M')
        tweet_time_bin[tweet_time.hour] += 1

        # NOTE(review): assumes the URL field is a bracketed, whitespace-separated
        # list whose first and last two characters can be dropped - confirm
        # against the dataset format.
        urls = tweet[-3][2:-2].split()
        for url in urls:
            domain = urlparse(url).netloc.lower()
            if not domain:
                # Unparseable URL: there is no domain to look up or report.
                continue
            if domain not in sources and domain.count('.') > 1:
                # Retry without the left-most sub-domain (e.g. "www.").
                domain = domain[domain.index('.')+1:]
            if domain not in sources:
                not_in_sources.add(domain)
                continue
            linked_to_known_domain += 1
            factual = sources[domain][0]['factual']
            dataset_sources_factuality[factual] = dataset_sources_factuality.get(factual, 0) + 1
            if factual in FACTUAL_HALF:
                linked_to_not_factual_domain += 0.5
            elif factual not in FACTUAL_COMPLETELY:
                linked_to_not_factual_domain += 1

    # Busiest 8-hour window of the day. Hours are indexed modulo 24 so that
    # windows crossing midnight (start hour >= 17) and the 16:00-24:00 window
    # are counted; a plain slice [i:(i+8) % 24] is empty for those.
    max_sliding_time_window_sum = max(
        sum(tweet_time_bin[(start + offset) % 24] for offset in range(8))
        for start in range(24)
    )

    # Normalization
    # Share of tweets whose language differs from the interface language.
    language_score = language_score_unmatched / n
    # Share of tweets in the busiest window, rescaled so that 1/3 (8 of 24
    # hours, i.e. evenly spread tweets) maps to 0 and 1 maps to 1.
    working_time_score = ((max_sliding_time_window_sum / n) - (1/3)) * 3 / 2
    # Weighted share of links to known domains that are not fully factual.
    if linked_to_known_domain > 0:
        linked_to_domains_score = linked_to_not_factual_domain / linked_to_known_domain
    else:
        linked_to_domains_score = 0

    profile['scores'] = [
        account_age,  # To be normalized
        language_score,
        working_time_score,
        linked_to_domains_score
    ]

    scored_users += 1
    if SCORED_USER_LIMIT is not None and scored_users >= SCORED_USER_LIMIT:
        break

# Min-max normalise the account age of every scored user. This is done in its
# own pass so it does not depend on how many users are printed below.
if (max_account_age is not None and min_account_age is not None
        and max_account_age > min_account_age):
    account_age_span = max_account_age - min_account_age
    for profile in users.values():
        if profile['scores']:
            profile['scores'][0] = (profile['scores'][0] - min_account_age) / account_age_span

# Preview the first few scored users.
previewed_users = 0
for screen_name, profile in users.items():
    if not profile['scores']:
        continue
    if PREVIEW_USER_LIMIT is not None and previewed_users >= PREVIEW_USER_LIMIT:
        break
    score = sum(profile['scores'])
    print('{name}: {score:6.2f} = ({vector})'.format(
        name=screen_name[:16], score=score, vector=', '.join(['{0:3.1f}'.format(v) for v in profile['scores']])))
    previewed_users += 1

print()
print("{num} users are suspended without having any tweets in the dataset: {list}".format(
    num=len(users_with_no_tweet), list=', '.join(sorted(s[:16] for s in users_with_no_tweet))))
print()
print("Domains not in the Media Bias/Fact Check db: " + ', '.join(sorted(not_in_sources)))
print()
print("Overall factuality of the links in the tweets:" + str(dataset_sources_factuality))


57e2082d64baa89d:   1.76 = (0.3, 1.0, 0.5, 0.0)
11891c406c088fdd:   2.08 = (1.0, 0.1, 1.0, 0.0)
5ddbd530097789a4:   1.10 = (0.3, 0.2, 0.4, 0.2)
50012d5e4f959a3d:   1.99 = (0.6, 0.2, 0.5, 0.8)
28478f20c217a672:   1.93 = (0.1, 1.0, 0.8, 0.0)
260bbf8c9ca24c63:   1.87 = (0.3, 1.0, 0.6, 0.0)
3f1a40fa0636db86:   1.55 = (0.1, 1.0, 0.4, 0.0)
bc2adb39c920650d:   2.02 = (0.3, 1.0, 0.7, 0.1)
fd01dd625797c4f4:   1.77 = (0.2, 1.0, 0.6, 0.0)
2d761afd8c25c25e:   1.67 = (0.1, 1.0, 0.5, 0.0)
f3776839f137b8fd:   2.26 = (0.6, 1.0, 0.6, 0.0)
c2577510f436a286:   1.83 = (0.3, 1.0, 0.6, 0.0)
0669e7730d2cacb6:   1.66 = (0.1, 1.0, 0.6, 0.0)

7 users are suspended without having any tweets in the dataset: 79b10b6396d4c863, 6e395ce51ae771f4, 1c2bac6ec8313377, caaa14567470b25d, f3d8b36e08acc468, 8b1498b609a3e879, 299d67846470c44d

Domains not in the Media Bias/Fact Check db: jamieoliver.com, fb.me, mjhosts.com, shar.es, reddit.com, aynanewsagency.or, sarahabed.com, mirataljazeera.org, ln.is, slate.me, alalam.ir, 