In [1]:
from os.path import join
import pandas as pd
import csv

with open(join("data","iranian_users.csv")) as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    users = {row[2]: {'row': row, 'tweets': [], 'scores': []} for row in list(csvreader)[1:]}

with open(join("data","iranian_tweets.csv")) as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    for tweet in csvreader:
        screen_name = tweet[3]
        if screen_name not in users:
            print("Not found: " + screen_name)
        else:
            users[screen_name]['tweets'].append(tweet)


Not found: user_screen_name


In [32]:
import json

with open(join('data', 'sources.json')) as infile:
    sources = json.load(infile)

In [42]:
from datetime import datetime, timedelta
from urllib.parse import urlparse

import pycountry


_countries_cache = {}
def get_country(name):
    c = _countries_cache.get(name, None)
    if name not in _countries_cache:
        c = pycountry.countries.get(name=name)
        if not c and len(name) == 2:
            c = pycountry.countries.get(alpha_2=name.upper())
        if not c and len(name) == 3:
            c = pycountry.countries.get(alpha_3=name.upper())
        if not c:
            c = pycountry.countries.get(official_name=name)
        _countries_cache[name] = c
        if not country:
            print("failed to find country for " + name)
    return c


not_in_sources = set([])
dataset_sources_factuality = {}


user_counter = 0
users_with_no_tweet = set([])
for screen_name, profile in users.items():
    n = len(profile['tweets'])

    interface_language = profile['row'][9].lower()[:2]
    timezone = profile['row'][9].lower()[:2]
#     reported_location = profile['row'][3]
#     if reported_location:
#         reported_location = reported_location.split(',')[-1].strip()
#         country = get_country(reported_location)

    if n == 0:
        users_with_no_tweet.add(screen_name)
        profile['scores'] = None
        continue

    language_score_unmatched = 0
    tweet_time_bin = [0] * 24
    linked_to_not_factual_domain = 0
    linked_to_known_domain = 0
    for tweet in profile['tweets']:
        if tweet[11][:2].lower() != interface_language:
            language_score_unmatched += 1

        tweet_time = datetime.strptime(tweet[13], '%Y-%m-%d %H:%M')
        tweet_time_bin[tweet_time.hour] += 1

        urls = tweet[-3][2:-2].split()
        for url in urls:
            domain = urlparse(url).netloc.lower()
            if domain not in sources and domain.count('.') > 1:
                domain = domain[domain.index('.')+1:]
            if domain not in sources:
                not_in_sources.add(domain)
                continue
            linked_to_known_domain += 1
            factual = sources[domain][0]['factual']
            if factual not in dataset_sources_factuality:
                dataset_sources_factuality[factual] = 1
            else:
                dataset_sources_factuality[factual] += 1
            if factual == 'MIXED':
                linked_to_not_factual_domain += 0.5
            elif factual != 'HIGH':
                linked_to_not_factual_domain += 1

    max_sliding_time_window_sum = 0
    for i in range(24):
        sliding_time_window_sum = sum(tweet_time_bin[i:(i+8) % 24])
        if sliding_time_window_sum > max_sliding_time_window_sum:
            max_sliding_time_window_sum = sliding_time_window_sum

    # Normalization
    language_score = language_score_unmatched / n
    working_time_score = max_sliding_time_window_sum / n
    if linked_to_known_domain > 0:
        linked_to_domains_score = linked_to_not_factual_domain / linked_to_known_domain
    else:
        linked_to_domains_score = 0

    profile['scores'] = (language_score, working_time_score, linked_to_domains_score)

    print(profile['scores'])
    print()
    user_counter += 1
    if user_counter == 12:
        break

print("{num} users are suspended without having any tweets in the dataset: {list}".format(
    num=len(users_with_no_tweet), list=', '.join([s[:16] for s in users_with_no_tweet])))
print()
print("Domains not in Media Bias/Fact Check db: " + ', '.join(list(not_in_sources)))
print()
print("Overall factuality of the links in the tweets:" + str(dataset_sources_factuality))


(0.9921208141825345, 0.6559422193040052, 0)

(0.12903225806451613, 0.967741935483871, 0)

(0.21075984470327233, 0.5973377703826955, 0.23684210526315788)

(0.1541038525963149, 0.6348408710217756, 0.7708333333333334)

(1.0, 0.868421052631579, 0)

(1.0, 0.7169230769230769, 0)

(1.0, 0.6, 0)

(0.9795918367346939, 0.7931972789115647, 0.1)

(0.9913606911447084, 0.7192224622030238, 0)

(1.0, 0.6934306569343066, 0)

(0.9972144846796658, 0.7465181058495822, 0)

(1.0, 0.7033898305084746, 0)

7 users are suspended without having any tweets in the dataset: 79b10b6396d4c863, 6e395ce51ae771f4, f3d8b36e08acc468, 8b1498b609a3e879, 1c2bac6ec8313377, caaa14567470b25d, 299d67846470c44d

Domains not in Media Bias/Fact Check db: howsecureismypassword.net, hill.cm, owl.li, slate.me, amn.st, venturebeat.com, maannews.com, palinfo.com, compelling.org.uk, misspeachy.co, nejatngo.org, malayalamsearch.com, farsnews.com, buff.ly, youtube.com, facebook.com, ln.is, thr.cm, atfp.co, activistpost.com, libertyfrontpre