## Utility Functions

In [43]:
from datetime import datetime, timedelta
import json
import re
from urllib.parse import urlparse


mention_finder = re.compile(r'@\w+ ?')
symbols_finder = re.compile(r'[^\w]')
def normalize_text(text):
    text = text.lower()
    if text.startswith('rt '):
        text = text[3:]
    text = mention_finder.sub('', text)
    text = symbols_finder.sub(' ', text)
    text = text.replace('  ', ' ')
    return text.strip()
_test_normalize_text = normalize_text('RT @xyz_ @opl Haha! Hoho!')
if _test_normalize_text != 'haha hoho':
    raise Exception("Not implemented correctly? '{0}'".format(_test_normalize_text))

DATASET_DATE = datetime(2018, 9, 1)

# Load the Media Bias/Fact Check db
with open(join('data', 'sources.json')) as infile:
    sources = json.load(infile)

FACTUAL_HALF = set(['MIXED'])
FACTUAL_COMPLETELY = set(['HIGH', 'VERY HIGH'])
DOMAIN_ALIASES = {
    'presstv.ir': 'presstv.com',
    'politi.co': 'politico.com',
}

# A failed attemp to write a function to decode the reported_location field
# import pycountry
# _countries_cache = {}
# def get_country(name):
#     c = _countries_cache.get(name, None)
#     if name not in _countries_cache:
#         c = pycountry.countries.get(name=name)
#         if not c and len(name) == 2:
#             c = pycountry.countries.get(alpha_2=name.upper())
#         if not c and len(name) == 3:
#             c = pycountry.countries.get(alpha_3=name.upper())
#         if not c:
#             c = pycountry.countries.get(official_name=name)
#         _countries_cache[name] = c
#         if not c:
#             print("failed to find country for " + name)
#     return c


## Reading the Dataset

In [53]:
import csv
from os.path import join
import pandas as pd

with open(join("data","iranian_users.csv")) as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    users = {row[2]: {'row': row, 'tweets': [], 'scores': None} for row in csvreader if row[0] != 'userid'}

tweet_texts = {}
repeated_texts = {}
repeaters_among_self_tweets = {}
repeaters_among_all_tweets = {}
    
tweet_counter = 0
missing_users = set([])
with open(join("data","iranian_tweets.csv")) as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    for tweet in csvreader:
        screen_name = tweet[3]
        if screen_name not in users:
            missing_users.add(screen_name)
        else:
            users[screen_name]['tweets'].append(tweet)
            tweet_counter += 1

        t = normalize_text(tweet[12])
        if len(t) < 5:
            continue
        if t not in tweet_texts:
            tweet_texts[t] = {screen_name: 1, '__total__': 1}
        elif screen_name not in tweet_texts[t]:
            tweet_texts[t][screen_name] = 1
            tweet_texts[t]['__total__'] += 1
        else:
            tweet_texts[t][screen_name] += 1
            tweet_texts[t]['__total__'] += 1

        if tweet_texts[t][screen_name] > 2:
            if screen_name not in repeaters_among_self_tweets:
                repeaters_among_self_tweets[screen_name] = 1
            else:
                repeaters_among_self_tweets[screen_name] += 1

        if tweet_texts[t]['__total__'] > 4:
            repeated_texts[t] = tweet_texts[t]['__total__']
            if screen_name not in repeaters_among_all_tweets:
                repeaters_among_all_tweets[screen_name] = 1
            else:
                repeaters_among_all_tweets[screen_name] += 1

missing_users.remove('user_screen_name')
if missing_users:
    print("Missing users with tweets: " + ', '.join(list(not_in_sources)))
    print()

print("Loaded {tn} tweets from {un} users".format(tn=tweet_counter, un=len(users)))

Loaded 1122936 tweets from 770 users


In [60]:
from math import log10

repeated_texts_sorted = sorted([(v, k) for k, v in repeated_texts.items()], key=lambda x:x[0], reverse=True)
print("Repeated texts:")
for i in range(min(10, len(repeated_texts_sorted))):
    print("{n:5} {s}".format(n=repeated_texts_sorted[i][0], s=repeated_texts_sorted[i][1]))
    print(tweet_texts[repeated_texts_sorted[i][1]])

repeaters_among_self_tweets_max = log10(max(repeaters_among_self_tweets.values()))
repeaters_among_all_tweets_max = log10(max(repeaters_among_all_tweets.values()))

print()
print("repeaters_among_self_tweets_max={0}".format(repeaters_among_self_tweets_max))
print("repeaters_among_all_tweets_max={0}".format(repeaters_among_all_tweets_max))

Repeated texts:
10025 ce qu ils ne vous diront jamais sur noel http  t co qsgh5j9rxt
{'marialuis91': 10025, '__total__': 10025}
 9232 ce qu ils ne vous diront jamais sur noel http  t co w563d4cdji
{'a51115862ba4725c846e77683e9c71d1b1eb246100ca394f1b915f9c7909099d': 9232, '__total__': 9232}
 2274 ce qu ils ne vous diront jamais sur noel http  t co jpxd3bawcm
{'marialuis91': 2274, '__total__': 2274}
 2073 coupe du monde  l image que les caméras n ont pas voulu voir http  t co nztgbyg5mr
{'marialuis91': 2073, '__total__': 2073}
 2022 united states a structurally racist society http  t co ykqfm3kyjc
{'a51115862ba4725c846e77683e9c71d1b1eb246100ca394f1b915f9c7909099d': 2022, '__total__': 2022}
 1987 charlie hebdo les bourdes bizarres faites par les terroristes http  t co aviaibnj29
{'a51115862ba4725c846e77683e9c71d1b1eb246100ca394f1b915f9c7909099d': 1987, '__total__': 1987}
 1790 charlie hebdo les bourdes bizarres faites par les terroristes http  t co rnidkwbngn
{'marialuis91': 1790, '__tota

## Calculating the Scores for each User

In [62]:
not_in_sources = set([])
dataset_sources_factuality = {}


users_with_no_tweet = set([])
max_account_age = None
min_account_age = None

print('''Score Dimensions:
    creation? account age?,
    language_score,
    working_time_score,
    linked_to_domains_score,
    repetition_among_self_tweets,
    repetition_among_all_tweets,
    has_burst,
''')

user_counter = 0
for screen_name, profile in users.items():
    n = len(profile['tweets'])

    interface_language = profile['row'][9].lower()[:2]
    timezone = profile['row'][9].lower()[:2]
    account_age = (DATASET_DATE - datetime.strptime(profile['row'][8], '%Y-%m-%d')).days
    if not max_account_age or max_account_age < account_age:
        max_account_age = account_age
    if not min_account_age or min_account_age > account_age:
        min_account_age = account_age
    reported_location = profile['row'][3]
    if reported_location:
        reported_location = reported_location.split(',')[-1].strip()
        country = get_country(reported_location)

    if n == 0:
        users_with_no_tweet.add(screen_name)
        profile['scores'] = None
        continue

    language_score_unmatched = 0
    tweet_time_bin = [0] * 24
    linked_to_not_factual_domain = 0
    linked_to_known_domain = 0
    has_burst = False
    last_tweet_timestamp = None
    for tweet in profile['tweets']:
        if tweet[11][:2].lower() != interface_language:
            language_score_unmatched += 1

        if tweet[13] == last_tweet_timestamp:
            has_burst = True

        tweet_time = datetime.strptime(tweet[13], '%Y-%m-%d %H:%M')
        tweet_time_bin[tweet_time.hour] += 1

        urls = tweet[-3][2:-2].split()
        for url in urls:
            domain = urlparse(url).netloc.lower()
            if domain not in sources and domain.count('.') > 1:
                domain = domain[domain.index('.')+1:]
            if domain not in sources and domain in DOMAIN_ALIASES:
                domain = DOMAIN_ALIASES[domain]
            if domain not in sources and domain.count('.') > 1:
                domain = 'www.' + domain
            if domain not in sources:
                not_in_sources.add(domain)
                continue
            linked_to_known_domain += 1
            factual = sources[domain][0]['factual']
            if factual not in dataset_sources_factuality:
                dataset_sources_factuality[factual] = 1
            else:
                dataset_sources_factuality[factual] += 1
            if factual in FACTUAL_HALF:
                linked_to_not_factual_domain += 0.5
            elif factual not in FACTUAL_COMPLETELY:
                linked_to_not_factual_domain += 1

    max_sliding_time_window = 0
    max_sliding_time_window_sum = 0
    for i in range(24):
        sliding_time_window_sum = sum(tweet_time_bin[i:(i+8) % 24])
        if sliding_time_window_sum > max_sliding_time_window_sum:
            max_sliding_time_window_sum = sliding_time_window_sum
            max_sliding_time_window = i

    # Normalization
    language_score = language_score_unmatched / n
    working_time_score = ((max_sliding_time_window_sum / n) - (1/3)) * 3 / 2
    if linked_to_known_domain > 0:
        linked_to_domains_score = linked_to_not_factual_domain / linked_to_known_domain
    else:
        linked_to_domains_score = 0
    repetition_among_self_tweets_score = 0
    if screen_name in repeaters_among_self_tweets:
        repetition_among_self_tweets_score = log10(repeaters_among_self_tweets[screen_name]) / repeaters_among_self_tweets_max
    repetition_among_all_tweets_score = 0
    if screen_name in repeaters_among_all_tweets:
        repetition_among_all_tweets_score = log10(repeaters_among_all_tweets[screen_name]) / repeaters_among_all_tweets_max

    profile['scores'] = [
        account_age,  # Will be normalized outside this loop
        language_score,
        working_time_score,
        linked_to_domains_score,
        repetition_among_self_tweets_score,
        repetition_among_all_tweets_score,
        1 if has_burst else 0,
    ]

    last_tweet_timestamp = tweet[13]

    if user_counter == 32:
        break
    user_counter += 1

user_counter = 0
for screen_name, profile in users.items():
    if profile['scores']:
        if max_account_age and min_account_age and max_account_age > min_account_age:
            profile['scores'][0] = (profile['scores'][0] - min_account_age) / (max_account_age - min_account_age)

        score = sum(profile['scores'])
        print('{name:16}: {score:6.2f} = ({vector})'.format(
            name=screen_name[:16], score=score, vector=', '.join(['{0:3.1f}'.format(v) for v in profile['scores']])))

        if user_counter == 32:
            break
        user_counter += 1
    
print()
print("{num} users are suspended without having any tweets in the dataset: {list}".format(
    num=len(users_with_no_tweet), list=', '.join([s[:16] for s in users_with_no_tweet])))
print()
print("Domains not in the Media Bias/Fact Check db: " + ', '.join(list(not_in_sources)))
print()
print("Overall factuality of the links in the tweets:" + str(dataset_sources_factuality))


Score Dimensions:
    creation? account age?,
    language_score,
    working_time_score,
    linked_to_domains_score,
    repetition_among_self_tweets,
    repetition_among_all_tweets,
    has_burst,

57e2082d64baa89d:   1.87 = (0.3, 1.0, 0.5, 0.0, 0.0, 0.1, 0.0)
11891c406c088fdd:   2.19 = (1.0, 0.1, 1.0, 0.0, 0.1, 0.1, 0.0)
5ddbd530097789a4:   1.80 = (0.3, 0.2, 0.4, 0.2, 0.1, 0.6, 0.0)
50012d5e4f959a3d:   2.10 = (0.6, 0.2, 0.5, 0.7, 0.0, 0.1, 0.0)
28478f20c217a672:   2.04 = (0.1, 1.0, 0.8, 0.0, 0.0, 0.1, 0.0)
260bbf8c9ca24c63:   1.87 = (0.3, 1.0, 0.6, 0.0, 0.0, 0.0, 0.0)
3f1a40fa0636db86:   1.67 = (0.1, 1.0, 0.4, 0.0, 0.0, 0.1, 0.0)
bc2adb39c920650d:   2.54 = (0.3, 1.0, 0.7, 0.1, 0.3, 0.3, 0.0)
fd01dd625797c4f4:   1.77 = (0.2, 1.0, 0.6, 0.0, 0.0, 0.0, 0.0)
2d761afd8c25c25e:   2.16 = (0.1, 1.0, 0.5, 0.0, 0.2, 0.3, 0.0)
f3776839f137b8fd:   2.44 = (0.6, 1.0, 0.6, 0.0, 0.0, 0.2, 0.0)
c2577510f436a286:   1.83 = (0.3, 1.0, 0.6, 0.0, 0.0, 0.0, 0.0)
0669e7730d2cacb6:   1.66 = (0.1, 1.0, 0.6,

## Experiment: Clustering

In [63]:
from sklearn.cluster import KMeans

labels = {}

kmeans = KMeans(n_clusters=4)
kmeans.fit([profile['scores'] for profile in users.values() if profile['scores']])
for profile in users.values():
    if profile['scores']:
        label = int(kmeans.predict([profile['scores']]))
        if label not in labels:
            labels[label] = [profile]
        else:
            labels[label].append(profile)

for label, profiles in labels.items():
    print("{num} users in cluser {label}:".format(num=len(profiles), label=label))
    print('\n'.join(['{name:16}: {vector}'.format(name=p['row'][2][:16], vector=' '.join(['{0:3.1f}'.format(v) for v in p['scores']])) for p in profiles]))
    print()


21 users in cluser 0:
57e2082d64baa89d: 0.3 1.0 0.5 0.0 0.0 0.1 0.0
28478f20c217a672: 0.1 1.0 0.8 0.0 0.0 0.1 0.0
260bbf8c9ca24c63: 0.3 1.0 0.6 0.0 0.0 0.0 0.0
3f1a40fa0636db86: 0.1 1.0 0.4 0.0 0.0 0.1 0.0
bc2adb39c920650d: 0.3 1.0 0.7 0.1 0.3 0.3 0.0
fd01dd625797c4f4: 0.2 1.0 0.6 0.0 0.0 0.0 0.0
2d761afd8c25c25e: 0.1 1.0 0.5 0.0 0.2 0.3 0.0
f3776839f137b8fd: 0.6 1.0 0.6 0.0 0.0 0.2 0.0
c2577510f436a286: 0.3 1.0 0.6 0.0 0.0 0.0 0.0
0669e7730d2cacb6: 0.1 1.0 0.6 0.0 0.0 0.0 0.0
67179a5165a72056: 0.4 1.0 1.0 0.0 0.0 0.0 0.0
13af1122b942a674: 0.2 1.0 1.0 0.0 0.0 0.1 0.0
3c536b8d54ca334c: 0.2 1.0 0.3 0.0 0.1 0.0 0.0
aa757d486c6d8bfc: 0.3 1.0 0.8 0.0 0.0 0.2 0.0
0a00d0bfdd68bf53: 0.6 1.0 0.7 0.0 0.0 0.2 0.0
a3d8eccd0b8b2cec: 0.4 1.0 0.8 0.0 0.3 0.2 0.0
254403684f5a792b: 0.4 1.0 0.5 0.0 0.2 0.2 0.0
507cb5a10918355e: 0.1 1.0 0.6 0.0 0.1 0.2 0.0
62ae746780e9ca28: 0.1 1.0 0.9 0.0 0.0 0.0 0.0
983a5489354e160d: 0.0 1.0 1.0 0.0 0.0 0.0 0.0
9907036861c8bf7a: 0.0 1.0 0.9 0.0 0.0 0.0 0.0

3 users in 