In [2]:
from collections import defaultdict
from typing import List
import re
from pelutils import TickTock
from tqdm import tqdm

import pandas as pd
from efficient_apriori import apriori


# Hashtag Itemsets

In [3]:
# ID columns are read as strings; `partition` later matches `userID`
# against the candidates' `id` column by equality.
df = pd.read_csv(
    "../data/tweets.csv",
    index_col=0,
    dtype={"userID": str, "tweetID": str},
)
cdf = pd.read_csv(
    "../data/candidates_with_id.csv",
    index_col=0,
    dtype={"id": str},
)
# Tweet texts, aligned row-by-row with `df.userID`.
tweets = df["tweet"]

Use a regular expression to extract the hashtags of each tweet, then print simple basket and item statistics.

In [4]:
hashtag_pattern = r"(?i)\#\w+"

def get_hashtags(tweets: List[str]) -> List[List[str]]:
    """Build one hashtag basket per tweet and print summary statistics.

    Each match of ``hashtag_pattern`` is lower-cased and stripped of its
    ``#``. A tweet that is a missing value (``pd.isna``) gives an empty
    basket. Tags repeated inside one tweet are kept, so every repetition
    counts as a mention.

    The printed percentage of a tag is its mention count divided by the
    number of non-empty baskets; because repetitions count, it can exceed 100.

    Args:
        tweets: Iterable of tweet texts; entries may be missing values.

    Returns:
        The non-empty baskets in tweet order. Each basket is a tuple of
        tags (despite the ``List[List[str]]`` annotation).
    """
    mention_counts = defaultdict(int)
    baskets = []
    for text in tweets:
        if pd.isna(text):
            found = ()
        else:
            found = tuple(
                raw.lower().replace("#", "")
                for raw in re.findall(hashtag_pattern, text)
            )
        for tag in found:
            mention_counts[tag] += 1
        baskets.append(found)

    non_empty = [basket for basket in baskets if basket]
    n_non_empty = len(non_empty)

    print(f"Unique hashtags n={len(mention_counts)}")
    print("Total mentions", sum(mention_counts.values()))
    print("Total baskets", len(baskets))
    print("Total non-empty baskets", n_non_empty)
    print("Top 10 hashtags")
    # Stable descending sort: tags with equal counts keep first-seen order.
    ranked = sorted(mention_counts.items(), key=lambda pair: pair[1], reverse=True)
    for rank, (tag, count) in enumerate(ranked[:10], start=1):
        print("\t", rank, tag, count / n_non_empty * 100)
    return non_empty

def print_res(itemsets, rules, tag="dkpol", size=2):
    """Print the itemsets and multi-item rules that involve ``tag``.

    First lists every itemset of length ``size`` that contains ``tag``,
    most frequent first, as ``"<itemset>: <count>"``. Then prints every
    rule that mentions ``tag`` on either side and has more than one item
    on at least one side.

    Args:
        itemsets: Mapping from itemset length to a ``{itemset: count}``
            mapping (the first value returned by ``apriori``).
        rules: Iterable of rule objects exposing ``lhs`` and ``rhs``
            collections (the second value returned by ``apriori``).
        tag: Item an itemset or rule must contain to be printed.
        size: Length of the itemsets to list.
    """
    # `itemsets` may have no entry for `size` (e.g. nothing that long was
    # frequent); treat that as empty instead of raising KeyError.
    sized = itemsets.get(size, {})
    for items in sorted(sized, key=sized.get, reverse=True):
        if tag in items:
            print(f"{items}: {sized[items]}")

    for rule in rules:
        mentions_tag = tag in rule.lhs or tag in rule.rhs
        multi_item = len(rule.rhs) > 1 or len(rule.lhs) > 1
        if mentions_tag and multi_item:
            print(rule)

Run the A-Priori algorithm

In [5]:
# Mine itemsets and rules over the non-empty hashtag baskets of all tweets.
hashtags = get_hashtags(tweets)
itemsets, rules = apriori(
    hashtags,
    min_support=0.25 / 100,  # threshold written as a percentage: 0.25 %
    min_confidence=0.75,
)
print_res(itemsets, rules)

Unique hashtags n=4535
Total mentions 19522
Total baskets 53342
Total non-empty baskets 11885
Top 10 hashtags
	 1 dkpol 55.39755994951619
	 2 worlds2022 5.61211611274716
	 3 dkmedier 5.258729490954985
	 4 dkgreen 3.500210349179638
	 5 fv22 3.3151030710980223
	 6 ftvalg22 1.8342448464450989
	 7 periscope 0.8834665544804375
	 8 sundpol 0.8750525872949095
	 9 uddpol 0.7572570466975179
	 10 lec 0.7572570466975179
('dkmedier', 'dkpol'): 548
('dkgreen', 'dkpol'): 341
('dkpol', 'fv22'): 261
('dkpol', 'ftvalg22'): 187
('dkpol', 'sundpol'): 87
('dkpol', 'uddpol'): 65
('dkpol', 'eudk'): 57
('dkpol', 'eupol'): 52
('dkpol', 'fv15'): 44
('brydfri', 'dkpol'): 42
('dkpol', 'fv19'): 39
('dkpol', 'ftlive'): 38
('dkpol', 'valg2022'): 36
('bondampåborgen', 'dkpol'): 34
('dkbiz', 'dkpol'): 34


Partition the dataset based on political bloc and rerun A-Priori

In [6]:
def partition(
    all_tweets: List[str], bloc: str, user_ids=None, candidates=None
) -> List[str]:
    """Select the tweets whose author belongs to the requested bloc.

    Tweets are paired positionally with ``user_ids``. A tweet is kept when
    its user id appears in ``candidates`` and the party of the first
    matching candidate row falls on the requested side. Party codes
    ``A, B, F, Q, Ø, Å`` form the red bloc; any other party value,
    including a missing one, falls on the other side. Tweets from users
    not present in ``candidates`` are dropped.

    Args:
        all_tweets: Tweet texts, aligned with ``user_ids``.
        bloc: ``"red"`` selects the red bloc; any other value selects
            everything that is not red.
        user_ids: Author id per tweet. Defaults to the notebook-level
            ``df.userID``.
        candidates: Frame with ``id`` and ``Party`` columns. Defaults to
            the notebook-level ``cdf``.

    Returns:
        The selected tweets, in their original order.
    """
    if user_ids is None:
        user_ids = df.userID
    if candidates is None:
        candidates = cdf

    red = set("ABFQØÅ")
    want_red = bloc == "red"

    # Build the id -> party lookup once instead of filtering the candidate
    # frame for every tweet. `setdefault` keeps the first row per id, and
    # missing ids are skipped so a missing user id can never match.
    party_by_id = {}
    for candidate_id, party in zip(candidates["id"], candidates["Party"]):
        if not pd.isna(candidate_id):
            party_by_id.setdefault(candidate_id, party)

    tweets = list()
    for tweet, user_id in zip(tqdm(all_tweets), user_ids):
        if user_id not in party_by_id:
            continue
        if (party_by_id[user_id] in red) == want_red:
            tweets.append(tweet)
    return tweets

# Repeat the itemset mining separately on each bloc's tweets,
# with the same thresholds as the whole-corpus run.
for name in ("red", "blue"):
    print("Bloc hashtags for", name)
    bloc_hashtags = get_hashtags(partition(tweets, name))
    itemsets, rules = apriori(
        bloc_hashtags,
        min_support=0.25 / 100,
        min_confidence=0.75,
    )
    print_res(itemsets, rules)

Bloc hashtags for red


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53342/53342 [00:13<00:00, 3972.18it/s]


Unique hashtags n=2107
Total mentions 7673
Total baskets 22046
Total non-empty baskets 4779
Top 10 hashtags
	 1 dkpol 59.59405733417032
	 2 dkgreen 5.942665829671479
	 3 fv22 4.561623770663319
	 4 dkmedier 2.8457836367440885
	 5 ftvalg22 1.7367650136011719
	 6 uddpol 1.5484410964636954
	 7 radikale 0.8160703075957313
	 8 stemgrønt 0.7742205482318477
	 9 bondampåborgen 0.7114459091860221
	 10 sundpol 0.6695961498221386
('dkgreen', 'dkpol'): 232
('dkpol', 'fv22'): 136
('dkmedier', 'dkpol'): 103
('dkpol', 'ftvalg22'): 66
('dkpol', 'uddpol'): 51
('bondampåborgen', 'dkpol'): 34
('dkpol', 'radikale'): 27
('dkpol', 'sundpol'): 23
('dkpol', 'tænknyt'): 23
('dkpol', 'eupol'): 21
('dkpol', 'spolitik'): 17
('dkpol', 'dyrevelfærd'): 16
('dkbiz', 'dkpol'): 15
('dkpol', 'ligestilling'): 15
('debatten', 'dkpol'): 14
('covid19dk', 'dkpol'): 13
('dkpol', 'fv15'): 13
('dkpol', 'fv19'): 12
('dkpol', 'valg2022'): 12
{bondampåborgen, tænknyt} -> {dkpol} (conf: 1.000, supp: 0.004, lift: 1.680, conv: 4046871

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53342/53342 [00:13<00:00, 3909.36it/s]


Unique hashtags n=2176
Total mentions 7954
Total baskets 21292
Total non-empty baskets 4695
Top 10 hashtags
	 1 dkpol 57.9765708200213
	 2 dkmedier 7.987220447284344
	 3 ftvalg22 2.2790202342918
	 4 periscope 2.215122470713525
	 5 fv22 2.0873269435569752
	 6 dkgreen 1.4909478168264112
	 7 besserwisserne 1.0223642172523961
	 8 konservative 0.9158679446219382
	 9 culture 0.8306709265175719
	 10 fv15 0.8093716719914802
('dkmedier', 'dkpol'): 337
('dkpol', 'ftvalg22'): 95
('dkpol', 'fv22'): 76
('dkgreen', 'dkpol'): 60
('dkpol', 'eupol'): 31
('dkpol', 'fv15'): 31
('dkpol', 'jvdk'): 28
('dkpol', 'ftlive'): 27
('dkpol', 'fv19'): 26
('dkpol', 'tvsyd'): 25
('dkpol', 'drdinstemme'): 24
('dkpol', 'stopracisme'): 24
('dkpol', 'konservative'): 23
('dkpol', 'eudk'): 21
('dkpol', 'minkgate'): 17
('dkpol', 'stemliberalt'): 17
('dkpol', 'sundpol'): 17
('dkpol', 'kdpol'): 15
('dkpol', 'tv2valg'): 15
('dkpol', 'handicap'): 14
('dkpol', 'fv2022'): 13
('dkpol', 'stemla'): 13
{drdinstemme, fv15} -> {dkpol} 