In [1]:
import os
import ast
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# MIND Dataset import

In [2]:
# Import MIND Dataset - Behaviors and News
# Update the path below to point to your extracted MIND dataset folder
MIND_DATA_PATH = "dataset/"  # Change this to your dataset path

print("Loading MIND Dataset...")
print("="*50)

# Load behaviors data
# Format: impression_id, user_id, time, history, impressions
try:
    behaviors_columns = ['impression_id', 'user_id', 'time', 'history', 'impressions']
    behaviors_df = pd.read_csv(
        os.path.join(MIND_DATA_PATH, 'behaviors.tsv'), 
        sep='\t', 
        header=None, 
        names=behaviors_columns,
        encoding='utf-8'
    )
    print(f"✓ Behaviors dataset loaded successfully!")
    print(f"  Shape: {behaviors_df.shape}")
    print(f"  Unique users: {behaviors_df['user_id'].nunique():,}")
    
except FileNotFoundError:
    print("❌ behaviors.tsv not found.")
    print("   Please download from: https://www.kaggle.com/datasets/arashnic/mind-news-dataset")
    behaviors_df = None

# Load news data  
# Format: news_id, category, subcategory, title, abstract, url, title_entities, abstract_entities
try:
    news_columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
    news_df = pd.read_csv(
        os.path.join(MIND_DATA_PATH, 'news.tsv'), 
        sep='\t', 
        header=None, 
        names=news_columns,
        encoding='utf-8'
    )
    print(f"✓ News dataset loaded successfully!")
    print(f"  Shape: {news_df.shape}")
    print(f"  Unique categories: {news_df['category'].nunique()}")
    
except FileNotFoundError:
    print("❌ news.tsv not found.")
    news_df = None

print("\n" + "="*50)

# Display sample data
if behaviors_df is not None:
    print("=== BEHAVIORS DATA SAMPLE ===")
    print(behaviors_df.head())
    print(f"\nData types:\n{behaviors_df.dtypes}")

if news_df is not None:
    print("\n=== NEWS DATA SAMPLE ===")
    print(news_df.head())
    print(f"\nData types:\n{news_df.dtypes}")

Loading MIND Dataset...
✓ Behaviors dataset loaded successfully!
  Shape: (156965, 5)
  Unique users: 50,000
✓ News dataset loaded successfully!
  Shape: (51282, 8)
  Unique categories: 17

=== BEHAVIORS DATA SAMPLE ===
   impression_id user_id                   time  \
0              1  U13740  11/11/2019 9:05:58 AM   
1              2  U91836  11/12/2019 6:11:30 PM   
2              3  U73700  11/14/2019 7:01:48 AM   
3              4  U34670  11/11/2019 5:28:05 AM   
4              5   U8125  11/12/2019 4:11:21 PM   

                                             history  \
0  N55189 N42782 N34694 N45794 N18445 N63302 N104...   
1  N31739 N6072 N63045 N23979 N35656 N43353 N8129...   
2  N10732 N25792 N7563 N21087 N41087 N5445 N60384...   
3  N45729 N2203 N871 N53880 N41375 N43142 N33013 ...   
4                        N10078 N56514 N14904 N33740   

                                         impressions  
0                                  N55689-1 N35729-0  
1  N20678-0 N39317-0 N5811

In [3]:
news_df['title_entities'][0]

'[{"Label": "Prince Philip, Duke of Edinburgh", "Type": "P", "WikidataId": "Q80976", "Confidence": 1.0, "OccurrenceOffsets": [48], "SurfaceForms": ["Prince Philip"]}, {"Label": "Charles, Prince of Wales", "Type": "P", "WikidataId": "Q43274", "Confidence": 1.0, "OccurrenceOffsets": [28], "SurfaceForms": ["Prince Charles"]}, {"Label": "Elizabeth II", "Type": "P", "WikidataId": "Q9682", "Confidence": 0.97, "OccurrenceOffsets": [11], "SurfaceForms": ["Queen Elizabeth"]}]'

Get all WIKI IDs.

In [4]:
import json

def extract_entity_ids(row):
    ids = set()
    # Process both title_entities and abstract_entities
    for field in ['title_entities', 'abstract_entities']:
        value = row.get(field, None)
        # If it's a JSON string, parse it
        if isinstance(value, str):
            try:
                entities = json.loads(value)
            except json.JSONDecodeError:
                entities = []
        # If it's already a list (e.g. pre‑parsed), use it directly
        elif isinstance(value, list):
            entities = value
        # Otherwise (NaN, None, float, etc.), skip
        else:
            entities = []
        # Extract WikidataId from each entity dict
        for ent in entities:
            wid = ent.get('WikidataId')
            if isinstance(wid, str):
                ids.add(wid)
    return list(ids)

# Apply to add the new column without repeats
news_df['all_entity_ids'] = news_df.apply(extract_entity_ids, axis=1)

# (Optional) Verify the result
print(news_df[['news_id', 'all_entity_ids']].head())


  news_id                all_entity_ids
0  N55528       [Q80976, Q43274, Q9682]
1  N19639                     [Q193583]
2  N61837                        [Q212]
3  N53526                     [Q155223]
4  N38324  [Q3179593, Q171171, Q371820]


In [32]:
unique_ids_set = set()
for ids in news_df['all_entity_ids']:
    unique_ids_set.update(ids)
wikidata_unique_ids = list(unique_ids_set)
print("Number of unique entity IDs:", len(wikidata_unique_ids))
print(wikidata_unique_ids[:10])


Number of unique entity IDs: 27759
['Q335821', 'Q5313500', 'Q22097181', 'Q756421', 'Q800071', 'Q5599426', 'Q516515', 'Q3241019', 'Q1419007', 'Q1764969']


Convert wikidata qIDs of topics to DBpedia entries

In [33]:
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm.auto import tqdm

qids   = wikidata_unique_ids      # your list
chunk  = 100                                       # batch size
sparql = SPARQLWrapper("https://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

m = {}                                             # Q‑ID → <dbpedia URI>
for i in tqdm(range(0, len(qids), chunk), desc="batches"):
    batch = qids[i:i+chunk]
    values = " ".join(f"(wikidata:{q})" for q in batch)
    sparql.setQuery(f"""
      PREFIX owl:      <http://www.w3.org/2002/07/owl#>
      PREFIX wikidata: <http://www.wikidata.org/entity/>
      SELECT ?qid ?dbr WHERE {{
        VALUES (?qid) {{ {values} }}
        ?dbr owl:sameAs ?qid .
      }}
    """)
    for b in sparql.query().convert()['results']['bindings']:
        q  = b['qid']['value'].split('/')[-1]
        m[q] = f"<{b['dbr']['value']}>"

dbpedia_uris = [m.get(q) for q in qids]            # e.g. "<http://dbpedia.org/resource/Tianhe-2>"
print(dbpedia_uris)


batches: 100%|██████████| 278/278 [06:54<00:00,  1.49s/it]






In [None]:
print(f"Converted {len(dbpedia_uris):,} Wikidata Q-IDs to DBpedia URIs")
print(dbpedia_uris[:10])


Converted 27,759 Wikidata Q-IDs to DBpedia URIs
['<http://dbpedia.org/resource/Dillsburg,_Pennsylvania>', '<http://dbpedia.org/resource/Duluth_City_Council>', '<http://dbpedia.org/resource/Cody_Whitehair>', '<http://dbpedia.org/resource/Intracoastal_Waterway>', '<http://dbpedia.org/resource/County_highway>', '<http://dbpedia.org/resource/Great_Lakes_Brewing_Company>', '<http://dbpedia.org/resource/Steny_Hoyer>', '<http://dbpedia.org/resource/Rhinebeck_(village),_New_York>', '<http://dbpedia.org/resource/Nissan_Pathfinder>', '<http://dbpedia.org/resource/Dolemite>']


In [35]:
import json, pathlib

cache = pathlib.Path("./data_cache/MIND_topics_cache.json")
cache.parent.mkdir(parents=True, exist_ok=True)        # make folder if needed

# build {qid: uri}   – ensure keys are str so json.dump won't complain
data = {str(q): uri for q, uri in zip(qids, dbpedia_uris)}

with cache.open("w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"saved {len(data):,} pairs : {cache}")


saved 27,759 pairs : data_cache\MIND_topics_cache.json


Make new df for MIND dataset with attributes ['news_id','all_entity_ids'] and cache it.


In [45]:
import pathlib

entity_df = pd.DataFrame({
    'news_id': news_df['news_id'],
    'all_entity_ids': news_df['all_entity_ids']
})

save_path = pathlib.Path("./data_cache/entity_df.pkl")
save_path.parent.mkdir(parents=True, exist_ok=True)

entity_df.to_pickle(save_path)      # fast binary format
print("saved to", save_path)


saved to data_cache\entity_df.pkl


# TweetsKB dataset import

In [None]:
# TweetsKB reference: Fafalios P. et al., WWW 2018
from rdflib import Graph, URIRef, Literal
from tqdm import tqdm
import itertools, os, pandas as pd, collections, re, warnings, logging

# ───── silence rdflib “invalid URI” chatter ───────────────────────────
warnings.filterwarnings(
    "ignore",
    message=re.escape("does not look like a valid URI, trying to serialize this will break."),
    category=UserWarning,
    module="rdflib.term"
)
logging.getLogger("rdflib.term").setLevel(logging.ERROR)

def load_tweetsKB(path, dbpedia_uris,
                  *, chunk_lines=50_000, max_chunks=None):
    """
    Parse TweetsKB N3 file streaming‑style and return a DataFrame.

    Parameters
    ----------
    path : str
        Path to .n3 file.
    dbpedia_uris : set[str]
        Whitelist of DBpedia URIs.  A tweet is kept only if *all* its entities
        are in this set.
    chunk_lines : int, default 50_000
        Number of lines read per chunk.
    max_chunks : int | None, default None
        • int  → stop after this many chunks (useful for testing).
        • None → parse the whole file.
    """
    tweets = collections.defaultdict(lambda: {
        'user_node': None,
        'user': None,
        'time': None,
        'ents': set()
    })
    user_id_of_node = {}       # blank‑node → identifier literal
    skip_enc = skip_uri = 0

    prefixes = ''
    chunk_ct = 0

    # chunk settings
    if max_chunks is not None:
        bar_total, bar_unit = max_chunks, 'chunk'
    else:
        bar_total, bar_unit = os.path.getsize(path), 'B'

    with open(path, 'rb') as fh, tqdm(total=bar_total,
                                      unit=bar_unit,
                                      unit_scale=(bar_unit == 'B')) as bar:
        first_chunk = True
        while True:
            raw = list(itertools.islice(fh, chunk_lines))
            if not raw:
                break
            chunk_ct += 1
            # progress bar update
            bar.update(1 if max_chunks is not None else
                       sum(len(x) for x in raw))

            # stop early if requested
            if max_chunks is not None and chunk_ct > max_chunks:
                break


            try:
                txt = b''.join(raw).decode('utf-8')
            except UnicodeDecodeError:
                skip_enc += 1
                continue

            # cache @prefix lines once (needed for later chunks)
            if first_chunk:
                prefixes = '\n'.join(
                    ln for ln in txt.splitlines()
                    if ln.startswith('@prefix') or ln.startswith('@base')
                ) + '\n'
                first_chunk = False

            # parse chunk into graph
            g = Graph()
            try:
                g.parse(data=prefixes + txt, format='n3')
            except Exception:
                skip_enc += 1
                continue

            # map basic info
            mention2tweet = {}
            for s, p, o in g:
                p_str = str(p)

                # tweet → mention blank node
                if 'mentions' in p_str:
                    mention2tweet[str(o)] = str(s)

                # tweet → user node
                elif 'creator' in p_str or 'has_creator' in p_str:
                    tweets[str(s)]['user_node'] = str(o)

                # tweet timestamp
                elif 'created' in p_str or 'date' in p_str:
                    tweets[str(s)]['time'] = str(o)

                # user node → identifier literal
                elif any(k in p_str for k in ('identifier', 'accountName', 'id', 'name')) \
                     and isinstance(o, Literal):
                    user_id_of_node[str(s)] = str(o)

            # match URIs
            for m, p, uri in g:
                if ('hasMatchedURI' in str(p) and isinstance(uri, URIRef) and
                    ('/entity/' in uri or '/resource/' in uri)):
                    t_id = mention2tweet.get(str(m))
                    if t_id:
                        tweets[t_id]['ents'].add(str(uri))

            g.close()

            # resolve user IDs we’ve just learned
            for t_id, rec in tweets.items():
                if rec['user'] is None and rec['user_node'] in user_id_of_node:
                    rec['user'] = user_id_of_node[rec['user_node']]

    # build the df
    rows = []
    for t_id, rec in tweets.items():
        if rec['ents'] and rec['ents'].issubset(dbpedia_uris):
            rows.append({
                'postid': t_id,
                'userid': rec['user'],
                'time_created': rec['time'],
                'tweet_uris': list(rec['ents'])
            })
        else:
            skip_uri += 1

    print(f'kept {len(rows)}, skipped_encoding {skip_enc}, skipped_uri {skip_uri}')
    return pd.DataFrame(rows)


# process DBpedia URIs
dbpedia_uris_stripped = {
    re.sub(r'^<|>$', '', u)
    for u in dbpedia_uris
    if isinstance(u, str) and u
}

tweetsKB_df = load_tweetsKB(
    './dataset/month_2019-10.n3',
    dbpedia_uris_stripped,
    chunk_lines=50_000,   
    max_chunks=100        # None for full dataset
)


  0%|          | 0.00/31.4G [00:00<?, ?B/s]

  1%|          | 224M/31.4G [02:38<6:32:53, 1.32MB/s] https://www.smh.com.au/national/nsw/sydney-faces-days-of-smoke-haze-from-nsw-bushfire-20191030-p535jl.html?cspt=1572385102|5cf36b0cf38cabde00fdfd5f0dfc1744/ does not look like a valid URI, trying to serialize this will break.
  1%|          | 254M/31.4G [03:02<6:13:11, 1.39MB/s]


KeyboardInterrupt: 

In [15]:
# ── cache tweetsKB_df to disk ───────────────────────────────
tweetsKB_df.to_csv(
    './data_cache/tweetsKB_oct2019.csv',     # filename on disk
    index=False,                # no row numbers
    encoding='utf-8'            # UTF‑8 text encoding
)


In [14]:
print(f"Loaded TweetsKB with {len(tweetsKB_df)} records")
tweetsKB_df.head(10)

Loaded TweetsKB with 345 records


Unnamed: 0,postid,userid,time_created,tweet_uris
0,fe7bf795931d54de09ba4cd50ecfba977b31011,9d73fd425a8d26211b794dd803141452,2019-10-01T17:46:22,"[http://dbpedia.org/resource/MarketWatch, http..."
1,fe7bf795931d54de09ba4cd50ecfba977b32429,307579d9c3eb0b917cdcba2552596b54,2019-10-01T18:19:53,[http://dbpedia.org/resource/Verizon_Communica...
2,fe7bf795931d54de09ba4cd50ecfba977b34453,4a1f96b340de6f2aad1db949f9bad25d,2019-10-01T19:07:13,[http://dbpedia.org/resource/Minimum_wage]
3,fe7bf795931d54de09ba4cd50ecfba977b1705,3046a0f8f411d3183ce3632751ea2215,2019-10-01T00:47:20,[http://dbpedia.org/resource/Caracas]
4,fe7bf795931d54de09ba4cd50ecfba977b302,7e6eecd2eee91cf250e2858967e8795f,2019-10-01T00:07:33,[http://dbpedia.org/resource/Hulu]
5,fe7bf795931d54de09ba4cd50ecfba977b4683,6b35339ff643b5f7bff6e4edeb6cea93,2019-10-01T02:28:40,[http://dbpedia.org/resource/International_Ene...
6,fe7bf795931d54de09ba4cd50ecfba977b5929,6460b9f0385f7af6e8f769e36ae55360,2019-10-01T03:09:27,[http://dbpedia.org/resource/James_Harden]
7,fe7bf795931d54de09ba4cd50ecfba977b16297,adf9da9791bab931ac5195fb197cb862,2019-10-01T09:40:37,[http://dbpedia.org/resource/Light-on-dark_col...
8,fe7bf795931d54de09ba4cd50ecfba977b27356,63c23496f69cf19dfe612995f85c888a,2019-10-01T16:20:46,[http://dbpedia.org/resource/Manchester_United...
9,fe7bf795931d54de09ba4cd50ecfba977b23313,74e929a19d2515e3124d7a2f99e5dbc9,2019-10-01T14:23:49,[http://dbpedia.org/resource/East_Africa_Time]


# Load cached data (avoid doing expensive data compilation again)

Load MIND topics

In [1]:
import json, pathlib

cache = pathlib.Path("./data_cache/MIND_topics_cache.json")

with cache.open("r", encoding="utf-8") as f:
    qid_to_uri = json.load(f)
topicsLUT = qid_to_uri
qids          = list(qid_to_uri.keys())     # if you need them again
dbpedia_uris_loaded  = list(qid_to_uri.values())

print(f"loaded {len(qid_to_uri):,} DBpedia URIs")
print(dbpedia_uris_loaded[:10])                    # preview


loaded 27,759 DBpedia URIs
['<http://dbpedia.org/resource/Dillsburg,_Pennsylvania>', '<http://dbpedia.org/resource/Duluth_City_Council>', '<http://dbpedia.org/resource/Cody_Whitehair>', '<http://dbpedia.org/resource/Intracoastal_Waterway>', '<http://dbpedia.org/resource/County_highway>', '<http://dbpedia.org/resource/Great_Lakes_Brewing_Company>', '<http://dbpedia.org/resource/Steny_Hoyer>', '<http://dbpedia.org/resource/Rhinebeck_(village),_New_York>', '<http://dbpedia.org/resource/Nissan_Pathfinder>', '<http://dbpedia.org/resource/Dolemite>']


In [2]:
dbpedia_uris = dbpedia_uris_loaded


Load news and entities

In [3]:
import pandas as pd, pathlib
import ast, re


load_path = pathlib.Path("./data_cache/entity_df.pkl")
entity_df = pd.read_pickle(load_path)

print("loaded", len(entity_df), "rows")
# entity_df.head()


entity_df['all_entity_uris'] = entity_df['all_entity_ids'].apply(
    lambda qids: [topicsLUT.get(q) for q in qids]          # None if missing
)
def clean(u):
    if isinstance(u, str):
        return re.sub(r'^.*[/#]', '', u.strip('<>'))
    return None
entity_df['uris_cleaned'] = entity_df['all_entity_uris'].apply(
    lambda uris: [clean(u) for u in uris]
)

entity_df.head()



loaded 51282 rows


Unnamed: 0,news_id,all_entity_ids,all_entity_uris,uris_cleaned
0,N55528,"[Q80976, Q43274, Q9682]","[<http://dbpedia.org/resource/Prince_Philip,_D...","[Prince_Philip,_Duke_of_Edinburgh, Charles_III..."
1,N19639,[Q193583],[<http://dbpedia.org/resource/Adipose_tissue>],[Adipose_tissue]
2,N61837,[Q212],[<http://dbpedia.org/resource/Ukrainian_Soviet...,[Ukrainian_Soviet_Socialist_Republic]
3,N53526,[Q155223],[<http://dbpedia.org/resource/National_Basketb...,[National_Basketball_Association]
4,N38324,"[Q3179593, Q171171, Q371820]","[<http://dbpedia.org/resource/Skin_tag>, <http...","[Skin_tag, Dermatology, Reader's_Digest]"


Import TweetsKb data


In [16]:
import pandas as pd

tweetsKB_df = pd.read_csv(
    './data_cache/tweetsKB_oct2019.csv',
    encoding='utf-8'
)
print("loaded TweetsKB with", len(tweetsKB_df), "records")

loaded TweetsKB with 345 records
