In [14]:
import os
import ast
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# MIND Dataset Import

The Microsoft News Dataset (MIND) contains user behavior and news article data for recommendation systems.

## Setup Instructions:
1. Install Kaggle API: `pip install kaggle`
2. Set up Kaggle credentials (place kaggle.json in ~/.kaggle/)
3. Download dataset: `kaggle datasets download -d arashnic/mind-news-dataset`
4. Extract to your dataset folder

Dataset URL: https://www.kaggle.com/datasets/arashnic/mind-news-dataset

In [15]:
# Import MIND Dataset - Behaviors and News
# Update the path below to point to your extracted MIND dataset folder
MIND_DATA_PATH = "dataset/"  # Change this to your dataset path

print("Loading MIND Dataset...")
print("="*50)

# Load behaviors data
# Format: impression_id, user_id, time, history, impressions
try:
    behaviors_columns = ['impression_id', 'user_id', 'time', 'history', 'impressions']
    behaviors_df = pd.read_csv(
        os.path.join(MIND_DATA_PATH, 'behaviors.tsv'), 
        sep='\t', 
        header=None, 
        names=behaviors_columns,
        encoding='utf-8'
    )
    print(f"✓ Behaviors dataset loaded successfully!")
    print(f"  Shape: {behaviors_df.shape}")
    print(f"  Unique users: {behaviors_df['user_id'].nunique():,}")
    
except FileNotFoundError:
    print("❌ behaviors.tsv not found.")
    print("   Please download from: https://www.kaggle.com/datasets/arashnic/mind-news-dataset")
    behaviors_df = None

# Load news data  
# Format: news_id, category, subcategory, title, abstract, url, title_entities, abstract_entities
try:
    news_columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
    news_df = pd.read_csv(
        os.path.join(MIND_DATA_PATH, 'news.tsv'), 
        sep='\t', 
        header=None, 
        names=news_columns,
        encoding='utf-8'
    )
    print(f"✓ News dataset loaded successfully!")
    print(f"  Shape: {news_df.shape}")
    print(f"  Unique categories: {news_df['category'].nunique()}")
    
except FileNotFoundError:
    print("❌ news.tsv not found.")
    news_df = None

print("\n" + "="*50)

# Display sample data
if behaviors_df is not None:
    print("=== BEHAVIORS DATA SAMPLE ===")
    print(behaviors_df.head())
    print(f"\nData types:\n{behaviors_df.dtypes}")

if news_df is not None:
    print("\n=== NEWS DATA SAMPLE ===")
    print(news_df.head())
    print(f"\nData types:\n{news_df.dtypes}")

Loading MIND Dataset...
✓ Behaviors dataset loaded successfully!
  Shape: (156965, 5)
  Unique users: 50,000
✓ News dataset loaded successfully!
  Shape: (51282, 8)
  Unique categories: 17

=== BEHAVIORS DATA SAMPLE ===
   impression_id user_id                   time  \
0              1  U13740  11/11/2019 9:05:58 AM   
1              2  U91836  11/12/2019 6:11:30 PM   
2              3  U73700  11/14/2019 7:01:48 AM   
3              4  U34670  11/11/2019 5:28:05 AM   
4              5   U8125  11/12/2019 4:11:21 PM   

                                             history  \
0  N55189 N42782 N34694 N45794 N18445 N63302 N104...   
1  N31739 N6072 N63045 N23979 N35656 N43353 N8129...   
2  N10732 N25792 N7563 N21087 N41087 N5445 N60384...   
3  N45729 N2203 N871 N53880 N41375 N43142 N33013 ...   
4                        N10078 N56514 N14904 N33740   

                                         impressions  
0                                  N55689-1 N35729-0  
1  N20678-0 N39317-0 N5811

In [23]:
news_df['title_entities'][0]

'[{"Label": "Prince Philip, Duke of Edinburgh", "Type": "P", "WikidataId": "Q80976", "Confidence": 1.0, "OccurrenceOffsets": [48], "SurfaceForms": ["Prince Philip"]}, {"Label": "Charles, Prince of Wales", "Type": "P", "WikidataId": "Q43274", "Confidence": 1.0, "OccurrenceOffsets": [28], "SurfaceForms": ["Prince Charles"]}, {"Label": "Elizabeth II", "Type": "P", "WikidataId": "Q9682", "Confidence": 0.97, "OccurrenceOffsets": [11], "SurfaceForms": ["Queen Elizabeth"]}]'

In [22]:
# core
import pathlib, random, gzip, re
from typing import List, Tuple

# RDF handling
import rdflib

# progress bar (optional: pip install tqdm)
from tqdm.auto import tqdm

# ── I/O paths & parameters ─────────────────────────────────────
INPUT   = pathlib.Path("dataset/month_2019-10.n3")     # .n3  *or*  .n3.gz
FRACTION = 0.10        # keep 10 %
SEED     = 2025        # set None for non‑reproducible sampling
SHOW_BAR = True        # turn off if you dislike progress bars

def open_text(path: pathlib.Path, mode="rt"):
    """
    Open a *.n3* or *.n3.gz* as UTF‑8 text.
    Only the gzip *stream* is decompressed – nothing ever written to disk.
    """
    return (
        gzip.open(path, mode, encoding="utf-8", newline="")
        if path.suffix == ".gz"
        else path.open(mode, encoding="utf-8", newline="")
    )

_SUBJ_RE = re.compile(r"^<([^>]+)>")   # subject IRI at start of a triple line


def sample_to_graph(
    infile: pathlib.Path,
    frac: float = FRACTION,
    seed: int | None = SEED,
    show_progress: bool = SHOW_BAR
) -> rdflib.Graph:
    """
    One‑pass Bernoulli sampling:
    • Reads the dump line‑by‑line (constant memory).
    • Treats consecutive triples with the same subject IRI as *one tweet*.
    • Keeps each tweet with probability *frac* and parses it into an rdflib.Graph.
    Returns the populated Graph.
    """
    if not 0 < frac < 1:
        raise ValueError("`frac` must be in (0, 1).")
    if seed is not None:
        random.seed(seed)

    g = rdflib.Graph()
    current_subj: str | None = None
    buffer: List[str] = []

    # rough line count → nicer tqdm bar (skip for >10 GB to stay cheap)
    total_lines = None
    if show_progress and infile.stat().st_size < 10_000_000_000:
        with open_text(infile, "rt") as probe:
            total_lines = sum(1 for _ in probe)

    with open_text(infile, "rt") as fin:
        iterator = tqdm(fin, total=total_lines, unit="lines") if show_progress else fin

        for line in iterator:
            if not line.strip():                      # skip blank lines
                continue

            subj_match = _SUBJ_RE.match(line)
            subj = subj_match.group(1) if subj_match else None

            # still within the same tweet?
            if current_subj is None or subj == current_subj:
                buffer.append(line)
                current_subj = subj
                continue

            # tweet boundary reached → maybe add the buffered tweet to Graph
            if random.random() < frac:
                g.parse(data="".join(buffer), format="n3")
            buffer = [line]                           # start next tweet
            current_subj = subj

        # handle final buffered tweet (EOF)
        if buffer and random.random() < frac:
            g.parse(data="".join(buffer), format="n3")

    print(f"✅ Sampling done – Graph contains {len(g):,} triples.")
    return g

tweets_graph = sample_to_graph(INPUT)


259388072lines [02:06, 2048560.37lines/s]



✅ Sampling done – Graph contains 0 triples.


Get all WIKI IDs.

In [25]:
import json

def extract_entity_ids(row):
    ids = set()
    # Process both title_entities and abstract_entities
    for field in ['title_entities', 'abstract_entities']:
        value = row.get(field, None)
        # If it's a JSON string, parse it
        if isinstance(value, str):
            try:
                entities = json.loads(value)
            except json.JSONDecodeError:
                entities = []
        # If it's already a list (e.g. pre‑parsed), use it directly
        elif isinstance(value, list):
            entities = value
        # Otherwise (NaN, None, float, etc.), skip
        else:
            entities = []
        # Extract WikidataId from each entity dict
        for ent in entities:
            wid = ent.get('WikidataId')
            if isinstance(wid, str):
                ids.add(wid)
    return list(ids)

# Apply to add the new column without repeats
news_df['all_entity_ids'] = news_df.apply(extract_entity_ids, axis=1)

# (Optional) Verify the result
print(news_df[['news_id', 'all_entity_ids']].head())


  news_id                all_entity_ids
0  N55528       [Q43274, Q80976, Q9682]
1  N19639                     [Q193583]
2  N61837                        [Q212]
3  N53526                     [Q155223]
4  N38324  [Q171171, Q3179593, Q371820]


In [None]:
# Method 1: Using a set comprehension
unique_ids = list({
    eid
    for ids in news_df['all_entity_ids']
    for eid in ids
})
print(unique_ids)

# Method 2: Iteratively updating a set
unique_ids_set = set()
for ids in news_df['all_entity_ids']:
    unique_ids_set.update(ids)
unique_ids = list(unique_ids_set)
print("Number of unique entity IDs:", len(unique_ids))
print(len(unique_ids))


['Q1130536', 'Q54110533', 'Q1922815', 'Q7460947', 'Q40816', 'Q1926987', 'Q617392', 'Q862148', 'Q16902626', 'Q754085', 'Q461761', 'Q5105990', 'Q223243', 'Q6145814', 'Q606169', 'Q228598', 'Q1398761', 'Q2985114', 'Q1083295', 'Q8021208', 'Q6689402', 'Q16762856', 'Q5303358', 'Q2471422', 'Q2365823', 'Q178750', 'Q7567996', 'Q309590', 'Q3026879', 'Q21562062', 'Q6879721', 'Q714646', 'Q6830166', 'Q6063925', 'Q19881717', 'Q7772825', 'Q1133834', 'Q15216588', 'Q1771875', 'Q164134', 'Q586285', 'Q62027184', 'Q4163743', 'Q16196038', 'Q6386365', 'Q169948', 'Q18740945', 'Q495845', 'Q861206', 'Q4955939', 'Q2743544', 'Q269110', 'Q3492300', 'Q2161311', 'Q42005', 'Q349892', 'Q165325', 'Q64744044', 'Q6756362', 'Q5919677', 'Q16919664', 'Q1461', 'Q959885', 'Q3037967', 'Q465811', 'Q7954442', 'Q1255508', 'Q5359555', 'Q1630304', 'Q156291', 'Q5211451', 'Q5260456', 'Q5161478', 'Q1017468', 'Q7700055', 'Q49078', 'Q19947779', 'Q487907', 'Q1013717', 'Q3449348', 'Q3644642', 'Q1478077', 'Q44696', 'Q25274', 'Q378878', 'Q7