In [13]:
# Cell 1: Imports
import os
import pandas as pd
from pathlib import Path
from typing import List, Tuple

In [15]:
# Location of the raw interaction file and of the directory that receives
# the filtered exports.
INPUT = Path('Dataset', 'data.txt')
OUTPUT = Path('Processed_Data')

In [17]:
# Each line of the input file is read as "<user> <item> <item> ...":
# the first token is the user id, every following token is one item id.
raw_data = []
with open(INPUT, 'r') as f:
    for line in f:
        parts = line.split()
        # A line needs a user id and at least one item to contribute anything.
        if len(parts) >= 2:
            user_id = int(parts[0])
            raw_data.extend((user_id, int(item_id)) for item_id in parts[1:])

# One row per (user, item) pair.
df = pd.DataFrame(raw_data, columns=['user', 'item'])

print(f"Loaded {len(df):,} interactions.")

Loaded 2,380,730 interactions.


In [23]:
def k_core_filter(df: pd.DataFrame, k: int) -> pd.DataFrame:
    """
    Iteratively removes users and items with fewer than k interactions
    until convergence (stable state).

    Each pass first drops the rows of every user that appears in fewer than
    ``k`` rows, then drops the rows of every item that appears in fewer than
    ``k`` of the remaining rows. Dropping items can push a user back under
    the threshold (and vice versa), so passes repeat until one of them
    removes nothing.

    Args:
        df: Interactions with a 'user' and an 'item' column, one row per
            interaction. The frame is not modified.
        k: Minimum number of rows each surviving user and item must have.

    Returns:
        A new DataFrame with the surviving rows, keeping their original
        index labels. It may be empty.
    """
    # No up-front copy is needed: the loop body always runs at least once and
    # boolean-mask selection returns a new frame, so the input is never
    # modified and is never the object that gets returned.
    df_clean = df

    while True:
        start_len = len(df_clean)

        # Users are filtered first; items are then counted on what is left.
        for column in ('user', 'item'):
            counts = df_clean[column].value_counts()
            keep = counts[counts >= k].index
            df_clean = df_clean[df_clean[column].isin(keep)]

        # A full pass that removed no rows means the result is stable.
        if len(df_clean) == start_len:
            return df_clean

In [25]:
def save_txt(df: pd.DataFrame, output_path: Path):
    """
    Write the interactions as one text line per user.

    Rows are grouped by the 'user' column; each output line is the user id
    followed by that user's 'item' values, all separated by single spaces:
    User Item1 Item2 Item3...

    The file at output_path is opened in 'w' mode, so existing content is
    replaced.
    """
    # Build every line before touching the file, so grouping happens first.
    lines = [
        f"{user_id} {' '.join(str(item) for item in items.tolist())}\n"
        for user_id, items in df.groupby('user')['item']
    ]

    with open(output_path, 'w') as f:
        f.writelines(lines)

In [33]:
# k values to export; each one produces its own filtered file.
THRESHOLDS = [2, 3, 5]

OUTPUT.mkdir(parents=True, exist_ok=True)

print(f"{'Threshold':<10} {'Interactions':<15} {'Users':<10} {'Items':<10} {'Status':<10}")
print("-" * 65)

for k in THRESHOLDS:
    # Every threshold is filtered from the full interaction table.
    df_clean = k_core_filter(df, k)

    n_interactions = len(df_clean)
    n_users = df_clean['user'].nunique()
    n_items = df_clean['item'].nunique()

    filename = f"data_k{k}.txt"
    output_path = OUTPUT / filename

    save_txt(df_clean, output_path)

    # Pad the whole "K=<k>" label to the header's 10-character column;
    # padding only the number made the cell 11 wide and shifted every row.
    label = f"K={k}"
    print(f"{label:<10} {n_interactions:<15,} {n_users:<10,} {n_items:<10,} {'Saved':<10}")


Threshold  Interactions    Users      Items      Status    
-----------------------------------------------------------------
K=2         2,379,949       52,643     90,818     Saved     
K=3         2,378,453       52,643     90,070     Saved     
K=5         2,372,615       52,642     88,416     Saved     
