In [5]:
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
import pickle
from tqdm import tqdm

In [7]:
file_path = os.path.join('Dataset', 'data.txt')

# 1. Parse the text file into (user, item) pairs.
# Every line is split on whitespace: the first token is read as the user id
# and each remaining token as one item id belonging to that user.
raw_data = []
with open(file_path, 'r') as fh:
    # tqdm wraps the file handle directly, so it reports a running line count
    # (no total is supplied).
    for text_line in tqdm(fh, desc="Parsing Lines"):
        tokens = text_line.split()
        # Lines with fewer than two tokens carry no item and are skipped.
        if len(tokens) >= 2:
            user_id = int(tokens[0])
            # One output row per item on the line
            raw_data.extend((user_id, int(tok)) for tok in tokens[1:])

# 2. Convert to DataFrame
df = pd.DataFrame(data=raw_data, columns=['user', 'item'])

print(f" Loaded into DataFrame. Shape: {df.shape}")
print(df.head())

Loading Dataset\data.txt...


Parsing Lines: 52644it [00:02, 21486.37it/s]


 Loaded into DataFrame. Shape: (2380730, 2)
   user   item
0     0  28261
1     0    388
2     0   5731
3     0    401
4     0  28284


In [9]:
# Cell 2: Iterative K-Core Filtering
#
# Alternately drops users and items that have fewer than MIN_INTERACTIONS
# rows. Removing an item can push a user below the threshold (and vice versa),
# so the two passes repeat until a full round leaves the row count unchanged.

# Minimum number of rows a user / an item must keep (e.g., 5 or 10)
MIN_INTERACTIONS = 5

print(f"Starting K-Core Filtering (min={MIN_INTERACTIONS})...")

converged = False
while not converged:
    rows_before = len(df)

    # Pass 1: keep only rows whose user still has enough interactions
    u_counts = df['user'].value_counts()
    kept_users = u_counts.loc[u_counts >= MIN_INTERACTIONS].index
    df = df.loc[df['user'].isin(kept_users)]

    # Pass 2: same rule for items, counted on the already user-filtered rows
    i_counts = df['item'].value_counts()
    kept_items = i_counts.loc[i_counts >= MIN_INTERACTIONS].index
    df = df.loc[df['item'].isin(kept_items)]

    # A round that removed nothing means every survivor meets the threshold
    converged = len(df) == rows_before

print(f"Final Data: {len(df):,} interactions.")

Starting K-Core Filtering (min=5)...
Final Data: 2,372,615 interactions.


In [11]:
# Cell 3: Map IDs and Create Sparse Matrix
# (np and sp are already imported in the notebook's first cell.)

# 1. Map to 0..N-1 indices
# Convert each ID column to a categorical once; the integer codes become the
# matrix row/column indices and the categories are kept for the reverse lookup.
user_cat = df['user'].astype('category')
item_cat = df['item'].astype('category')
df['user_idx'] = user_cat.cat.codes
df['item_idx'] = item_cat.cat.codes

# 2. Create Lookup Dictionaries (index -> original ID, to translate back later)
# Code k always refers to categories[k], so enumerating the categories yields
# the same mapping as pairing the two columns row by row, without iterating
# over every interaction.
user_map = dict(enumerate(user_cat.cat.categories.tolist()))
item_map = dict(enumerate(item_cat.cat.categories.tolist()))

# 3. Create the Sparse Matrix (CSR Format)
# Every category occurs in the data, so the number of categories equals
# max(code) + 1.
n_users = len(user_map)
n_items = len(item_map)

print(f"Matrix Dimensions: {n_users:,} Users x {n_items:,} Items")

rows = df['user_idx'].values
cols = df['item_idx'].values
data = np.ones(len(df)) # Implicit feedback (1 = interacted)

matrix = sp.csr_matrix((data, (rows, cols)), shape=(n_users, n_items))

# Building CSR from (data, (row, col)) triples sums entries that share the
# same (row, col). Reset every stored value to 1 so a repeated (user, item)
# pair in the input still means "interacted" rather than a count.
matrix.data[:] = 1.0

print("Sparse Matrix created.")

Matrix Dimensions: 52,642 Users x 88,416 Items
Sparse Matrix created.


In [13]:
# Cell 4: Split Data 80/20 (Masking)
#
# Each stored interaction is assigned independently to train or test, so the
# 80/20 ratio holds approximately over the whole matrix.

print("Splitting Data 80/20...")

# Fixed seed so the same interactions are selected on every run
np.random.seed(42)

# One uniform draw per stored value: True = Test (20%), False = Train (80%)
mask = np.random.rand(len(matrix.data)) < 0.2
test_positions = np.flatnonzero(mask)
train_positions = np.flatnonzero(~mask)

# Train: start from the full matrix, blank the test entries, then drop the
# explicit zeros so they no longer count as stored interactions.
train_matrix = matrix.copy()
train_matrix.data[test_positions] = 0
train_matrix.eliminate_zeros()

# Test: the complement - blank the train entries instead.
test_matrix = matrix.copy()
test_matrix.data[train_positions] = 0
test_matrix.eliminate_zeros()

print(f"Train Interactions: {train_matrix.nnz:,}")
print(f"Test Interactions:  {test_matrix.nnz:,}")

Splitting Data 80/20...
Train Interactions: 1,898,587
Test Interactions:  474,028


In [15]:
# Cell 5: Save Matrices and Maps
import scipy.sparse as sp
import pickle

print("Saving data to disk...")

# 1. Sparse matrices -> .npz files in the current working directory
for npz_path, sparse_obj in (
    ('train_matrix.npz', train_matrix),
    ('test_matrix.npz', test_matrix),
):
    sp.save_npz(npz_path, sparse_obj)

# 2. Index -> original-ID lookup dicts -> pickle files
for pkl_path, lookup in (
    ('user_map.pkl', user_map),
    ('item_map.pkl', item_map),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(lookup, f)

# Summary of the files written above
print(" Successfully saved:")
print(" - train_matrix.npz")
print(" - test_matrix.npz")
print(" - user_map.pkl")
print(" - item_map.pkl")

Saving data to disk...
 Successfully saved:
 - train_matrix.npz
 - test_matrix.npz
 - user_map.pkl
 - item_map.pkl


In [1]:
# Cell 1: Imports & Load Data
import pickle
import numpy as np
import scipy.sparse as sp
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm

def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


print("Loading data from disk...")

# 1. Sparse train / test matrices stored as .npz files
train_matrix = sp.load_npz('train_matrix.npz')
test_matrix = sp.load_npz('test_matrix.npz')

# 2. Pickled lookup maps for users and items
user_map = _read_pickle('user_map.pkl')
item_map = _read_pickle('item_map.pkl')

# Shapes are printed as a quick check that both matrices loaded
print(f"Loaded Train Matrix: {train_matrix.shape}")
print(f"Loaded Test Matrix:  {test_matrix.shape}")

Loading data from disk...
✅ Loaded Train Matrix: (52642, 88416)
✅ Loaded Test Matrix:  (52642, 88416)
