#STEP 1: Mount Drive and Load Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/RS_Latest/train-2.txt'

with open(file_path, 'r') as f:
    lines = f.readlines()

print(f"Loaded {len(lines)} users from train-2.txt")


Mounted at /content/drive
Loaded 52644 users from train-2.txt


# STEP 2: Parse and Create Mappings

In [7]:
import numpy as np
from collections import defaultdict
from scipy.sparse import lil_matrix

# Parse interactions
interactions = []
for line in lines:
    items = line.strip().split()
    if len(items) < 2:
        continue

    user_id = int(items[0])
    item_ids = [int(x) for x in items[1:]]

    for item_id in item_ids:
        interactions.append((user_id, item_id))

print(f"Total interactions: {len(interactions):,}")

# Create mappings
users = sorted(set([u for u, i in interactions]))
items = sorted(set([i for u, i in interactions]))

user2idx = {u: idx for idx, u in enumerate(users)}
item2idx = {i: idx for idx, i in enumerate(items)}
idx2user = {idx: u for u, idx in user2idx.items()}
idx2item = {idx: i for i, idx in item2idx.items()}

n_users = len(users)
n_items = len(items)

print(f"Users: {n_users:,}, Items: {n_items:,}")


Total interactions: 2,380,730
Users: 52,643, Items: 91,599


#STEP 3: Create Full Matrix

In [8]:
# Create full interaction matrix
full_matrix = lil_matrix((n_users, n_items), dtype=np.float32)

for user_id, item_id in interactions:
    u_idx = user2idx[user_id]
    i_idx = item2idx[item_id]
    full_matrix[u_idx, i_idx] = 1.0

full_matrix = full_matrix.tocsr()

print(f"Matrix: {full_matrix.shape}, Non-zero: {full_matrix.nnz:,}")


Matrix: (52643, 91599), Non-zero: 2,380,730


#STEP 4: Smart Train/Test Split

In [9]:
# BETTER SPLIT: Keep more recent items for testing
train_matrix = lil_matrix((n_users, n_items), dtype=np.float32)
test_dict = {}

np.random.seed(42)

for u_idx in range(n_users):
    items_idx = full_matrix[u_idx].nonzero()[1]

    if len(items_idx) < 5:
        # Keep all for users with few items
        for i_idx in items_idx:
            train_matrix[u_idx, i_idx] = 1.0
        continue

    # Shuffle
    np.random.shuffle(items_idx)

    # Split: Keep at least 3 items for training
    n_test = min(len(items_idx) // 4, len(items_idx) - 3)

    test_items = items_idx[:n_test]
    train_items = items_idx[n_test:]

    for i_idx in train_items:
        train_matrix[u_idx, i_idx] = 1.0

    if len(test_items) > 0:
        test_dict[u_idx] = test_items.tolist()

train_matrix = train_matrix.tocsr()

print(f"Train: {train_matrix.nnz:,}, Test users: {len(test_dict):,}")


Train: 1,800,056, Test users: 52,643


#STEP 5: Install and Train OPTIMIZED ALS

In [10]:
!pip install -q implicit

from implicit.als import AlternatingLeastSquares
import time

print("Training OPTIMIZED ALS...")

als_model = AlternatingLeastSquares(
    factors=300,              # Very high capacity
    regularization=0.0001,    # Very low regularization
    iterations=50,            # More iterations
    alpha=1.0,
    num_threads=4,
    random_state=42
)

start = time.time()
als_model.fit(train_matrix.T * 150)  # Very high confidence
print(f"Training completed in {time.time()-start:.1f}s")

# Extract factors
if hasattr(als_model.user_factors, 'to_numpy'):
    user_factors = als_model.user_factors.to_numpy()
    item_factors = als_model.item_factors.to_numpy()
else:
    user_factors = np.array(als_model.user_factors)
    item_factors = np.array(als_model.item_factors)

# Swap if needed
if user_factors.shape[0] != n_users:
    user_factors, item_factors = item_factors, user_factors

print(f"Factors: User={user_factors.shape}, Item={item_factors.shape}")


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone


  check_blas_config()


Training OPTIMIZED ALS...


  0%|          | 0/50 [00:00<?, ?it/s]

Training completed in 2678.6s
Factors: User=(52643, 300), Item=(91599, 300)


#STEP 6: Quick Evaluation

In [11]:
# Test on sample users
print("\nQuick evaluation on 1000 users...")

ndcg_scores = []
hit_counts = []

for u_idx in list(test_dict.keys())[:1000]:
    # Get scores
    scores = user_factors[u_idx] @ item_factors.T

    # Mask training
    train_items = train_matrix[u_idx].nonzero()[1]
    scores[train_items] = -np.inf

    # Top 20
    top_20 = np.argsort(-scores)[:20]

    # Hits
    hits = [i for i in top_20 if i in test_dict[u_idx]]
    hit_counts.append(len(hits))

    # NDCG
    rel = [1 if i in test_dict[u_idx] else 0 for i in top_20]
    dcg = sum([r / np.log2(i+2) for i, r in enumerate(rel)])
    idcg = sum([r / np.log2(i+2) for i, r in enumerate(sorted(rel, reverse=True))])

    if idcg > 0:
        ndcg_scores.append(dcg / idcg)

print(f"\nRESULTS:")
print(f"  NDCG@20: {np.mean(ndcg_scores):.4f}")
print(f"  Avg hits per user: {np.mean(hit_counts):.2f}")
print(f"  Users with hits: {sum([1 for h in hit_counts if h > 0])}/1000")



Quick evaluation on 1000 users...

RESULTS:
  NDCG@20: 0.4866
  Avg hits per user: 1.42
  Users with hits: 642/1000


#STEP 7: Generate Final Submission

In [12]:
print("\nGenerating submission file...")

output_path = '/content/drive/MyDrive/RS_Latest/final_submission.txt'

with open(output_path, 'w') as f:
    for u_idx in range(n_users):
        # Calculate scores
        scores = user_factors[u_idx] @ item_factors.T

        # Mask training
        train_items = train_matrix[u_idx].nonzero()[1]
        scores[train_items] = -np.inf

        # Handle edge case: if all scores are -inf
        if np.all(np.isinf(scores)):
            # Use item popularity
            item_pop = np.array(train_matrix.sum(axis=0)).flatten()
            top_20_idx = np.argsort(-item_pop)[:20]
        else:
            top_20_idx = np.argsort(-scores)[:20]

        # Convert to original IDs
        top_20_items = [idx2item[i] for i in top_20_idx]

        # Write line
        user_id = idx2user[u_idx]
        line = str(user_id) + " " + " ".join(map(str, top_20_items))
        f.write(line + "\n")

        if (u_idx + 1) % 10000 == 0:
            print(f"  {u_idx+1:,}/{n_users:,}")

print(f"\nSubmission saved: {output_path}")

# Verify
with open(output_path, 'r') as f:
    lines = f.readlines()

print(f"Total lines: {len(lines):,}")
print(f"\nFirst 3 lines:")
for i in range(3):
    parts = lines[i].strip().split()
    print(f"  User {parts[0]}: {len(parts)-1} items")

print("\nDONE - Ready for submission!")



Generating submission file...
  10,000/52,643
  20,000/52,643
  30,000/52,643
  40,000/52,643
  50,000/52,643

Submission saved: /content/drive/MyDrive/RS_Latest/final_submission.txt
Total lines: 52,643

First 3 lines:
  User 0: 20 items
  User 1: 20 items
  User 2: 20 items

DONE - Ready for submission!
