## Count Interactions

In [None]:
# Count the interactions in the SASRec-style file: one interaction per line.
with open("ml-1m.txt", "r") as interactions_file:
    interaction_count = 0
    for _ in interactions_file:
        interaction_count += 1
    print(f"Total Interactions: {interaction_count}")

Total Interactions: 999611


In [None]:
# Count the interactions in the raw ratings file: one interaction per line.
with open("ratings.dat", "r") as ratings_file:
    rating_count = 0
    for _ in ratings_file:
        rating_count += 1
    print(f"Total Interactions: {rating_count}")

Total Interactions: 1000209


## Count user and item frequencies

In [None]:
# from collections import defaultdict

# # Load data
# data = []
# with open("Steam.txt", "r") as file:
#     for line in file:
#         user, item = line.strip().split()
#         data.append((user, item))

# # Count user and item frequencies
# user_freq = defaultdict(int)
# item_freq = defaultdict(int)
# for user, item in data:
#     user_freq[user] += 1
#     item_freq[item] += 1

## Filtering

In [None]:
# import pandas as pd

# # Load the data
# file_path = "ml-1m.txt"
# df = pd.read_csv(file_path, sep=" ", header=None, names=["user_id", "item_id"])

# # Keep only user_ids that have more than 5 interactions
# user_interactions = df['user_id'].value_counts()
# valid_user_ids = user_interactions[user_interactions > 5].index
# filtered_df = df[df['user_id'].isin(valid_user_ids)]

# # Shuffle the user_ids into a random order
# sample_user_ids = filtered_df['user_id'].drop_duplicates().sample(frac=1, random_state=42)
# selected_records = pd.DataFrame(columns=filtered_df.columns)

# # Select whole users without exceeding 100,000 records
# for user_id in sample_user_ids:
#     user_data = filtered_df[filtered_df['user_id'] == user_id]
#     if len(selected_records) + len(user_data) > 100000:
#         break
#     selected_records = pd.concat([selected_records, user_data])

# # Restore the original row order
# selected_records = selected_records.sort_index()

# # Build a new mapping of user_id to 1, 2, 3, ..., n
# unique_user_ids = selected_records['user_id'].drop_duplicates().reset_index(drop=True)
# user_id_mapping = {old_id: new_id + 1 for new_id, old_id in enumerate(unique_user_ids)}
# selected_records['user_id'] = selected_records['user_id'].map(user_id_mapping)

# # Save the new file
# selected_records.to_csv("ml-1m_random.txt", sep=" ", index=False, header=False)


In [None]:
import pandas as pd
import random
import numpy as np

# Load the ratings file; every line is user_id::item_id::rating::timestamp.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
file_path = 'ml-1m-ratings.dat'  # adjust to wherever the file is stored

# The multi-character "::" separator is parsed with the python engine.
ratings_df = pd.read_csv(file_path, sep='::', names=columns, engine='python')

# Keep only the users that have more than 5 interactions.
user_interactions = ratings_df['user_id'].value_counts()
has_enough_history = (user_interactions > 5).to_numpy()
valid_user_ids = user_interactions.index[has_enough_history]
filtered_df = ratings_df.loc[ratings_df['user_id'].isin(valid_user_ids)]

# Pick random users (without repetition) until their combined interaction
# count exceeds 100,000. The user that crosses the threshold is kept, so the
# pool is slightly larger than the target and is down-sampled afterwards.
#
# The candidates are shuffled once and walked in order. This is equivalent to
# drawing unseen users one at a time, but it cannot spin forever when every
# eligible user has been consumed before the threshold is reached, and it
# avoids rescanning the whole frame for each draw.
# NOTE(review): `random` is not seeded here, so the selection differs per run.
interaction_counts = filtered_df['user_id'].value_counts().to_dict()
candidate_users = list(interaction_counts)
random.shuffle(candidate_users)

total_interactions = 0
selected_user = []
for random_user in candidate_users:
    selected_user.append(random_user)
    total_interactions += interaction_counts[random_user]
    if total_interactions > 100000:
        break
print(selected_user)

# Restrict the working set to the interactions of the selected users.
filtered_df = filtered_df[filtered_df['user_id'].isin(selected_user)]

# Draw the interaction sample directly (at most 100,000 rows).
sample_size = min(100000, len(filtered_df))
sampled_interactions = filtered_df.sample(n=sample_size, random_state=42)

# Put the sampled interactions in chronological order.
selected_records = sampled_interactions.sort_values(by=['timestamp'])
selected_records = selected_records.reset_index(drop=True)

# Sampling may have thinned some users out: keep only the users that still
# have at least 5 interactions in the sample.
per_user_counts = selected_records['user_id'].value_counts()
valid_user_ids = per_user_counts.index[(per_user_counts >= 5).to_numpy()]
keep_rows = selected_records['user_id'].isin(valid_user_ids)
selected_records = selected_records.loc[keep_rows].reset_index(drop=True)

# Re-number user_id and item_id as 1..n in order of first appearance.
unique_user_ids = selected_records['user_id'].drop_duplicates().reset_index(drop=True)
user_id_mapping = {
    old_id: new_id for new_id, old_id in enumerate(unique_user_ids, start=1)
}
selected_records['user_id'] = selected_records['user_id'].map(user_id_mapping)

unique_item_ids = selected_records['item_id'].drop_duplicates().reset_index(drop=True)
item_id_mapping = {
    old_id: new_id for new_id, old_id in enumerate(unique_item_ids, start=1)
}
selected_records['item_id'] = selected_records['item_id'].map(item_id_mapping)

# Split the data into train and test sets for NCF (leave-one-out): for each
# user the last row in `selected_records` order is the test interaction and
# every earlier row goes to train. 99 items the user never interacted with
# are drawn as negatives for evaluation.
train_data = []
test_data = []
negative_samples = []

num_items = int(selected_records['item_id'].max())
print(num_items)
# Item ids were re-numbered to 1..num_items, so the candidate pool has to be
# that closed range. range(num_items) would offer the non-existent item 0 as
# a negative and could never offer the highest item id.
all_items = set(range(1, num_items + 1))

# One grouped pass instead of a full boolean-mask scan per user. sort=False
# keeps users in order of first appearance and rows in their existing order.
for user_id, user_data in selected_records.groupby('user_id', sort=False):
    items = user_data['item_id'].tolist()
    ratings = user_data['rating'].tolist()
    timestamps = user_data['timestamp'].tolist()

    # The last interaction is the test instance.
    test_data.append([user_id, items[-1], ratings[-1], timestamps[-1]])

    # Everything before it is training data.
    train_data.extend(
        [user_id, item, rating, timestamp]
        for item, rating, timestamp in zip(items[:-1], ratings[:-1], timestamps[:-1])
    )

    # Sample negatives from the items this user never interacted with.
    # np.random.choice raises ValueError if fewer than 99 candidates remain.
    negative_items = sorted(all_items.difference(items))
    negative_samples.append(
        [user_id] + list(np.random.choice(negative_items, size=99, replace=False))
    )

# Build the train and test DataFrames.
train_df = pd.DataFrame(train_data, columns=['user_id', 'item_id', 'rating', 'timestamp'])
test_df = pd.DataFrame(test_data, columns=['user_id', 'item_id', 'rating', 'timestamp'])

# Write the NCF rating files (tab-separated, rating and timestamp included).
train_df.to_csv("ml-sample-100k.train.rating", sep="\t", index=False, header=False)
test_df.to_csv("ml-sample-100k.test.rating", sep="\t", index=False, header=False)

# Map each user to their held-out test item once, instead of filtering
# test_df again for every user (which is quadratic in the number of users).
# setdefault keeps the first test row for a user.
test_item_by_user = {}
for test_user, test_item in zip(test_df['user_id'], test_df['item_id']):
    test_item_by_user.setdefault(test_user, test_item)

# Format the negatives, one line per user:
#   (user_id,test_item_id)<TAB>neg_1<TAB>neg_2 ...
negative_samples_formatted = [
    f"({user_id},{test_item_by_user[user_id]})\t" + "\t".join(map(str, negatives))
    for user_id, *negatives in negative_samples
]

# Write the NCF test-negative file.
with open("ml-sample-100k.test.negative", "w") as f:
    f.write("\n".join(negative_samples_formatted))

# Write the SASRec input file: "user_id item_id" pairs, space-separated,
# ordered by user_id and then by timestamp.
sasrec_data = (
    selected_records.loc[:, ['user_id', 'item_id', 'timestamp']]
    .sort_values(by=['user_id', 'timestamp'])
    .reset_index(drop=True)
)
sasrec_pairs = sasrec_data.loc[:, ['user_id', 'item_id']]
sasrec_pairs.to_csv("ml-sample-100k.txt", sep=" ", index=False, header=False)


[4982, 884, 5222, 1116, 1798, 3772, 4024, 3349, 4634, 2649, 427, 4253, 4638, 1604, 5932, 4753, 5954, 1359, 3425, 1073, 2913, 402, 790, 4856, 5429, 3566, 5926, 910, 1145, 4964, 3301, 5843, 2117, 3893, 771, 2972, 720, 5191, 3142, 1991, 1739, 1972, 2299, 2921, 5175, 1607, 2835, 3654, 1559, 5329, 4425, 1072, 4774, 5395, 5305, 3852, 5874, 3681, 2353, 1542, 219, 1860, 5054, 4193, 390, 1485, 5113, 3499, 4316, 1428, 1827, 1732, 3273, 4049, 2222, 801, 4861, 5677, 56, 1866, 3969, 1903, 4608, 1497, 3792, 2087, 5651, 717, 1067, 1484, 2426, 2946, 3492, 974, 1293, 4631, 2066, 130, 919, 13, 1332, 1624, 1473, 5115, 5239, 824, 4465, 1012, 3131, 5829, 4131, 4732, 743, 1163, 3262, 4236, 2231, 1260, 3073, 5501, 2900, 1666, 1000, 1741, 4817, 4116, 2845, 5279, 3192, 870, 1312, 457, 2192, 3545, 590, 4747, 4038, 3867, 3463, 1121, 2251, 1958, 3688, 2161, 2198, 2023, 1611, 3758, 4613, 1834, 5613, 1642, 1889, 3584, 1, 3487, 2331, 2536, 1488, 2896, 5255, 5275, 2108, 2440, 192, 5806, 4334, 3189, 72, 1687, 2226, 14