In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import json

# Load the datasets
item_embedding = pd.read_csv('datasets/kaggle/item_embeddings.csv')
course_list = pd.read_csv('datasets/kaggle/course_list.csv')
rating_df = pd.read_csv('datasets/kaggle/rating_df.csv')

# Ensure user IDs are strings
rating_df['user'] = rating_df['user'].astype(str)

# Create a mapping from course code to a unique integer starting from 1
course_id_map = {code: idx + 1 for idx, code in enumerate(course_list['Course List Code'])}
course_name_map = dict(zip(course_list['Course List Code'], course_list['Course Name']))

# Create a mapping from user to a unique integer
user_id_map = {str(uid): idx for idx, uid in enumerate(rating_df['user'].unique())}

# Apply the mappings to the rating data
rating_df['user_id_mapped'] = rating_df['user'].map(user_id_map)
rating_df['course_id_mapped'] = rating_df['item'].map(course_id_map)

# Drop rows with NaN values in the mapped columns (if mapping fails)
rating_df.dropna(subset=['user_id_mapped', 'course_id_mapped'], inplace=True)

# Ensure 'rating' column is binary (0 or 1)
rating_df['rating'] = rating_df['rating'].apply(lambda x: 1 if x > 0 else 0)

# Only keep relevant columns
rating_df = rating_df[['user_id_mapped', 'course_id_mapped', 'rating']]

# Convert mapped IDs to integers
rating_df['user_id_mapped'] = rating_df['user_id_mapped'].astype(int)
rating_df['course_id_mapped'] = rating_df['course_id_mapped'].astype(int)

# Initialize GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# Split the data into train and temp sets using GroupShuffleSplit (ensuring users are not split)
for train_index, temp_index in gss.split(rating_df, groups=rating_df['user_id_mapped']):
    train = rating_df.iloc[train_index]
    temp = rating_df.iloc[temp_index]

# Split the temp set into validation and test sets using GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

for val_index, test_index in gss.split(temp, groups=temp['user_id_mapped']):
    val = temp.iloc[val_index]
    test = temp.iloc[test_index]

# Save the train.txt file
train_file = 'datasets/kaggle/train.txt'
train.to_csv(train_file, sep=' ', header=False, index=False)

# Save the val.txt file
val_file = 'datasets/kaggle/val.txt'
val.to_csv(val_file, sep=' ', header=False, index=False)

# Save the test.txt file
test_file = 'datasets/kaggle/test.txt'
test.to_csv(test_file, sep=' ', header=False, index=False)

# Save the text.txt file (course id and name mapping)
text_file = 'datasets/kaggle/text.txt'
with open(text_file, 'w') as f:
    for code, name in course_name_map.items():
        idx = course_id_map[code]
        f.write(f"{idx} {name}\n")

print("Preprocessing complete. Files saved as train.txt, val.txt, test.txt, and text.txt.")

# Save the mappings for future reference
with open('datasets/kaggle/user_id_map.json', 'w') as f:
    json.dump({str(k): int(v) for k, v in user_id_map.items()}, f)

with open('datasets/kaggle/course_id_map.json', 'w') as f:
    json.dump({str(k): int(v) for k, v in course_id_map.items()}, f)

print("User and course ID mappings saved as JSON files.")


Preprocessing complete. Files saved as train.txt, val.txt, test.txt, and text.txt.
User and course ID mappings saved as JSON files.


In [2]:
train_instances = len(train)
val_instances = len(val)
test_instances = len(test)
print(f"Training instances: {train_instances}")
print(f"Validation instances: {val_instances}")
print(f"Testing instances: {test_instances}")

Training instances: 163314
Validation instances: 34996
Testing instances: 34996


In [3]:
total_split_instances = train_instances + val_instances + test_instances
print(f"Total instances after splitting: {total_split_instances}")
# print(f"Matches original cleaned dataset: {total_split_instances == cleaned_instances}")

Total instances after splitting: 233306


In [6]:
import pandas as pd
import numpy as np
import json

# Load the item embedding CSV file
item_embedding_df = pd.read_csv('datasets/kaggle/item_embeddings.csv')

# Load the course ID mapping from the previous script
with open('datasets/kaggle/course_id_map.json', 'r') as f:
    course_id_map = json.load(f)

# Create a new DataFrame with the correct course IDs
new_item_embedding_df = pd.DataFrame()
new_item_embedding_df['course_id'] = item_embedding_df['Course Name'].map(course_id_map)

# Add the embeddings to the new DataFrame
embedding_columns = item_embedding_df.columns[1:]  # Assuming the first column is 'Course List Code'
new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]

# Sort the DataFrame by the new course IDs
new_item_embedding_df = new_item_embedding_df.sort_values('course_id')

# Extract the embeddings
course_embeddings = new_item_embedding_df[embedding_columns].values

# Save the embeddings as a .npy file
np.save('datasets/kaggle/course_embeddings.npy', course_embeddings)

print("course_embeddings.npy file created successfully!")

# Save the course IDs separately
course_ids = new_item_embedding_df['course_id'].values
np.save('datasets/kaggle/course_ids.npy', course_ids)

print("course_ids.npy file created successfully!")

# Optionally, you can also save the original course codes
course_codes = item_embedding_df['Course Name'].values
np.save('datasets/kaggle/course_codes.npy', course_codes)

print("course_codes.npy file created successfully!")

course_embeddings.npy file created successfully!
course_ids.npy file created successfully!
course_codes.npy file created successfully!


  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embedding_df[embedding_columns] = item_embedding_df[embedding_columns]
  new_item_embed

In [7]:
import numpy as np
import pandas as pd

# อ่านไฟล์ CSV
item_embeddings_df = pd.read_csv("datasets/kaggle/item_embeddings.csv")

# สร้างคอลัมน์ 'Course Index' ที่เริ่มจาก 1
item_embeddings_df['Course Index'] = np.arange(1, len(item_embeddings_df) + 1)

# ลบคอลัมน์ 'Course Name' เพื่อดึงเฉพาะข้อมูล embedding
precomputed_embeddings = item_embeddings_df.drop(columns=['Course Name']).values

# บันทึก embeddings เป็นไฟล์ .npy
np.save("datasets/kaggle/precomputed_embeddings.npy", precomputed_embeddings)

print("Precomputed embeddings saved to 'datasets/kaggle/precomputed_embeddings.npy'")


Precomputed embeddings saved to 'datasets/kaggle/precomputed_embeddings.npy'
