In [1]:
import pandas as pd

# Read the CSV files
df1 = pd.read_csv('../thairobotics/anonymized_combined_output_with_categories.csv')
df2 = pd.read_csv('../coursera/filtered_usage_course.csv')

# Select relevant columns from df1
df1_selected = df1[['Email', 'Course Name', 'Category']]

# Select relevant columns from df2 and rename them to match df1
df2_selected = df2[['Email', 'Course', 'Domain']]
df2_selected.columns = ['Email', 'Course Name', 'Category']

# Concatenate the DataFrames
df_combined = pd.concat([df1_selected, df2_selected])

# Print the resulting DataFrame
print(df_combined)

                           Email               Course Name  \
0                 user1@anon.com    Support Vector Machine   
1                 user2@anon.com    Support Vector Machine   
2                 user3@anon.com    Support Vector Machine   
3                 user4@anon.com    Support Vector Machine   
4                 user5@anon.com  Dimensionality Reduction   
...                          ...                       ...   
2753     nutcha.tavo@kmutt.ac.th       Material Processing   
2754  wannaporn.ruam@kmutt.ac.th      Ferrous Technology I   
2755  wannaporn.ruam@kmutt.ac.th      Ferrous Technology I   
2756  wannaporn.ruam@kmutt.ac.th      Ferrous Technology I   
2757  wannaporn.ruam@kmutt.ac.th      Ferrous Technology I   

                              Category  
0                  Computer Science AI  
1                  Computer Science AI  
2                  Computer Science AI  
3                  Computer Science AI  
4                  Computer Science AI  
...      

In [2]:
df_combined.to_csv('combined_output.csv', index=False)

In [6]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# Load the anonymized CSV file
df = pd.read_csv('combined_output.csv')

# Step 1: Create a mapping from course name to a unique ID (starting from 1 for text.txt)
course_name_map = {name: idx + 1 for idx, name in enumerate(df['Course Name'].unique())}

# Step 2: Apply the mapping to the DataFrame
df['course_id_mapped'] = df['Course Name'].map(course_name_map)

# Step 3: Create user mappings to unique IDs (starting from 1 for consistency in train/val/test split)
user_name_map = {name: idx + 1 for idx, name in enumerate(df['Email'].unique())}
df['user_id_mapped'] = df['Email'].map(user_name_map)

# Step 4: Keep only the relevant columns
df = df[['user_id_mapped', 'course_id_mapped']]

# Step 5: Split the data into train, val, and test sets using GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# Ensure users are not split between train/val/test
train_idx, temp_idx = next(gss.split(df, groups=df['user_id_mapped']))
train = df.iloc[train_idx]
temp = df.iloc[temp_idx]

# Split the temp set into validation and test sets, ensuring users are not split
gss_val_test = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(gss_val_test.split(temp, groups=temp['user_id_mapped']))
val = temp.iloc[val_idx]
test = temp.iloc[test_idx]

# Step 6: Save the train, val, and test sets as .txt files
train.to_csv('train.txt', sep=' ', header=False, index=False)
val.to_csv('val.txt', sep=' ', header=False, index=False)
test.to_csv('test.txt', sep=' ', header=False, index=False)

# Step 7: Save the course name mapping as text.txt
with open('text.txt', 'w', encoding='utf-8') as f:
    for course_name, course_id in course_name_map.items():
        f.write(f"{course_id} {course_name}\n")

print("Files saved as train.txt, val.txt, test.txt, and text.txt.")


Files saved as train.txt, val.txt, test.txt, and text.txt.


In [4]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # ต้องเพิ่มเพื่อรองรับ SentencepieceOp

# โหลดโมเดล MUSE จาก TensorFlow Hub
muse_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

# โหลดไฟล์ text.txt
course_codes = []
course_names = []

with open('text.txt', 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split(' ', 1)  # แบ่งเป็น course_id กับ course_name
        if len(parts) == 2:
            course_id, course_name = parts
            course_codes.append(int(course_id) + 1)  # เพิ่ม 1 ให้ index เริ่มจาก 1
            course_names.append(course_name)

# สร้าง embeddings จาก MUSE สำหรับชื่อคอร์ส
course_embeddings = muse_model(course_names).numpy()

# บันทึกคอร์สโค้ดเป็นไฟล์ .npy
np.save('course_codes.npy', np.array(course_codes))

# บันทึก embeddings เป็นไฟล์ .npy
np.save('course_embeddings.npy', course_embeddings)

print("Files saved as course_codes.npy and course_embeddings.npy.")


Files saved as course_codes.npy and course_embeddings.npy.


In [5]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # ต้องเพิ่มเพื่อรองรับ SentencepieceOp

# โหลดโมเดล MUSE จาก TensorFlow Hub
muse_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

# โหลดไฟล์ text.txt สำหรับ coursera_thairobotics
course_codes = []
course_names = []

with open('text.txt', 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split(' ', 1)  # แบ่งเป็น course_id กับ course_name
        if len(parts) == 2:
            course_id, course_name = parts
            course_codes.append(int(course_id) + 1)  # เพิ่ม 1 เพื่อให้ index เริ่มจาก 1
            course_names.append(course_name)

# สร้าง embeddings จาก MUSE สำหรับชื่อคอร์ส
course_embeddings = muse_model(course_names).numpy()

# บันทึกคอร์สโค้ดเป็นไฟล์ .npy
np.save('course_codes.npy', np.array(course_codes))

# บันทึก embeddings เป็นไฟล์ .npy
np.save('precomputed_embeddings.npy', course_embeddings)

print("Files saved as 'course_codes.npy' and 'precomputed_embeddings.npy'.")


Files saved as 'course_codes.npy' and 'precomputed_embeddings.npy'.
