In [3]:
import pandas as pd

# Load the combined export; its 'Name' and 'Email' columns are replaced with
# pseudonyms later in this cell.
raw_export_path = 'combined_output.csv'
df = pd.read_csv(raw_export_path)

# Replace real names and e-mail addresses with stable, numbered pseudonyms.
def anonymize_data(df):
    """Replace each distinct (Name, Email) pair with a numbered pseudonym.

    Pairs are numbered in order of first appearance, starting at 1: the n-th
    distinct pair becomes ``('User{n}', 'user{n}@anon.com')`` and every row
    holding the same pair receives the same pseudonym.

    Rows are matched by position rather than by index label, so frames with
    repeated index labels (for example after ``pd.concat`` without
    ``ignore_index=True``) are anonymized row by row instead of having rows
    that share a label overwrite each other. Missing values are normalised
    to ``None`` before comparison so rows with the same missing field are
    grouped together.

    Args:
        df: DataFrame with 'Name' and 'Email' columns.

    Returns:
        The same DataFrame object; its 'Name' and 'Email' columns are
        overwritten in place.

    Raises:
        KeyError: if the 'Name' or 'Email' column is missing.
    """
    pseudonym_ids = {}
    anon_names = []
    anon_emails = []

    for name, email in zip(df['Name'], df['Email']):
        key = (
            None if pd.isna(name) else name,
            None if pd.isna(email) else email,
        )
        # setdefault evaluates len() before inserting, so a new pair gets the
        # next unused number and a known pair keeps the one it already has.
        user_number = pseudonym_ids.setdefault(key, len(pseudonym_ids) + 1)
        anon_names.append(f'User{user_number}')
        anon_emails.append(f'user{user_number}@anon.com')

    # Whole-column assignment is positional, so the index labels play no role.
    df['Name'] = anon_names
    df['Email'] = anon_emails
    return df

# Pseudonymise the loaded frame, then write it out without the index column.
df_anonymized = anonymize_data(df)
anonymized_path = 'anonymized_combined_output.csv'
df_anonymized.to_csv(anonymized_path, index=False)

print("Anonymized data saved to 'anonymized_combined_output.csv'")


Anonymized data saved to 'anonymized_combined_output.csv'


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the anonymized CSV file
df = pd.read_csv('anonymized_combined_output.csv')

# Step 1: Give every distinct course name an integer ID, numbered in order of
# first appearance. This mapping is what gets written to text.txt below.
course_name_map = {name: idx for idx, name in enumerate(df['Course Name'].unique())}

# Step 2: Apply the mapping to the DataFrame
df['course_id_mapped'] = df['Course Name'].map(course_name_map)

# Step 3: Map every distinct user name to an integer ID the same way, so the
# IDs are assigned once, before the data is split.
user_name_map = {name: idx for idx, name in enumerate(df['Name'].unique())}
df['user_id_mapped'] = df['Name'].map(user_name_map)

# Step 4: Keep only the relevant columns. The explicit .copy() makes this an
# independent frame, so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
df = df[['user_id_mapped', 'course_id_mapped', 'Progress']].copy()

# Convert 'Progress' to binary (1 for progress > 0, 0 otherwise). Values are
# expected to be percentage strings; float() is used instead of int() so that
# fractional percentages such as '12.5%' parse as well.
df['Progress'] = df['Progress'].apply(lambda x: 1 if float(x.strip('%')) > 0 else 0)

# Step 5: Split the data into train (70%), val (15%), and test (15%) sets.
# The fixed random_state keeps the split reproducible between runs.
train, temp = train_test_split(df, test_size=0.3, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

# Step 6: Save the train, val, and test sets as space-separated .txt files
# with no header row and no index column.
train.to_csv('train.txt', sep=' ', header=False, index=False)
val.to_csv('val.txt', sep=' ', header=False, index=False)
test.to_csv('test.txt', sep=' ', header=False, index=False)

# Step 7: Save the course name mapping as text.txt, one "<id> <name>" per line
with open('text.txt', 'w', encoding='utf-8') as f:
    for course_name, course_id in course_name_map.items():
        f.write(f"{course_id} {course_name}\n")


print("Files saved as train.txt, val.txt, test.txt, and text.txt.")


Files saved as train.txt, val.txt, test.txt, and text.txt.


In [10]:
!pip install tensorflow-text




In [13]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # ต้องเพิ่มเพื่อรองรับ SentencepieceOp

# Load the multilingual Universal Sentence Encoder (MUSE) from TensorFlow Hub.
muse_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

# Parse text.txt. Each usable line is "<course_id> <course_name>"; a line that
# contains no space once stripped is skipped.
course_codes = []
course_names = []

with open('text.txt', 'r', encoding='utf-8') as file:
    for raw_line in file:
        code_text, separator, title = raw_line.strip().partition(' ')
        if not separator:
            continue
        course_codes.append(int(code_text))
        course_names.append(title)

# Encode all course names in a single call and convert the result to NumPy.
course_embeddings = muse_model(course_names).numpy()

# Persist the course IDs and their embeddings as two separate .npy files.
np.save('course_codes.npy', np.array(course_codes))
np.save('course_embeddings.npy', course_embeddings)

print("Files saved as course_codes.npy and course_embeddings.npy.")


Files saved as course_codes.npy and course_embeddings.npy.


In [18]:
pip install lxml


Collecting lxml
  Downloading lxml-5.3.0-cp39-cp39-win_amd64.whl.metadata (3.9 kB)
Downloading lxml-5.3.0-cp39-cp39-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ------------------------ --------------- 2.4/3.8 MB 12.2 MB/s eta 0:00:01
   ---------------------------------------- 3.8/3.8 MB 11.9 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-5.3.0
Note: you may need to restart the kernel to use updated packages.


In [28]:
from bs4 import BeautifulSoup
import os
import json

# Function to extract course details
def extract_course_details(html_content):
    """Extract one record per course row from a saved courses page.

    A course row is any ``<tr>`` whose ``id`` attribute starts with ``post-``.

    Args:
        html_content: HTML markup of the page, as a string.

    Returns:
        A list of dicts, one per matching row in document order, with the
        keys ``course_name``, ``course_category``, ``post_id`` and
        ``status``. A field that cannot be located in the row is set to
        ``"Unknown"``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    courses = []

    # Course rows are the <tr> tags whose id starts with 'post-'.
    course_rows = soup.find_all('tr', id=lambda x: x and x.startswith('post-'))

    for row in course_rows:
        course = {}

        # The name is the text of the link inside the row's <strong> tag.
        # find() returns None when a tag is absent, so each step is checked
        # before the next lookup is chained onto it.
        strong_tag = row.find('strong')
        course_name_tag = strong_tag.find('a') if strong_tag is not None else None
        course['course_name'] = (
            course_name_tag.text.strip() if course_name_tag is not None else "Unknown"
        )

        # The category is the text of the taxonomy cell.
        category_tag = row.find('td', class_='taxonomy-stm_lms_course_taxonomy')
        course['course_category'] = (
            category_tag.text.strip() if category_tag is not None else "Unknown"
        )

        # The post ID is the row id without its 'post-' prefix.
        course['post_id'] = row.get('id', 'Unknown').replace('post-', '')

        # The status is taken from the third CSS class of the row; a row with
        # no class attribute is treated as having no classes.
        # NOTE(review): this relies on the class order in the saved markup;
        # confirm that the third class really is the status.
        row_classes = row.get('class') or []
        course['status'] = row_classes[2] if len(row_classes) > 2 else "Unknown"

        courses.append(course)

    return courses

# Run the course extractor over several saved HTML pages.
def process_html_files(file_list):
    """Collect the course records found in every file of ``file_list``.

    Each path is opened as UTF-8 text and its contents are passed to
    ``extract_course_details``. The per-file results are concatenated in the
    order the paths are given.

    Args:
        file_list: Iterable of paths to HTML files.

    Returns:
        A single list holding the records from all files.
    """
    collected = []

    for html_path in file_list:
        with open(html_path, 'r', encoding='utf-8') as handle:
            markup = handle.read()
        collected += extract_course_details(markup)

    return collected

# Saved HTML pages to scrape. Each file is listed once: naming a page twice
# would add all of its courses to the output twice.
# NOTE(review): this list previously named the first file a second time as
# its third entry; if a third export exists, add its path here.
html_files = [
    'Courses ‹ Robotic Learning Hub — WordPress.html',
    'Courses ‹ Robotic Learning Hub — WordPress2.html',
]

# Process all HTML files and gather course details
courses_data = process_html_files(html_files)

# Save the extracted data to a JSON file. ensure_ascii=False writes non-ASCII
# course names as readable UTF-8 instead of \u escapes.
with open('courses_data.json', 'w', encoding='utf-8') as f:
    json.dump(courses_data, f, ensure_ascii=False, indent=4)

print("Courses data has been saved to courses_data.json.")


Courses data has been saved to courses_data.json.


In [30]:
import pandas as pd
import json

# Read the scraped course records: a JSON list of objects, each carrying
# 'course_name' and 'course_category'.
with open('courses_data.json', 'r', encoding='utf-8') as json_file:
    course_data = json.load(json_file)

# Build a course-name -> category lookup. When a name occurs more than once,
# the last record wins.
course_mapping = dict(
    (course['course_name'], course['course_category']) for course in course_data
)

# Attach a 'Category' column to the anonymized data. Course names with no
# entry in the lookup get NaN.
df = pd.read_csv('anonymized_combined_output.csv')
df['Category'] = df['Course Name'].map(course_mapping)

# Write the enriched frame to a new CSV, without the index column.
df.to_csv('anonymized_combined_output_with_categories.csv', index=False)

print("Categories added and saved to anonymized_combined_output_with_categories.csv")


Categories added and saved to anonymized_combined_output_with_categories.csv
