In [198]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from sklearn.model_selection import train_test_split
import random

In [None]:
df = pd.read_csv("system-data.csv")

required_cols = ['userid_DI', 'course_id', 'grade', 'LoE_DI']
df = df[required_cols].dropna()

# Ensure course_id is a string
df['course_id'] = df['course_id'].astype(str)

# Convert grade to numeric and drop rows that cannot be converted
df['grade'] = pd.to_numeric(df['grade'], errors='coerce')
df.dropna(subset=['grade'], inplace=True)
df['grade'] = df['grade'].astype(np.float32)

# Encode categorical variables:
# For userid_DI, we create a new column 'user_idx'
user_encoder = LabelEncoder()
df['user_idx'] = user_encoder.fit_transform(df['userid_DI'])

# For course_id (course details), we create 'course_id_encoded'
course_encoder = LabelEncoder()
df['course_id_encoded'] = course_encoder.fit_transform(df['course_id'])

# For LoE_DI (level of education), we create 'LoE_encoded'
loe_encoder = LabelEncoder()
df['LoE_encoded'] = loe_encoder.fit_transform(df['LoE_DI'])

# Inspect the processed data
print("Number of records:", len(df))
print("Unique users (user_idx):", df['user_idx'].nunique())
print("Unique courses (course_id_encoded):", df['course_id_encoded'].nunique())
print("Unique LoE values (LoE_encoded):", df['LoE_encoded'].nunique())
print(df.head())


Number of records: 269130
Unique users (user_idx): 246504
Unique courses (course_id_encoded): 5
Unique LoE values (LoE_encoded): 5
           userid_DI                   course_id  grade      LoE_DI  user_idx  \
1883  MHxPC130470188  HarvardX/CB22x/2013_Spring    0.0  Bachelor's    194055   
1884  MHxPC130263156  HarvardX/CB22x/2013_Spring    0.0    Master's    108635   
1885  MHxPC130166676  HarvardX/CB22x/2013_Spring    0.0  Bachelor's     68845   
1888  MHxPC130378021  HarvardX/CB22x/2013_Spring    0.0  Bachelor's    156059   
1891  MHxPC130076788  HarvardX/CB22x/2013_Spring    0.0   Secondary     31704   

      course_id_encoded  LoE_encoded  
1883                  0            0  
1884                  0            3  
1885                  0            0  
1888                  0            0  
1891                  0            4  


In [201]:
# Get counts of unique values
num_users = df['user_idx'].nunique()
num_courses = df['course_id_encoded'].nunique() 
num_loe = df['LoE_encoded'].nunique()           
print(num_courses)
print(num_loe)
print(num_users)

5
5
246504


In [202]:
# Split data into training and testing sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [203]:
# Hyperparameters
embedding_dim = 32
loe_embedding_dim = 4  # Smaller embedding for degree info

# Define inputs:
# - user (user_idx)
# - loe (LoE_encoded, the user's degree)
# - positive course (the course the user took)
# - negative course (a course the user did not take)
user_input = Input(shape=(1,), name='user_input')
loe_input = Input(shape=(1,), name='loe_input')
pos_course_input = Input(shape=(1,), name='pos_course_input')
neg_course_input = Input(shape=(1,), name='neg_course_input')


In [204]:
# Embedding layers
user_emb_layer = layers.Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')
loe_emb_layer = layers.Embedding(input_dim=num_loe, output_dim=loe_embedding_dim, name='loe_embedding')
course_emb_layer = layers.Embedding(input_dim=num_courses, output_dim=embedding_dim, name='course_embedding')

In [205]:
# Embed the inputs
user_emb = user_emb_layer(user_input)   # (None, 1, embedding_dim)
loe_emb = loe_emb_layer(loe_input)        # (None, 1, loe_embedding_dim)
# Flatten embeddings
user_vec = layers.Flatten()(user_emb)     # (None, embedding_dim)
loe_vec = layers.Flatten()(loe_emb)         # (None, loe_embedding_dim)

# Combine user and degree information to form a user profile
user_profile = layers.Concatenate()([user_vec, loe_vec])
# Optionally reduce dimension to match course embedding size:
user_profile = layers.Dense(embedding_dim, activation='relu')(user_profile)

In [206]:
# Get course embeddings for positive and negative courses
pos_course_emb = course_emb_layer(pos_course_input)  # (None, 1, embedding_dim)
neg_course_emb = course_emb_layer(neg_course_input)  # (None, 1, embedding_dim)
pos_course_vec = layers.Flatten()(pos_course_emb)      # (None, embedding_dim)
neg_course_vec = layers.Flatten()(neg_course_emb)      # (None, embedding_dim)

# Compute scores as dot products between the user profile and course vectors
pos_score = layers.Dot(axes=1)([user_profile, pos_course_vec])  # (None, 1)
neg_score = layers.Dot(axes=1)([user_profile, neg_course_vec])  # (None, 1)

# Output the difference in scores
score_diff = layers.Subtract()([pos_score, neg_score])  # (None, 1)

In [207]:
# Define the model
model = Model(
    inputs=[user_input, loe_input, pos_course_input, neg_course_input],
    outputs=score_diff
)

In [208]:
# --- Define BPR Loss ---
def bpr_loss(y_true, y_pred):
    epsilon = 1e-7
    return -tf.reduce_mean(tf.math.log(tf.nn.sigmoid(y_pred) + epsilon))

model.compile(optimizer='adam', loss=bpr_loss)
model.summary()

In [209]:
# --- Prepare Training Data for BPR ---
def generate_training_data(df, num_courses):
    users = []
    loe_vals = []
    pos_courses = []
    neg_courses = []
    for _, row in df.iterrows():
        user = row['user_idx']
        loe_val = row['LoE_encoded']
        pos_course = row['course_id_encoded']
        # Sample a negative course different from the positive one
        negative_options = [c for c in range(num_courses) if c != pos_course]
        neg_course = random.choice(negative_options)
        users.append(user)
        loe_vals.append(loe_val)
        pos_courses.append(pos_course)
        neg_courses.append(neg_course)
    return (np.array(users).reshape(-1,1),
            np.array(loe_vals).reshape(-1,1),
            np.array(pos_courses).reshape(-1,1),
            np.array(neg_courses).reshape(-1,1))

train_users_arr, train_loe_arr, train_pos_arr, train_neg_arr = generate_training_data(train, num_courses)
test_users_arr, test_loe_arr, test_pos_arr, test_neg_arr = generate_training_data(test, num_courses)

# For BPR loss, dummy target values are used
dummy_train = np.ones((train_users_arr.shape[0], 1))
dummy_test = np.ones((test_users_arr.shape[0], 1))

In [211]:
# --- Train the Model ---
model.fit(
    x=[train_users_arr, train_loe_arr, train_pos_arr, train_neg_arr],
    y=dummy_train,
    epochs=10,
    batch_size=32,
    validation_data=([test_users_arr, test_loe_arr, test_pos_arr, test_neg_arr], dummy_test)
)

Epoch 1/10
[1m6729/6729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m512s[0m 76ms/step - loss: 0.5258 - val_loss: 0.5232
Epoch 2/10
[1m6729/6729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m513s[0m 76ms/step - loss: 0.1808 - val_loss: 0.8552
Epoch 3/10
[1m6729/6729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m532s[0m 79ms/step - loss: 0.0423 - val_loss: 0.9545
Epoch 4/10
[1m6729/6729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 79ms/step - loss: 0.0224 - val_loss: 0.9937
Epoch 5/10
[1m6729/6729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m484s[0m 72ms/step - loss: 0.0160 - val_loss: 0.9748
Epoch 6/10
[1m6729/6729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m461s[0m 68ms/step - loss: 0.0123 - val_loss: 1.0761
Epoch 7/10
[1m6729/6729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m468s[0m 70ms/step - loss: 0.0099 - val_loss: 1.0913
Epoch 8/10
[1m6729/6729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 69ms/step - loss: 0.0090 - val_loss: 1.0929


<keras.src.callbacks.history.History at 0x11bcd9605d0>