In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Read the CSV file into a pandas dataframe
df = pd.read_csv('internship_assignment.csv')

# Convert the 'dt' column to a datetime format
df['dt'] = pd.to_datetime(df['dt'])

# Sort the data by 'user_id_hashed' and 'dt' columns in ascending order
df = df.sort_values(['user_id_hashed', 'dt'], ascending=[True, True])

# Define an instance of the OneHotEncoder class for encoding categorical variables
encoder = OneHotEncoder(handle_unknown='ignore')

# Define a list of the categorical column names
categorical_columns = ['learning_goal', 'selected_track_id', 'selected_project_id', 'selected_project', 'topic', 'project', 'step_id', 'step', 'step_difficulty']

# Fit the OneHotEncoder to the categorical columns
encoder.fit(df[categorical_columns])

# Define a function for encoding a categorical variable using the OneHotEncoder
def encode_categorical_column(column):
    if isinstance(column, pd.Series):
        encoded_column = encoder.transform(column.values.reshape(-1,1)).toarray()
    else:
        encoded_column = np.zeros((1, encoder.categories_[0].size))
    return encoded_column

# Define a function for encoding the 'difficulty' column
def encode_difficulty(difficulty):
    if difficulty == 'easy':
        return np.array([1, 0, 0])
    elif difficulty == 'medium':
        return np.array([0, 1, 0])
    elif difficulty == 'hard':
        return np.array([0, 0, 1])
    else:
        return np.zeros((3,))


# Define a function for encoding an action row
def encode_action(row):
    # Encode the categorical variables using the OneHotEncoder
    
    selected_track_id_encoded = encode_categorical_column(row['selected_track_id'])
    selected_project_id_encoded = encode_categorical_column(row['selected_project_id'])
    
    topic_encoded = encode_categorical_column(row['topic'])
    project_difficulty_encoded = encode_categorical_column(row['project_difficulty'])
    step_id_encoded = encode_categorical_column(row['step_id'])
    step_difficulty_encoded = encode_categorical_column(row['step_difficulty'])
    

    # Combine the encoded features into a single vector
    encoded_action = np.concatenate(( selected_track_id_encoded, 
                                      selected_project_id_encoded, 
                                      topic_encoded, project_difficulty_encoded, 
                                      step_id_encoded, step_difficulty_encoded), axis=1)

    return encoded_action


In [25]:
# Define a list to hold the sequences of actions
sequences = []

# Loop through each user
for user_id, user_actions in df.groupby('user_id_hashed'):
    # Sort the user's actions by time
    user_actions = user_actions.sort_values('dt')
    # Define a list to hold the sequence of actions for this user
    sequence = []
    # Loop through each action for this user
    for index, row in user_actions.iterrows():
        # Append the encoded action, user, and time to the sequence
        sequence.append((encode_action(row), row['user_id_hashed'], row['dt']))
        # If the sequence length is greater than 10, remove the oldest action
        if len(sequence) > 10:
            sequence.pop(0)
        # If the sequence length is 10, add it to the list of sequences
        if len(sequence) == 10:
            sequences.append(sequence.copy())


In [46]:
def generate_training_data(sequences):
    X = []
    y = []
    for sequence in sequences:
        if len(sequence) < 15:
            continue
        for i in range(len(sequence)-14):
            input_sequence = [item[0] for item in sequence[i:i+10]]
            target_sequence = [item[0] for item in sequence[i+10:i+15]]
            X.append(input_sequence)
            y.append(target_sequence)
    return np.array(X), np.array(y)


In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense, LSTM, TimeDistributed

# ... (Data preprocessing and encoding code remains the same)

# Generate training data
X, y = generate_training_data(sequences)

# Modify the model architecture
model = Sequential()
model.add(LSTM(64, input_shape=X.shape[1:], return_sequences=True))
model.add(TimeDistributed(Dense(32, activation='relu')))
model.add(TimeDistributed(Dense(y.shape[2], activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model
model.fit(X, y, batch_size=32, epochs=10)


ValueError: Input 0 of layer "lstm_1" is incompatible with the layer: expected ndim=3, found ndim=1. Full shape received: (None,)

In [37]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GRU
from tensorflow.keras.models import Model

# Convert the sequences of actions to input and output data for the model
num_features = sequences[0][0][0].shape[1]
encoder_input_data = np.zeros((len(sequences), 15, num_features))

decoder_input_data = np.zeros((len(sequences), 5, 379))
decoder_target_data = np.zeros((len(sequences), 5, 379))
for i, sequence in enumerate(sequences):
    for j, (encoded_action, user_id, dt) in enumerate(sequence[:10]):
        encoder_input_data[i, j, :] = encoded_action
    for j, (encoded_action, user_id, dt) in enumerate(sequence[10:15]):
        if j == 0:
            decoder_input_data[i, j, :] = encoded_action
        else:
            decoder_input_data[i, j, :] = encoded_action
            decoder_target_data[i, j-1, :] = encoded_action

# Define the input shape for the model
encoder_inputs = Input(shape=(None, 379))
decoder_inputs = Input(shape=(None, 379))

# Define the GRU layer for the encoder
encoder_gru = GRU(256, return_state=True)

# Encode the input sequence using the GRU layer
_, state_h = encoder_gru(encoder_inputs)

# Define the GRU layer for the decoder
decoder_gru = GRU(256, return_sequences=True, return_state=True)

# Decode the output sequence using the decoder GRU layer
decoder_outputs, _ = decoder_gru(decoder_inputs, initial_state=state_h)

# Define the output layer for the decoder
decoder_dense = Dense(379, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model on the sequences of actions
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=32, epochs=10)


2023-03-31 15:23:09.082878: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 15:23:09.084282: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 15:23:09.085790: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/10


ValueError: in user code:

    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_3" is incompatible with the layer: expected shape=(None, None, 379), found shape=(None, 15, 114)


In [30]:
# Define the maximum sequence length
max_seq_length = 10

# Initialize the encoder input, decoder input, and decoder target arrays
encoder_input_data = np.zeros((len(sequences), max_seq_length, 114))
decoder_input_data = np.zeros((len(sequences), max_seq_length-1, 114))
decoder_target_data = np.zeros((len(sequences), max_seq_length, 100))

# Loop through each sequence
for i, sequence in enumerate(sequences):
    # Split the sequence into the encoder input, decoder input, and decoder target
    encoder_input = np.array([action[0] for action in sequence[:-1]])
    decoder_input = np.array([action[0] for action in sequence[1:]])
    decoder_target = np.zeros((max_seq_length, 1, 114))
    decoder_target[1:len(decoder_input)+1, :, :] = decoder_input
    
    # Add the encoder input, decoder input, and decoder target to the arrays
    encoder_input_data[i] = encoder_input
    decoder_input_data[i] = decoder_input
    decoder_target_data[i] = np.array([encode_categorical_column(action[0]) for action in sequence] + [np.zeros((114,)) for _ in range(max_seq_length - len(sequence))])



ValueError: could not broadcast input array from shape (9,1,114) into shape (10,114)

In [32]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GRU
from tensorflow.keras.models import Model


# Define the input shape for the encoder
encoder_inputs = Input(shape=(None, 100))

# Define the GRU layer for the encoder
encoder_gru = GRU(128, return_state=True)

# Run the encoder with the input sequence
_, state_h = encoder_gru(encoder_inputs)

# Define the input shape for the decoder
decoder_inputs = Input(shape=(None, 100))

# Define the GRU layer for the decoder
decoder_gru = GRU(128, return_sequences=True, return_state=True)

# Run the decoder with the initial state from the encoder and the decoder inputs
decoder_outputs, _ = decoder_gru(decoder_inputs, initial_state=state_h)

# Define the output layer for the decoder
decoder_dense = Dense(100, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model with the encoder and decoder inputs and outputs
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model with the categorical cross-entropy loss and the Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model on the sequences of actions
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=32, epochs=10)



2023-03-31 15:18:46.260907: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 15:18:46.263533: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 15:18:46.264942: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/10


ValueError: in user code:

    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/simeongluzman/Library/Python/3.9/lib/python/site-packages/keras/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_2" is incompatible with the layer: expected shape=(None, None, 100), found shape=(None, 10, 114)


In [45]:
import tensorflow as tf
from keras.layers import Input, Dense, GRU
from keras.models import Model

from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

num_users = len(df['user_id_hashed'].unique())
num_features = len(tokenizer.word_index) + 1
max_seq_length = 15

# Convert the sequences of actions to input and output data for the model
encoder_input_data = np.zeros((len(sequences), max_seq_length, num_features))
decoder_input_data = np.zeros((len(sequences), 5, len(tokenizer.word_index) + 1))
decoder_target_data = np.zeros((len(sequences), 5, len(tokenizer.word_index) + 1))

for i, sequence in enumerate(sequences):
    # Encode the actions in the sequence
    encoded_sequence = np.array([encode_categorical_column(action[0]) for action in sequence])

    # Pad the sequence with zeros up to the maximum sequence length
    encoded_sequence = pad_sequences([encoded_sequence], maxlen=max_seq_length, dtype='float32', padding='post')[0]

    # Split the sequence into the encoder and decoder inputs
    encoder_input = encoded_sequence[:max_seq_length, :]
    decoder_input = np.zeros((5, len(tokenizer.word_index) + 1))

    for j, (encoded_action, user_id, dt) in enumerate(sequence[:max_seq_length]):
        encoder_input[j, :] = encoded_action

    for j, (encoded_action, user_id, dt) in enumerate(sequence[max_seq_length:max_seq_length+5]):
        if j == 0:
            decoder_input[j, tokenizer.word_index['START']] = 1
        else:
            decoder_input[j, :] = decoder_input[j-1, :]
            decoder_input[j, tokenizer.word_index[encoded_action]] = 1

    # Set the decoder target to be the decoder input shifted by one time step
    decoder_target = np.roll(decoder_input, -1, axis=0)
    decoder_target[-1, :] = np.zeros((len(tokenizer.word_index) + 1,))
    decoder_target_data[i] = decoder_target

    # Add the encoder input, decoder input, and decoder target to the arrays
    encoder_input_data[i] = encoder_input
    decoder_input_data[i] = decoder_input

# Define the model architecture
encoder_inputs = keras.Input(shape=(None, num_features))
encoder = keras.layers.GRU(latent_dim, return_state=True)
encoder_outputs, state_h = encoder(encoder_inputs)
decoder_inputs = keras.Input(shape=(None, len(tokenizer.word_index) + 1))
decoder_gru = keras.layers.GRU(latent_dim, return_sequences=True)
decoder_outputs = decoder_gru(decoder_inputs, initial_state=state_h)
decoder_dense = keras.layers.Dense(len(tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=32, epochs=10)



ValueError: could not broadcast input array from shape (1,114) into shape (1,19)