In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Read the CSV file into a pandas dataframe
df = pd.read_csv('internship_assignment.csv')

# Convert the 'dt' column to a datetime format
df['dt'] = pd.to_datetime(df['dt'])

# Sort the data by 'user_id_hashed' and 'dt' columns in ascending order
df = df.sort_values(['user_id_hashed', 'dt'], ascending=[True, True])

# Define an instance of the OneHotEncoder class for encoding categorical variables
encoder = OneHotEncoder(handle_unknown='ignore')

# Define a list of the categorical column names
categorical_columns = ['learning_goal', 'selected_track_id', 'selected_project_id', 'selected_project', 'topic', 'project', 'step_id', 'step', 'step_difficulty']

# Fit the OneHotEncoder to the categorical columns
encoder.fit(df[categorical_columns])

# Define a function for encoding a categorical variable using the OneHotEncoder
def encode_categorical_column(column):
    if isinstance(column, pd.Series):
        encoded_column = encoder.transform(column.values.reshape(-1,1)).toarray()
    else:
        encoded_column = np.zeros((1, encoder.categories_[0].size))
    return encoded_column

# Define a function for encoding the 'difficulty' column
def encode_difficulty(difficulty):
    if difficulty == 'easy':
        return np.array([1, 0, 0])
    elif difficulty == 'medium':
        return np.array([0, 1, 0])
    elif difficulty == 'hard':
        return np.array([0, 0, 1])
    else:
        return np.zeros((3,))


# Define a function for encoding an action row
def encode_action(row):
    # Encode the categorical variables using the OneHotEncoder
    learning_goal_encoded = encode_categorical_column(row['learning_goal'])
    selected_track_id_encoded = encode_categorical_column(row['selected_track_id'])
    selected_project_id_encoded = encode_categorical_column(row['selected_project_id'])
    selected_project_encoded = encode_categorical_column(row['selected_project'])
    topic_encoded = encode_categorical_column(row['topic'])
    project_encoded = encode_categorical_column(row['project'])
    difficulty_encoded = encode_difficulty(row['project_difficulty'])
    step_id_encoded = encode_categorical_column(row['step_id'])
    step_encoded = encode_categorical_column(row['step'])
    step_difficulty_encoded = encode_categorical_column(row['step_difficulty'])
    difficulty_encoded = difficulty_encoded.reshape(1, -1)

    # Combine the encoded features into a single vector
    encoded_action = np.concatenate((learning_goal_encoded, selected_track_id_encoded, 
                                      selected_project_id_encoded, project_encoded, 
                                      topic_encoded, difficulty_encoded, 
                                      step_id_encoded, step_encoded, step_difficulty_encoded), axis=1)

    return encoded_action


# Create a list to store the encoded sequences
sequences = []

# Group the data by user_id_hashed
for user_id, group in df.groupby('user_id_hashed'):
    # Create an empty list to store the sequence for this user
    sequence = []
    # Iterate through the rows in the group, in chronological order
    for index, row in group.sort_values('dt').iterrows():
        # Append the encoded action, user, and time to the sequence
        sequence.append((encode_action(row), row['user_id_hashed'], row['dt']))
        # If the sequence length is greater than 10, remove the oldest action
        if len(sequence) > 10:
            sequence.pop(0)
        # If the sequence length is 10, add it to the


AttributeError: 'list' object has no attribute 'reshape'

In [13]:
# Create a list of encoded action sequences
sequences = []
for index, row in actions.iterrows():
    # Encode the action using the encode_action function
    encoded_action = encode_action(row)
    # Append the encoded action, user, and time to the sequence
    sequence.append((encoded_action, row['user_id_hashed'], row['dt']))
    # If the sequence length is greater than 10, remove the oldest action
    if len(sequence) > 10:
        sequence.pop(0)
    # If the sequence is full, append it to the list of sequences
    if len(sequence) == 10:
        sequences.append(sequence.copy())
        print('Encoded sequence:', sequence)


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 5 has 1 dimension(s)

In [12]:
def create_sequences(actions):
    # Encode categorical features
    encoded_actions = actions.apply(encode_action, axis=1)
    encoded_learning_goals = encode_categorical_column(actions['learning_goal'], learning_goal_encoder)
    encoded_selected_track_ids = encode_categorical_column(actions['selected_track_id'], selected_track_id_encoder)
    encoded_selected_project_ids = encode_categorical_column(actions['selected_project_id'], selected_project_id_encoder)
    encoded_projects = encode_categorical_column(actions['project'], project_encoder)
    encoded_topics = encode_categorical_column(actions['topic'], topic_encoder)
    encoded_project_difficulties = encode_categorical_column(actions['project_difficulty'], project_difficulty_encoder)
    encoded_step_ids = encode_categorical_column(actions['step_id'], step_id_encoder)
    encoded_step_difficulties = encode_categorical_column(actions['step_difficulty'], step_difficulty_encoder)
    
    # Compute numerical features
    num_attempts = actions.groupby('user_id_hashed')['action'].count().reset_index(name='num_attempts')
    num_attempts_encoded = encode_numerical_column(num_attempts['num_attempts'])
    
    duration = actions.groupby('user_id_hashed')['duration'].sum().reset_index(name='duration')
    duration_encoded = encode_numerical_column(duration['duration'])

    # Reshape user_ids to have two dimensions
    user_ids = actions['user_id_hashed'].values.reshape(-1, 1)

    # Concatenate all features into a single array
    X = np.concatenate((encoded_actions, encoded_learning_goals, encoded_selected_track_ids, 
                        encoded_selected_project_ids, encoded_projects, encoded_topics, 
                        encoded_project_difficulties, encoded_step_ids, encoded_step_difficulties,
                        num_attempts_encoded, duration_encoded), axis=1)

    # Create empty list to hold sequences
    sequences = []
    for i, user_id in enumerate(np.unique(user_ids)):
        # Find all actions associated with the current user
        user_actions = X[user_ids.flatten() == user_id]

        # Iterate over all possible sequences of length 10
        for j in range(len(user_actions) - 10):
            sequence = user_actions[j:j+10]

            # Append the sequence and user_id to the list of sequences
            sequences.append((sequence, user_id))

    return sequences


