In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Read the CSV file into a pandas dataframe
df = pd.read_csv('internship_assignment.csv')

# Convert the 'dt' column to a datetime format
df['dt'] = pd.to_datetime(df['dt'])

# Extract features from 'dt' column
df['day_of_week'] = df['dt'].dt.dayofweek
df['hour_of_day'] = df['dt'].dt.hour

# Sort the data by 'user_id_hashed' and 'dt' columns in ascending order
df = df.sort_values(['user_id_hashed', 'dt'], ascending=[True, True])

# Define a list of the categorical column names
categorical_columns = ['learning_goal', 'selected_project', 'topic', 'project', 'project_difficulty', 'step', 'step_difficulty']

# Define an instance of the OneHotEncoder class for encoding categorical variables
encoder = OneHotEncoder(handle_unknown='ignore')

# Fit and transform the categorical columns in one step
encoded_categorical_data = encoder.fit_transform(df[categorical_columns])

# Combine the non-categorical columns with the encoded categorical data
non_categorical_columns = ['user_id_hashed', 'day_of_week', 'hour_of_day']
encoded_data = pd.concat([df[non_categorical_columns], pd.DataFrame(encoded_categorical_data.toarray())], axis=1)

# One-hot encode the target variable 'action'
target_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_target = target_encoder.fit_transform(df[['action']])


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical


# Normalize the non-categorical columns using MinMaxScaler
scaler = MinMaxScaler()
encoded_data[['day_of_week', 'hour_of_day']] = scaler.fit_transform(encoded_data[['day_of_week', 'hour_of_day']])

# Split the data into training and testing sets (80% train, 20% test)


# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(encoded_data, encoded_target, test_size=0.2, random_state=42)

# Convert the sparse matrix to a dense NumPy array and get the index of the non-zero element
y_train = y_train.toarray().argmax(axis=1)
y_test = y_test.toarray().argmax(axis=1)

# Convert the target variables to one-hot encoded format
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)



In [7]:
n_features = X_train.shape[1]
n_actions = y_train.shape[1]

model = Sequential()
model.add(LSTM(128, activation='relu', input_shape=(1, n_features), return_sequences=True))
model.add(LSTM(64, activation='relu'))
model.add(Dense(n_actions, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [8]:
# Reshape the input data to be 3D, as required by the LSTM model (samples, timesteps, features)
X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Train the model
history = model.fit(X_train_reshaped, y_train, epochs=10, validation_data=(X_test_reshaped, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")


Test accuracy: 26.43%


In [61]:
def predict_next_actions(model, user_data, n_actions):
    # Pad the input data with an additional feature of zeros
    user_data_padded = np.pad(user_data.values, ((0, 0), (1, 0)), 'constant')
    user_data_reshaped = user_data_padded.reshape((1, 1, user_data_padded.shape[1]))
    predictions = []

    for _ in range(n_actions):
        prediction = model.predict(user_data_reshaped)
        predictions.append(prediction)
        user_data_reshaped = np.concatenate((user_data_reshaped[:, -1:, :], prediction), axis=1).reshape((1, user_data_reshaped.shape[1], user_data_reshaped.shape[2]))

    return np.array(predictions)



In [69]:
def predict_next_actions(model, user_data, n_actions):
    padding_length = model.input_shape[2] - user_data.shape[1]
    user_data_padded = np.pad(user_data.values, ((0, 0), (padding_length, 0)), 'constant')
    user_data_reshaped = user_data_padded.reshape((1, 1, user_data_padded.shape[1]))
    predictions = []

    for _ in range(n_actions):
        prediction = model.predict(user_data_reshaped)
        predictions.append(prediction)
        prediction = prediction.reshape(1, 1, -1)  # Add an extra dimension to the prediction array
        user_data_reshaped = np.concatenate((user_data_reshaped[:, :, :-prediction.shape[2]], prediction), axis=2)

    return np.array(predictions)




In [87]:
user_id = 8954549735489947820
user_data = encoded_data[encoded_data['user_id_hashed'] == user_id].iloc[-1:].drop(columns=['user_id_hashed'])

# Predict the next 5 actions for the given user
predicted_actions_encoded = predict_next_actions(model, user_data, 50)

# Decode the predicted actions
predicted_actions = target_encoder.inverse_transform(predicted_actions_encoded.reshape(predicted_actions_encoded.shape[0], -1))
print(f"Next 5 actions for user {user_id}: {predicted_actions}")


Next 5 actions for user 8954549735489947820: [['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_submission']
 ['failed_su

In [88]:
encoded_data.tail()


num_unique = encoded_data['user_id_hashed'].nunique()
print(f"Number of unique entries in 'column_name': {num_unique}")


Number of unique entries in 'column_name': 14
