In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the starred-words CSV and discard, in one chained expression, the
# columns that are not needed downstream; then display the resulting frame.
data = (
    pd.read_csv('Vol3StarredData.csv')
    .drop(columns=['Unnamed: 0', 'updated_timestamp', 'prioritized'])
)
data

Unnamed: 0,user_id,concept_id,starred,mistaken,words_studied
0,3391866971749096,476,False,False,3
1,3391866971749096,2395,True,False,17
2,3391866971749096,2044,False,False,18
3,3391866971749096,1989,False,False,32
4,3391866971749096,56392,False,False,5
...,...,...,...,...,...
1993280,3839423983477931,4204,True,False,2
1993281,3839423983477931,4218,False,False,1
1993282,3839423983477931,6264,False,False,6
1993283,3839423983477931,2992,False,False,7


In [3]:
# Separate the features from the label: `starred` is the prediction target,
# every remaining column of `data` is a feature.
X = data.drop(columns='starred')
y = data['starred']

# Hold out 30% of the rows for testing. The split is seeded so that a
# "Restart Kernel -> Run All" reproduces the same train/test partition
# (and therefore comparable metrics) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [4]:
# # NOTE: disabled experiment, kept for reference only (nothing below runs).
# # Create a Random Forest Classifier and fit it to the training data.
# rfc = RandomForestClassifier()
# rfc.fit(X_train, y_train)

# # Predict on the testing data.
# predicted = rfc.predict(X_test)

# # Print the classification report
# print(classification_report(y_test, predicted))

In [5]:
# Rebuild the feature matrix without the label and without the user id.
# Deriving X from `data` (instead of dropping the column from X in place)
# keeps this cell idempotent: an in-place drop raises KeyError the second
# time the cell is executed because 'user_id' is already gone.
X = data.drop(columns=['starred', 'user_id'])

# Generate a train-test split (30% held out). Seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create a Random Forest Classifier and fit it to the training data.
# The forest is seeded as well so the reported metrics are repeatable.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# Predict on the testing data.
predicted = rfc.predict(X_test)

# Print the classification report (it is returned as a pre-formatted string,
# so print() is what renders it legibly).
print(classification_report(y_test, predicted))

# Predict Words that a User Should Star

In [7]:
# Select a random user and take a private copy of their rows. Drawing from
# the raw column means users with more rows are proportionally more likely
# to be picked.
user = np.random.choice(data['user_id'])
user_data = data.loc[data['user_id'] == user].copy()

# Set the number of words studied to this user's maximum for every row, so
# all candidate words are scored at the same level of study.
num_words = user_data['words_studied'].max()
user_data['words_studied'] = num_words

# Get a set of all the words that the user has not interacted with.
words = set(X['concept_id'])
user_words = set(user_data['concept_id'])
other_words = words - user_words

# Create a dataframe combining the user with each new word. The set is
# sorted into a list first: sets are unordered, so handing one to a pandas
# constructor is either rejected or yields an arbitrary row order.
# `mistaken` is False because the user has never seen these words.
word_data = pd.DataFrame({'concept_id': sorted(other_words)}).assign(
    user_id=user,
    mistaken=False,
    words_studied=num_words,
)

# Drop the words the user has already starred -- recommending them again is
# pointless -- and then drop the label column itself so only features remain.
# (Previously only the column was dropped, leaving starred words in the pool.)
# assumes `starred` holds booleans, as the displayed data suggests.
already_starred = user_data['starred'].astype(bool)
user_data = user_data.loc[~already_starred].drop(columns='starred')

# Build the rows to score by concatenating the user's remaining words with
# the never-seen words. (Despite the name, this frame is fed to
# rfc.predict, not used for fitting.) concat aligns the columns by name.
training_data = pd.concat((user_data, word_data))

training_data

Unnamed: 0,user_id,concept_id,mistaken,words_studied
105234,3529338062346510,789,False,477
105235,3529338062346510,6237,False,477
105236,3529338062346510,5105,False,477
105237,3529338062346510,2477,False,477
105238,3529338062346510,5047,False,477
...,...,...,...,...
1942,3529338062346510,16293,False,477
1943,3529338062346510,49062,False,477
1944,3529338062346510,49066,False,477
1945,3529338062346510,49067,False,477


In [8]:
# Drop the user ids and renumber the rows 0..n-1 so that the positions found
# below are also valid index labels. Reassigning instead of mutating in place,
# and tolerating an already-removed column, keeps this cell safe to re-run
# (the in-place drop raised KeyError on a second execution).
training_data = (
    training_data
    .drop(columns='user_id', errors='ignore')
    .reset_index(drop=True)
)

# Pass the candidate rows through the random forest.
labels = rfc.predict(training_data)

# Positions of the rows that the forest predicted the user would star.
word_inds = np.flatnonzero(labels)

# Pick up to 5 of them without replacement: sampling with replacement could
# return the same word several times, and asking for 5 when nothing was
# predicted positive raised a ValueError.
num_recommendations = min(5, len(word_inds))
recommends_inds = np.random.choice(word_inds, num_recommendations, replace=False)

# Single .loc lookup (row labels + column) instead of chained indexing;
# tolist() yields plain Python ints.
recommendations = training_data.loc[recommends_inds, 'concept_id'].tolist()

# Present the recommended words.
recommendations

[2707, 14328, 657, 6156, 3667]

TODO: For trial 2, pass in the vector of words the user has studied as the input.