In [1]:
import numpy as np
import pandas as pd
from load_data import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split

In [2]:
# Read in the data, build the label column, remove the unnecessary columns,
# and display the data.
data = pd.read_csv('Vol3StarredData.csv')

# A word counts as important when it was starred OR mistaken. `|` states that
# directly; the previous `+` produced the same True/False values for boolean
# columns but reads like arithmetic.
# NOTE(review): assumes `starred` and `mistaken` are boolean columns without
# missing values (the displayed `important` column is True/False) - confirm.
data['important'] = data['starred'] | data['mistaken']

# Drop the raw signals now folded into `important`, plus the columns that are
# not used as features. Rebinding instead of `inplace=True` keeps the statement
# explicit about producing a new frame.
data = data.drop(columns=['Unnamed: 0', 'updated_timestamp', 'prioritized', 'starred', 'mistaken'])
data



Unnamed: 0,user_id,concept_id,words_studied,important
0,3391866971749096,476,3,False
1,3391866971749096,2395,17,True
2,3391866971749096,2044,18,False
3,3391866971749096,1989,32,False
4,3391866971749096,56392,5,False
...,...,...,...,...
1993280,3839423983477931,4204,2,True
1993281,3839423983477931,4218,1,False
1993282,3839423983477931,6264,6,False
1993283,3839423983477931,2992,7,False


In [3]:
# Split the data: `training_data` is used to fit the model below, and
# `testing_data` is the collection of held-out users iterated over later on.
training_data, testing_data = train_test_split(test_size=500, X=data)

# The label is the `important` column.
y_train = training_data['important']

# The features are everything except the label and the user id.
X_train = training_data.drop(columns=['important', 'user_id'])

In [4]:
# Create a Random Forest Classifier and fit it to the training data.
# A fixed random_state makes the fitted forest reproducible across kernel
# restarts; without it every run trains a different model.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

RandomForestClassifier()

# Predict Words that a User Should Star

In [5]:
def generate_training_data(test_user, all_words=None):
    """Build the candidate rows to classify for a single test user.

    The result contains one row per known word: the words the user has
    already interacted with, followed by every other word paired with the
    same user. ``words_studied`` is set to the user's maximum in every row.

    Parameters
    ----------
    test_user : tuple
        Pair ``(user_data, user_test)`` of DataFrames. ``user_data`` must be
        non-empty and have the columns ``user_id``, ``concept_id``,
        ``words_studied`` and ``important``; ``user_test`` must have a
        ``concept_id`` column.
    all_words : iterable, optional
        Every known concept id. When omitted, the notebook-level ``words``
        set is used, which keeps existing calls working.

    Returns
    -------
    tuple
        ``(training_data, future_words)`` where ``training_data`` is the
        candidate DataFrame (without the ``important`` column) and
        ``future_words`` is the list of concept ids in ``user_test``.

    Notes
    -----
    The input frames are not modified; previously ``words_studied`` was
    overwritten in the caller's ``user_data``.
    """
    # Extract the input and testing data.
    user_data, user_test = test_user
    user = user_data['user_id'].iloc[0]

    # Fall back to the notebook-level vocabulary only when none is passed in.
    if all_words is None:
        all_words = words

    # Set the number of words studied to the user's max for every word.
    # `drop`/`assign` both return new frames, so the caller's data is untouched.
    num_words = np.max(user_data['words_studied'])
    user_rows = user_data.drop(columns='important').assign(words_studied=num_words)

    # Words the user has not interacted with, sorted so that the row order
    # does not depend on set iteration order.
    user_words = set(user_data['concept_id'])
    other_words = sorted(set(all_words) - user_words)

    # Nothing to add when the user has already seen every word.
    if not other_words:
        return user_rows, list(user_test['concept_id'])

    # Create a dataframe combining the user with each new word.
    word_data = pd.DataFrame({'concept_id': other_words})
    word_data['user_id'] = user
    word_data['words_studied'] = num_words

    # Concatenate the user's own rows with the new candidate rows.
    training_data = pd.concat((user_rows, word_data))
    return training_data, list(user_test['concept_id'])

In [6]:
def make_recommendations(training_data, num=None, model=None):
    """Return the concept ids that the classifier labels as important.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Candidate rows with a ``user_id`` column, a ``concept_id`` column and
        the remaining feature columns the model was fitted on.
    num : int, optional
        Number of recommendations to draw at random from the positively
        labelled words. When ``None`` every positively labelled word is
        returned.
    model : object with ``predict``, optional
        Fitted classifier to use. When omitted, the notebook-level ``rfc``
        is used, which keeps existing calls working.

    Returns
    -------
    list
        Recommended concept ids. Holds at most ``num`` distinct entries when
        ``num`` is given, and is empty when no word is labelled positive.

    Notes
    -----
    ``training_data`` is not modified. Random draws use NumPy's global
    generator, so call ``np.random.seed`` beforehand for repeatable results.
    """
    if model is None:
        model = rfc

    # Drop the user ids and reindex from 0 to the size of the dataset, on a
    # copy so the caller's frame keeps its columns and index.
    features = training_data.drop(columns='user_id').reset_index(drop=True)

    # Pass the candidate rows through the classifier.
    labels = model.predict(features)

    # Positions of the words labelled as important.
    word_inds = np.where(labels)[0]
    if len(word_inds) == 0:
        return []

    if num is not None:
        # Honour `num` (it used to be ignored in favour of a hard-coded 5) and
        # sample without replacement so that no word is recommended twice.
        sample_size = min(num, len(word_inds))
        word_inds = np.random.choice(word_inds, sample_size, replace=False)

    return list(features.loc[word_inds, 'concept_id'])

In [7]:
def score(future_words, recommendations, return_correct=False):
    """Compute the fraction of recommendations that the user later studied.

    Parameters
    ----------
    future_words : iterable
        Concept ids the user went on to study (must be hashable).
    recommendations : sequence
        Recommended concept ids.
    return_correct : bool, optional
        When true, also return the list of correctly recommended words.

    Returns
    -------
    float or tuple
        The fraction ``correct / len(recommendations)``, or
        ``(fraction, correct_words)`` when ``return_correct`` is true. The
        fraction is ``0.0`` when there are no recommendations.
    """
    # A set makes each membership test constant-time.
    future_set = set(future_words)

    # Keep every recommended word that was studied later, in recommendation order.
    correct_words = [word for word in recommendations if word in future_set]

    # Guard against an empty recommendation list instead of dividing by zero.
    total = len(recommendations)
    accuracy = len(correct_words) / total if total else 0.0

    if return_correct:
        return accuracy, correct_words
    return accuracy

In [8]:
# Get the set of all words (read by generate_training_data).
words = set(data['concept_id'])

# Set needed variables.
epochs = 10
num_recommendations = 5     # Random recommendations drawn per user per epoch.
scores = []

# make_recommendations draws from NumPy's global random generator; seeding it
# makes the reported score repeatable from run to run.
np.random.seed(0)

# For each epoch:
for _ in range(epochs):

    # Generate and score predictions for each user.
    for test_user in testing_data:

        # Skip users that have no input data.
        if test_user[0].empty:
            continue

        # Named `candidates` so the `training_data` frame used to fit the
        # model is not overwritten by this loop.
        candidates, future_words = generate_training_data(test_user)
        recommendations = make_recommendations(candidates, num_recommendations)
        scores.append(score(future_words, recommendations))

# Print the average score.
print('Score:', np.average(scores))

Score: 0.05546184738955823


In [12]:
# Get the set of all words (read by generate_training_data).
words = set(data['concept_id'])

# Set needed variables.
scores = []
correct_words = []

# Generate and score predictions for each user (note: no epochs are needed
# because we are getting all the predictions, so nothing is random here).
for test_user in testing_data:

    # Skip users that have no input data.
    if test_user[0].empty:
        continue

    # Named `candidates` so the `training_data` frame used to fit the model
    # is not overwritten by this loop.
    candidates, future_words = generate_training_data(test_user)
    recommendations = make_recommendations(candidates)           # Get all predictions.
    new_score, new_correct = score(future_words, recommendations, True)
    scores.append(new_score)
    correct_words.append(new_correct)

# Count how many times each word was predicted correctly across users.
# The loop variable is deliberately not called `words`: that name holds the
# vocabulary set that generate_training_data depends on.
correct_words_dict = {}
for user_correct in correct_words:
    for word in user_correct:
        correct_words_dict[word] = correct_words_dict.get(word, 0) + 1

# Print the average score and correctly predicted words.
print('Score:\t\t', np.average(scores))
print('Words Predicted Correctly:\t', correct_words_dict)

Score:		 0.05537326471635413
Words Predicted Correctly:	 {425: 2, 131: 6, 299: 7, 429: 2, 514: 6, 841: 2, 1144: 5, 1614: 7, 1852: 3, 2171: 2, 2535: 3, 2716: 4, 3135: 2, 3136: 2, 3461: 4, 3657: 7, 3678: 5, 3692: 5, 3735: 4, 4212: 2, 5042: 5, 5136: 6, 5261: 4, 5347: 4, 5615: 3, 5620: 6, 5819: 8, 5887: 9, 14328: 14, 6214: 4, 6234: 5, 6304: 2, 433: 2, 2484: 1, 3452: 2, 3597: 1, 735: 7, 134: 9, 360: 2, 1345: 3, 1348: 3, 1380: 1, 2221: 2, 2229: 2, 2583: 4, 3310: 1, 3332: 1, 3339: 2, 11737: 2, 4058: 2, 4064: 1, 4065: 3, 4160: 3, 4168: 5, 4330: 3, 4337: 1, 4688: 3, 5033: 2, 5048: 4, 5186: 1, 5224: 6, 5247: 4, 5309: 10, 5340: 1, 5359: 1, 13749: 2, 13748: 1, 13873: 3, 5748: 2, 6085: 1, 6092: 1, 6093: 2, 6157: 2, 6159: 1, 6170: 2, 87: 1, 550: 2, 2766: 2, 5841: 5, 15522: 4, 201: 2, 230: 2, 1120: 3, 1367: 3, 1573: 4, 1803: 2, 2385: 1, 2515: 7, 3239: 1, 3422: 4, 3514: 3, 3710: 1, 3900: 2, 4725: 6, 4788: 3, 4789: 3, 4908: 1, 4943: 1, 5039: 3, 5165: 6, 5232: 6, 5379: 6, 5511: 1, 5665: 3, 5667: 3, 5830