In [1]:
import numpy as np
import pandas as pd
import data_cleaning
from scipy import sparse
from load_data import *
import time

In [30]:
#Get the data created by the data cleaning script
# List of words encountered by the user on the app.  If a word isn't listed for a certain user,
# then that user hasn't yet encountered that word in the app.
database = pd.read_csv("Vol3CleanedData.csv", index_col = 0)
# database, but limited based on if a user has starred at least one word.
users_who_star = pd.read_csv("Vol3StarredData.csv", index_col=0)
# Cast boolean columns as integers
users_who_star['starred'] = users_who_star['starred'].astype('int')
users_who_star['mistaken'] = users_who_star['mistaken'].astype('int')

display(users_who_star.sample(10))

  mask |= (ar1 == a)


Unnamed: 0,user_id,concept_id,prioritized,updated_timestamp,starred,mistaken,words_studied
314379,3526582993919929,4563,0,1574452735189,0,0,508
4614448,3685663292268160,4032,2,1619198345660,1,0,736
2000669,3597330111278142,1968,0,1611811146354,0,0,142
4600483,3684408494931459,3998,0,1606009155929,0,0,61
869571,3561325693434876,358,0,1595094766005,0,0,597
2179535,3600827686694396,1984,0,1622646771303,0,0,39
3517916,3632697689483654,5207,0,1588453866929,0,0,539
2289971,3602810761377396,3969,2,1599331527222,1,0,427
792021,3557000681978879,5915,0,1599744826181,0,0,24
5142041,3735379835132428,5049,0,1618424410279,0,0,626


### Trial 1: Naive implementation of encoding

In [70]:
start = time.time()
#One-hot encode the words to get a sparse matrix, and combine based on user_id
df = users_who_star.copy()
# starred = df['starred'].to_numpy()
# mistaken = df['mistaken'].to_numpy()
# words.drop(columns=['updated_timestamp','words_studied','prioritized','starred','mistaken'], inplace=True)
# print(len(words['concept_id'].unique()))
# indices = words['user_id']

column_labels = df['concept_id'].unique()
row_labels = df['user_id'].unique()
#Initialize Sparse Matrix
# Layer learned is a T/F (0/1) indicator if the word has been learned at all
# Layer starred is a T/F (0/1) indicator if the word has been starred or mistaken
row_index = []
col_index = []
learned_data = []
starred_data = []
for index, row in df.iterrows():
    i = np.where(row_labels == row['user_id'])[0][0]
    j = np.where(column_labels == row['concept_id'])[0][0]
    #Add indices and data to lists to construct COO Sparse matrix
    row_index.append(i)
    col_index.append(j)
    learned_data.append(1)
    starred_data.append(max((row['starred'], row['mistaken'])))

learned = sparse.coo_matrix((learned_data,(row_index,col_index)))
starred = sparse.coo_matrix((starred_data,(row_index,col_index)))

print(learned.shape)
print("Time:", time.time()-start,"seconds")
### Forgive me!  I tried to do it in a smart way, but I couldn't get it to work.
### As such, I defaulted to the Naive way.  If someone wants to attempt, 
### Commented throughout this cell are various attempts and functions that might
### prove useful.
    
# list_words = words.groupby('user_id')['concept_id'].apply(list)
# df = pd.DataFrame(list_words)
# data3 = df['user_id'].apply(collections.Counter)
# pd.DataFrame.from_records(data3).fillna(value=0)
# words = words.groupby('user_id').sum()

(6085, 2424)
Time: 87.22972822189331


### Trial 2: NumPy Implementation Encoding

In [31]:
start = time.time()
#One-hot encode the words to get a sparse matrix, and combine based on user_id
df = users_who_star.copy()
# starred = df['starred'].to_numpy()
# mistaken = df['mistaken'].to_numpy()
# words.drop(columns=['updated_timestamp','words_studied','prioritized','starred','mistaken'], inplace=True)
# print(len(words['concept_id'].unique()))
# indices = words['user_id']

column_labels = df['concept_id'].unique()
row_labels = df['user_id'].unique()
#Initialize Matrix
# Layer 0 is a T/F (0/1) indicator if the word has been learned at all
# Layer 1 is a T/F (0/1) indicator if the word has been starred or mistaken
data = np.zeros((len(row_labels),len(column_labels),2))
for index, row in df.iterrows():
    i = np.where(row_labels == row['user_id'])[0][0]
    j = np.where(column_labels == row['concept_id'])[0][0]
    #Add indices and data to lists to construct matrix
    data[i,j,0] = 1
    data[i,j,1] = max((row['starred'], row['mistaken']))

print(data.shape)
print("Time:", time.time()-start,"seconds")
### Forgive me!  I tried to do it in a smart way, but I couldn't get it to work.
### As such, I defaulted to the Naive way.  If someone wants to attempt, 
### Commented throughout this cell are various attempts and functions that might
### prove useful.
    
# list_words = words.groupby('user_id')['concept_id'].apply(list)
# df = pd.DataFrame(list_words)
# data3 = df['user_id'].apply(collections.Counter)
# pd.DataFrame.from_records(data3).fillna(value=0)
# words = words.groupby('user_id').sum()

(6085, 2424, 2)
Time: 81.62009239196777 seconds


In [35]:
def train_test_split(X, test_size=.2):
    n = X.shape[0]
    test_size_n = int(test_size*n)
    test_indices = np.random.choice(n,test_size_n,replace=False)
    X_train = np.delete(X, test_indices, 0)
    test_users = X[test_indices,:,:]
    #We must selectively identify words to exclude from the learned words set, in order to
    #be able to usefully judge whether suggested words were actually starred.
    #Must also find balance between users who starred few words versus users who star all words
    for i in range(len(test_users)):
        starred = test_users[i,:,1]
        learned = test_users[i,:,0]
        num_starred = float(np.count_nonzero(starred))
        ratio = num_starred/np.count_nonzero(learned)
        test_rate = .75 if ratio >= .75 else 1
        test_indices = np.random.choice(len(starred),int(test_rate*num_starred),replace=False, p=starred/num_starred)
        #Delete selected words from learned vocabulary,
        #Test to see if predictions include these words
        test_users[i,test_indices,0] = 0
        test_users[i,:,1] = 0
        test_users[i,test_indices,1] = 1
        
    return X_train, test_users[:,:,0], test_users[:,:,1]


In [5]:
#Use that matrix to determine how close a new datapoint is, and suggest words that are in the set-minus

# Another possible way (higher computational complexity):
# Make lists of each user with words in order, to find new words that were learned later (excludes all previous words)
# Use percentage of the list matches with each user to determine "closeness"

class KNNPredictor():
    '''
    Predictor to predict what data points are closest (see predict function for algorithm),
    and suggests new words that are in the set subtraction
    
    Note that this KNN is unconventional, as it predicts values for all other features based on the inputted features.
    '''
    def __init__(self,k,column_labels):
        self.column_labels = column_labels
        self.k=k
    
    def fit(self,X):
        '''
        Params:
            X (ndarray(m,n,2)): matrix encoded with all possible words from all data points.  
        '''
        #Add data to the class
        self.X = X
        return self
    
    def predict(self,x,num_suggest=5, random_sample = False):
        '''
        Predicts which words will be starred or mistaken in the future.  
        Params:
            x (ndarray(k,l,2)): (possibly batch) ndarray encoded with all possible words of samples to predict on.  
        '''
        # Subtract row x from all the rows of self.X, and count the number of -1's.
        # The argmin of counting the -1's is the closest to x
        # e.g. -1's when x has a word that X[i] doesn't, meaning they are less similar.
        # Then use the remaining +1's to pick a word to suggest
        # e.g. 1's when X[i] has a word that x doesn't.
        # If choosing k>1 neighbors, sum up the columns and pick a random word, weighted based on frequency
        samples = x.copy()
        predictions = np.zeros((len(samples),num_suggest))
        for i, row in enumerate(samples):
            #Use algorithm above to determine 'closeness'
            distances = self.X[:,:,0] - row
            negative_mask = distances < 0
            closeness = np.count_nonzero(negative_mask, axis=1)
            #Find indices of the k closest users
            indices = closeness.argsort()[:self.k]
            
            #Construct probability distribution and sample without replacement
            #Suggest only words that the user in question has not already learned
            possible_suggest = self.X[indices,:,1] - row
            possible_suggest = (possible_suggest > 0).astype(int)
            distribution = possible_suggest.sum(axis=0)
            distribution = distribution/np.sum(distribution)
            if random_sample:
                #If random is wanted, samples are drawn from a multinomial distribution
                suggest = np.random.choice(len(distribution),num_suggest,False,distribution)
            else:
                #If not random, then the n highest frequency words are suggested.
                suggest = (-distribution).argsort()[:num_suggest]
            
            #Insert predicted values to output matrix
            predictions[i,:] = self.column_labels[suggest]
            
        return predictions
    
    def score(self,y_true, pred):
        # This score is used to verify if the predicted words are in the set of actually starred words.
        # Returns number of correctly identified over number of total suggested
        # NOTE: This is perhaps an unreliable score.  It is possible that a word suggested to a user
        # would be useful in the future, and might not be contained in the already-starred words.
        accuracy = np.zeros(len(pred))
        for i in range(len(pred)):
            n = pred.shape[1]
            num_correct = 0
            for word in pred[i]:
                index = np.where(self.column_labels == word)
                if y_true[i,index] == 1:
                    num_correct += 1
            accuracy[i] = float(num_correct)/n
        return accuracy

In [15]:
# Test model

X_train, tups = train_test_split(test_size=100)
column_labels, X_train = encode(X_train,expanded = False)
start = time.time()
knn = KNNPredictor(5,column_labels).fit(X_train)
print("Train time:", time.time()-start)
start = time.time()
scores = []
for tup in tups:
    _, X_test = encode(tup[0],expanded = False,column_labels=column_labels)
    _, y_test = encode(tup[1],expanded = False,column_labels=column_labels)
    preds = knn.predict(X_test[:,:,0])
    score = knn.score(y_test[:,:,0], preds)
    if len(score) > 0:
        scores.append(score.item())
print("Prediction time for",len(tups),"users:", time.time()-start)
print("Accuracy:",np.mean(scores))
print(scores)

Train time: 0.008158206939697266
Prediction time for 100 users: 8.006340026855469
Accuracy: 0.1958762886597938
[0.8, 0.4, 0.0, 0.2, 0.0, 0.4, 0.2, 0.0, 0.0, 0.2, 0.0, 0.2, 0.0, 0.4, 0.6, 0.6, 0.2, 0.0, 0.6, 0.0, 0.0, 0.0, 0.4, 0.0, 0.0, 0.4, 0.0, 1.0, 0.0, 0.0, 0.0, 0.4, 0.4, 0.2, 0.2, 0.4, 0.0, 0.2, 0.0, 0.0, 1.0, 0.2, 0.0, 0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.0, 0.8, 0.2, 0.6, 0.4, 1.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.4, 0.0, 0.0, 0.2, 1.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0, 0.2, 0.0, 0.8, 0.2, 0.0, 0.0, 0.4, 0.2, 0.4, 0.2, 0.0, 0.0, 0.0, 0.0]


In [14]:
# Test model

X_train, tups = train_test_split(test_size=1)
column_labels, X_train = encode(X_train,expanded = False)
start = time.time()
knn = KNNPredictor(5,column_labels).fit(X_train)
print("Train time:", time.time()-start)
start = time.time()
scores = []
for tup in tups:
    _, X_test = encode(tup[0],expanded = False,column_labels=column_labels)
    _, y_test = encode(tup[1],expanded = False,column_labels=column_labels)
    preds = knn.predict(X_test[:,:,0])
    scores.append(knn.score(y_test[:,:,1], preds)[0])
print("Prediction time for",len(tups),"users:", time.time()-start)
print("Accuracy:",np.mean(scores))
print(scores)

Train time: 0.007532596588134766
Prediction time for 1 users: 0.09897923469543457
Accuracy: 0.0
[0.0]
