In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
import random
p = 0.1  # 10% of the lines
# keep the header, then take only 10% of lines
# if random from [0,1] interval is greater than 0.1 the row will be skipped
df = pd.read_csv(
         'learning_traces.13m.csv',
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)
df['timestamp'] = pd.to_datetime(df['timestamp'],unit='s')
df.drop_duplicates(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1285720 entries, 0 to 1285720
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   p_recall           1285720 non-null  float64       
 1   timestamp          1285720 non-null  datetime64[ns]
 2   delta              1285720 non-null  int64         
 3   user_id            1285720 non-null  object        
 4   learning_language  1285720 non-null  object        
 5   ui_language        1285720 non-null  object        
 6   lexeme_id          1285720 non-null  object        
 7   lexeme_string      1285720 non-null  object        
 8   history_seen       1285720 non-null  int64         
 9   history_correct    1285720 non-null  int64         
 10  session_seen       1285720 non-null  int64         
 11  session_correct    1285720 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(5)
memory usage: 127.5+ MB


In [3]:
# before we try to learn good values for theta 
# we need to construct x

# x = information a students history learning a certain word

df.iloc[0,:]

p_recall                                          1.0
timestamp                         2013-02-28 18:28:01
delta                                        27649635
user_id                                          u:FO
learning_language                                  de
ui_language                                        en
lexeme_id            35a54c25a2cda8127343f6a82e6f6b7d
lexeme_string                mann/mann<n><m><sg><nom>
history_seen                                        5
history_correct                                     4
session_seen                                        1
session_correct                                     1
Name: 0, dtype: object

In [8]:
# (df['p_recall'] == (df['session_correct'])/(df['session_seen'])).sum() == df.shape[0]

# p_recall is the ratio of session_correct/session_seen

# p_recall is "y" "ground truth"

# predicted_p_recall is "y_hat" "prediction"

# error(p_recall,predicted_p_recall) <- we want this to be as small as possible

# if we can very reliably predict p_recall, what is the value of this in real-life terms?


False

In [4]:
# get parts of speech

def lexeme_df(filename):

    import re
    df_single_col = pd.read_csv(filename, delimiter='\t', header=None, names=['line'])

    def split_line(line):
        parts = re.split(r'\s+', line, maxsplit=2)
        if len(parts) == 3:
            return parts
        return [None, None, None]

    df_split = df_single_col['line'].apply(split_line)
    df = pd.DataFrame(df_split.tolist(), columns=['lexeme', 'category', 'meaning'])

    return df

In [5]:
lexemes = lexeme_df('lexeme_reference.txt')
lexemes.head()

Unnamed: 0,lexeme,category,meaning
0,aa,animacy,Animate
1,acr,adjective,Acronym
2,adj,POS,Adjective
3,adv,POS,Adverb
4,al,other,Other (altre)


In [6]:
lexemes['lexeme'].nunique()

92

In [65]:
# one-hot encoding
# dummy variables / indicator variables

df.loc[0,"lexeme_string"]

"<det><def><nt><sg><nom>"

# det, df, nt, sg, nom + 87 more 

# word | det | def | nt | sg | nom | ...
# das  |  1  | 1   | 1  | 1  |  1  | 0 ...



# look for the first <, remove everything to the left
# then remove <, >

# 

def extract_right_of_lt(text):
    import re
    match = re.search(r'<(.*)', text)
    return match.group(1) if match else ''




0               n><m><sg><nom>
1             prn><itg><m><sg>
2          vblex><pri><p2><sg>
3                    n><f><sg>
4          vblex><pri><p3><pl>
                  ...         
1285716         n><m><pl><nom>
1285717                   adv>
1285718    vblex><pri><p3><sg>
1285719          det><def><sp>
1285720           vblex><pres>
Name: lexeme_string, Length: 1285720, dtype: object

In [70]:
words = df['lexeme_string'].apply(extract_right_of_lt)
words = words.str.replace("<"," ")
words = words.str.replace(">","")
words = words.str.replace("*","")
words = words.str.replace("/","")
words

0               n m sg nom
1             prn itg m sg
2          vblex pri p2 sg
3                   n f sg
4          vblex pri p3 pl
                ...       
1285716         n m pl nom
1285717                adv
1285718    vblex pri p3 sg
1285719         det def sp
1285720         vblex pres
Name: lexeme_string, Length: 1285720, dtype: object

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorized_words = vectorizer.fit_transform(words)

In [72]:
len(vectorizer.get_feature_names_out())

3138

In [None]:
# no longer optimizing this for now,
# i want to talk about specifying the model
# we'll return to this

In [73]:
# should start with a really simple feature vector
# (history_seen, history_correct)

simple_df = df[['p_recall','history_seen','history_correct']]

# input_dim = 2

In [None]:
# h_hat = 2**(theta*x)

# we need an error function to quantify how wrong we are

# (p_recall - predicted_p_recall)**2 <- error
# predicted_p_recall = 2**(-1*(delta/predicted_half_life))
# predicted_half_life = 2**(theta*x)


# given
# p_recall is given
# delta is given
# x is given

# we don't have theta

# we are going initialize theta with some random numbers
# so then we have theta

# once we have theta we can calculate the error
# and start learning

# to-do

# write formulas for predicted half_life and predicted p_recall
# write formula for loss function
# and import an optimizer 
# run the optimizer on the loss function + our data

In [None]:
# going to use pytorch
# we are using custom loss function and our model is not one of the standard ML models, like linear regression 

In [74]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
class SpacedRepetition(nn.Module):
    def __init__(self, input_dim):
        super(SpacedRepetition, self).__init__()
        self.theta = nn.Linear(input_dim, 1, bias=False)

    def forward(self, x):
        # estimating h_hat = 2^(theta . x)
        theta_x = self.theta(x)  # dot product of theta and x
        h_hat = torch.pow(2, theta_x)
        return h_hat

def custom_loss(p, p_hat, h, h_hat, theta, alpha, lambda_reg):
    loss_p = torch.mean((p_hat - p) ** 2)
    loss_h = torch.mean((h_hat - h) ** 2)
    reg_term = lambda_reg * torch.sum(theta ** 2)
    total_loss = loss_p + alpha * loss_h + reg_term
    return total_loss

In [None]:
# object oriented programming

# python is a very flexible programming language

# one thing it lets you do is define things called Classes

# suppose you are a game developer
# designing a world for your game
# and your world has trees
# you write a tree class
# tree class defines what attributes trees can have
# tree: height, color, bears_fruit, number of leaves, ...

# tree_A = tree(height = 100, color = green, bears_fruit = false, number of leaves = 1800)

# tree class exists
# new class called MagicTrees

# class MagicTree(Tree):
# super(MagicTree, self).__init__()

# MagicTree is called a sub-class of Tree
# Tree is a superclass of MagicTree

# linear regression is implemented as a class
# result = smf.ols("y ~ x",data=df)
# result.summary
# result.params
# ....

# class SpacedRepetition(nn.Module)
# super(SpacedRepetition, self).__init__()

# inheritance


# __init__(self, input_dim) "dunder method" "double underscore method"


In [None]:
mymodel = SpacedRepetition(2)