## Before run

```
# create a python virtualenv for separating the dependencies
virtualenv virtualenv

# enter the virtualenv
source virtualenv/bin/activate

# install the dependencies
pip install -r requirements.txt
```

Please run the first frame.


# First, check that the dataset has been installed on your computer

In [1]:
import os

DATA_FOLDER = './data'

log = print

if not os.path.exists(DATA_FOLDER):
    os.mkdir(DATA_FOLDER)

if not os.path.exists(DATA_FOLDER + '/evaluation'):
    os.mkdir(DATA_FOLDER + '/evaluation')

if not os.path.exists(f'{DATA_FOLDER}/Info_UserData.csv'):
    import kaggle

    kaggle.api.authenticate()
    log('Download dataset...')
    kaggle.api.dataset_download_files('junyiacademy/learning-activity-public-dataset-by-junyi-academy',
                                      path=DATA_FOLDER, unzip=True)
    log('Dataset downloaded.')
else:
    log("The dataset is present on the machine.")

The dataset is present on the machine.


In [2]:
print('Files in the dataset: ')
for dirname, _, filenames in os.walk(DATA_FOLDER):
    for filename in filenames:
        print(os.path.join(filename))

Files in the dataset: 
Info_Content.csv
Info_UserData.csv
Log_Problem.csv
clusters.csv
clusters_.csv
clusters_all.csv
Info_Content.csv
Info_Content_subset.csv
Info_UserData.csv
Info_UserData_subset.csv
Log_Problem.csv
Log_Problem_subset - Copy.csv
Log_Problem_subset.csv
Log_Problem_subset.csv.bak
reduced.csv
eval_error_5splitss.txt
eval_mean_errors_5splits.txt
X_subset_w_level_id2_level_id3.pkl
y_subset_w_level_id2_level_id3.pkl
data_loader.cpython-310.pyc
data_preprocessing.cpython-310.pyc
data_visualization.cpython-310.pyc
feature_categorization.cpython-310.pyc


## Preprocess data
Includes functions to Ordinal Encode, One-hot encode, scale and perform feature engineering

In [3]:
import warnings
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler

from data_visualization import plot_columns_of_df

warnings.filterwarnings("ignore")


####Preproccesing steps
# 0. Load data
# 1. Categorize features and encode accordingly
# 2. Handle missing data
# 3. Visualize
# 4. Identify and remove outliers
# 5. Potentially enrich data with feature engineering

def scale(df, scaler="standard"):
    """
    Standardize data with scikit learn scaler
    :param df: dataframe with data to be scaled
    :param scaler: method of scaling
    :return: scaled df
    """
    if scaler == "standard":
        scaler = StandardScaler()  # z = (x - u) / s, where u is column mean, and s is std. dev
    elif scaler == "minmax":
        scaler = MinMaxScaler(feature_range=(0, 1))  # scales columns to given range.
    else:
        scaler = MaxAbsScaler()

    scaler.fit(df)
    scaled = scaler.fit_transform(df)
    return pd.DataFrame(scaled, columns=df.columns)


def unix_encode(df, column_names=[]):
    """
    Encode date to unix format (sec/min/hour since unix epoch), that can be used for clustering/prediction
    A kind of ugly implementation trying to speed up to_datetime by providing the relevant time formats
    (There is a 10-100x speed up to give correct format)
    :param df:
    :param column_names:
    :return:
    """
    time_columns = np.array([])
    t_formats = ['%Y-%m-%d', '%Y-%m-%d %H:%M:%S %Z']
    for column in column_names:
        success = False
        # temp = (pd.to_datetime(df[column].iloc[:, 0]) - pd.Timestamp("1970-01-01"))
        for t_format in t_formats:
            if not success:
                try:
                    temp = pd.to_datetime(df[column], format=t_format)  # Takes 15 sec for subset on df_pr
                    success = True
                except:
                    print("incorrect time format {}".format(t_format))

        temp -= temp.min()
        temp //= pd.Timedelta('15m')
        if time_columns.shape[0] == 0:
            time_columns = np.array(temp)
        else:
            time_columns = np.vstack((time_columns, temp))

    return pd.DataFrame(time_columns.T, columns=column_names)


def ordinal_encode(df, column_names=[]):
    """
      ordinal-encode features
      :param df: dataframe with only the columns to be one-hot-encoded
      :param column_names: names of the columns to be one-hot-encoded
      :return: one-hot-encoded columns ready to be concatenated with original dataframe
      """
    encoded_data = np.array([])
    mappings = \
        {
            'difficulty': {'easy': 0, 'normal': 1, 'unset': 1, 'hard': 2},
            'learning_stage': {'elementary': 0, 'junior': 1, 'senior': 2}
        }
    for column in column_names:
        try:
            mapping = mappings[column]
        except:
            print(
                "Please add ordinal mapping to ordinal_dict in 'ordinal_encode'-function in data/data_preprocessing.py")
        temp = list(map(lambda x: mapping[x], df[column]))
        if encoded_data.shape[0] == 0:
            encoded_data = np.array(temp)
        else:
            encoded_data = np.vstack((encoded_data, temp))
    df_encoded = pd.DataFrame(encoded_data.T, columns=column_names)
    return df_encoded, column_names


def one_hot_encode(df, column_names=['gender', 'is_self_coach']):
    """
    one-hot-encode categorical features
    :param df: dataframe with only the columns to be one-hot-encoded
    :param column_names: names of the columns to be one-hot-encoded
    :return: one-hot-encoded columns ready to be concatenated with original dataframe
    """
    enc = OneHotEncoder()
    enc.fit(df)
    # print(enc.categories_)
    column_names = enc.get_feature_names(column_names)

    encoded_data = enc.transform(df).toarray()
    df_encoded = pd.DataFrame(encoded_data, columns=column_names)
    return df_encoded, column_names, enc


def extract_additional_user_features(df_u, df_problems, df_content):
    """
    intendend purpose: extract user features from problems. Potentially using map_reduce
    # exercise_per_level
    # avg_time_spend
    # avg_correct
    # problem_solved
    :return:
    """

    # Get features based on content
    problem_content = df_problems.merge(df_content)
    encoded_df, cols = ordinal_encode(problem_content[["difficulty", "learning_stage"]],
                                      column_names=["difficulty", "learning_stage"])
    problem_content[cols] = encoded_df[cols]
    problem_content_grouped = problem_content.groupby("uuid").agg({
        "is_correct": "mean",
        "total_sec_taken": "mean",
        "upid": "count",
        "level": ["mean", "max"],
        "used_hint_cnt": "mean",
        "difficulty": "mean",
        "learning_stage": "mean",
    })
    cols = [
        "correct_percentage",
        "time_spent",
        "problems_attempted",
        "average_level", "max_level",
        "average_hints",
        "avg_difficulty",
        "avg_learning_stage",
    ]
    problem_content_grouped.columns = cols

    users = df_u.merge(problem_content_grouped, left_on="uuid", right_on="uuid")

    return users


def _extract_additional_user_features(df_u, df_problems, df_content):
    """
    intendend purpose: extract user features from problems. Potentially using map_reduce
    # exercise_per_level
    # avg_time_spend
    # avg_correct
    # problem_solved
    :return:
    """

    os.system("python sandbox/MapReduceSandbox.py data/csv_files/Log_Problem.csv > data/csv_files/reduced.csv")
    reduced = pd.read_csv("data/csv_files/reduced.csv",
                          names=["uuid", "problems_attempted", "time_spent", "average_level", "correct_percentage",
                                 "max_level", "avg_learning_stage", "avg_difficulty", "average_hints"], index_col=False)


    users = df_u.merge(reduced, left_on="uuid", right_on="uuid")

    return users


def extract_additional_problem_features(df_ex):
    """
    :param df_ex:
    :return:
    """


def preprocess_df(df, o_features) -> pd.DataFrame:
    """

    :param df:
    :param o_features: object features as described in data/feature_categorization.py
    :return:
    """
    f, f_time, f_OR, f_OH, f_meta, f_scale = o_features.features, o_features.features_to_be_time_encoded, o_features.features_to_be_OR_encoded, o_features.features_to_be_OH_encoded, o_features.features_meta, o_features.features_to_be_scaled
    df_f, df_time, df_OR, df_OH, df_meta, df_scale = df[f], df[f_time], df[f_OR], df[f_OH], df[f_meta], df[f_scale]

    if f_time:  # if not empty then unix encode
        df_time = unix_encode(df_time,
                              f_time)  # Takes a significant amount of time to compute (for subset it is approx 20 seconds)

    if f_scale:
        df_scale = scale(df_scale)

    if f_OR:
        df_OR, column_names_OR = ordinal_encode(df_OR, f_OR)

    if f_OH:
        for column in df_OH.columns:
            if df_OH[column].hasnans:
                df_OH[column] = df_OH[column].astype('string').fillna("unspecified")

        df_OH, column_names_OH, enc_OH = one_hot_encode(df_OH, f_OH)
    dfs = [df_f, df_time, df_OR, df_OH, df_meta, df_scale]
    df_counter = 0
    for df_ in dfs:
        if df_.shape[1] != 0:
            df_counter += 1
            if df_counter == 1:
                df_final = df_
            else:
                df_final = pd.concat([df_final, df_], axis=1)
    return df_final


def remove_outliers_by_quantile(df, columns, quantiles):
    for column, quantile_ in zip(columns, quantiles):
        index_names = df[(df[column] > df[column].quantile(quantile_))].index
        df.drop(index_names, inplace=True)
    return df

class U_features():
    def __init__(self,
                 features=['user_grade', 'has_teacher_cnt', 'has_student_cnt', 'has_class_cnt',
                           "correct_percentage", "problems_attempted", "average_level", "max_level",
                           "average_hints", "avg_difficulty", "avg_learning_stage"],
                 features_to_be_time_encoded=['first_login_date_TW'], features_to_be_OR_encoded=[],
                 features_to_be_OH_encoded=['gender', 'user_city', 'is_self_coach'], features_meta=['uuid'],
                 features_to_be_scaled=['points', "correct_percentage", "time_spent", 'belongs_to_class_cnt',
                                        "badges_cnt"]):
        self.features = features
        self.features_to_be_time_encoded = features_to_be_time_encoded
        self.features_to_be_scaled = features_to_be_scaled
        self.features_to_be_OR_encoded = features_to_be_OR_encoded
        self.features_to_be_OH_encoded = features_to_be_OH_encoded
        self.features_meta = features_meta

# Load data
Now, we load data from the 3 files: Users, Problems and Content

In [4]:
def load_data_raw():
    df_u = pd.read_csv(f'{DATA_FOLDER}/Info_UserData.csv')
    df_pr = pd.read_csv(f'{DATA_FOLDER}/Log_Problem.csv')
    df_ex = pd.read_csv(f'{DATA_FOLDER}/Info_Content.csv')
    return df_u, df_pr, df_ex


def remove_problems_with_total_time_outliers(df_pr):
    limit = np.quantile(df_pr['total_sec_taken'], 0.98)
    new_df_pr = df_pr[df_pr['total_sec_taken'] < limit]
    return new_df_pr

# if `log` is undefined then please run the first frame
log("Loading data...")
df_u, df_pr, df_c = load_data_raw()

Loading data...


## Extract additional features

To improve the user clustering, we extract additional features

OBS: You can only run `USE_MAPREDUCE` if you have MrJob installed on your computer - and if you have the file `data/MapReduceSandbox.py` from the project. MapReduce takes ~20 times as long, so be warned.

In [25]:
df_pr = remove_problems_with_total_time_outliers(df_pr)

log("Feature extraction...")

USE_MAPREDUCE = False
if USE_MAPREDUCE:
    X = _extract_additional_user_features(df_u, df_pr, df_c)
else:
    X = extract_additional_user_features(df_u, df_pr, df_c)

Feature extraction...


In [8]:
X

Unnamed: 0,uuid,gender,points,badges_cnt,first_login_date_TW,user_grade,user_city,has_teacher_cnt,is_self_coach,has_student_cnt,belongs_to_class_cnt,has_class_cnt,problems_attempted,time_spent,average_level,correct_percentage,max_level,avg_learning_stage,avg_difficulty,average_hints
0,Y2RcCdmUJAYPUAIDElo4nE9KrkLLFzUIRdexG+ipaZQ=,,18300,1,2019-01-24,1,kh,0,False,0,0,0,1,43.000000,1.000000,1.000000,1.0,1,0.0,1.000000
1,J7Tbo1x2WtRpPuXeX7lWT9tkzWlSJeubl8UWjNmHh+4=,,15525,1,2019-01-24,2,ntpc,0,False,0,0,0,1,31.000000,0.000000,1.000000,1.0,0,0.0,0.000000
2,6Dm50WJCc9EQjr4Ar8SPukhnsTeS+kwX+9FyUI1o57k=,,101353,11,2019-01-24,2,kh,1,False,0,1,0,3,22.666667,0.333333,0.666667,1.0,1,0.0,0.333333
3,ADL8ZENEcGUW3bCQg4t8gA0tJR9R6H5OYwr+TCPb5oY=,male,7503,0,2019-01-24,2,kh,1,False,0,1,0,1,32.000000,0.000000,1.000000,1.0,0,0.0,0.000000
4,5eCvpCJiXsYg8jNhQAz8JltX6G0LSWpFb86a2GVuinA=,,12825,2,2019-01-24,3,ntpc,1,False,0,1,0,1,9.000000,0.000000,1.000000,1.0,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54299,yY0/JhgPot0/jgNGGKXiM+r47EA6HyxjxL5HBCe5cJE=,,6300,0,2019-01-23,9,ylc,1,False,0,0,0,2,55.000000,0.000000,1.000000,1.0,0,1.0,0.000000
54300,1ISypO5ve8GgsJ2bgEW87eqdOIEAEqPfWeea52SndWo=,male,32937,7,2019-01-23,9,ttct,1,False,0,1,0,1,15.000000,0.000000,1.000000,1.0,0,1.0,0.000000
54301,xoUBGAzyfHBkDa/YjwWn3zpZCzr3FUpAFd8qMj2fIcA=,male,17453,3,2019-01-23,9,chc,1,False,0,1,0,1,35.000000,0.000000,0.000000,1.0,0,1.0,1.000000
54302,ESAeVlIpxgtgb2EtZb8K+plFLqKch5ffAgRzKnQqTQ4=,,2393,2,2019-01-23,10,tc,1,False,0,1,0,2,15.000000,0.500000,1.000000,1.0,1,1.0,0.000000


## Preprocess data

In [9]:
log("Preprocessing...")
user_features = U_features()
X: pd.DataFrame = preprocess_df(df=X, o_features=user_features)

Preprocessing...


## Custom Cure Implementation
Below is our custom CURE implementation, which will be used to perform clustering.

In [10]:
import numpy as np,random
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics.pairwise import pairwise_distances
random.seed(3)
np.random.seed(3)
""" Sample size for initial clustering """
CURE_SAMPLE = lambda size: int(size / 5) if size < 1000 else int(size / 50)

""" Number of (calculated, random) representative points """
CURE_REPRES = lambda sample_len: (10, 5)  # (calculated, random)


def random_data_split(data: np.ndarray, n: int):
    maxi = len(data)
    indices = np.random.choice(maxi, n, replace=False)

    def inverze():
        mask = np.full(maxi, True)
        mask[indices] = False
        return mask

    return data[indices], data[inverze()]


def _cure_initial_clustering(sample: np.ndarray):
    """Initial clustering
    idea:
    - do a clustering
    - drop tiny clusters
    - calculate average distances in good-looking one
    - do the sample clustering with DBSCAN
    """

    def eps_calculation():
        n_clusters = 4  # magic number
        labels = KMeans(n_clusters=n_clusters, random_state=0, max_iter=500).fit(sample).labels_
        partition = lambda k: sample[labels == k]

        def cluster_averages():
            for k in range(labels.max() + 1):
                ds = np.triu(pairwise_distances(partition(k)))
                yield np.average(ds[ds != 0])

        averages = sorted(list(cluster_averages()))
        arbitrary_choice = averages[1]
        return arbitrary_choice

    # intentionally not using AgglomerativeClustering
    sample_len = len(sample)
    correction = 0.5 if sample_len > 700 else 0.3
    eps = correction * eps_calculation()
    labels = DBSCAN(eps=eps, min_samples=10).fit(sample)
    return labels.labels_


def cure_classify(x: np.ndarray, representatives: np.ndarray) -> np.ndarray:
    labels = np.ndarray(shape=(len(x),), dtype=int)

    rss = np.concatenate(representatives)
    indexed_rs = [
        partition
        for partition, rs in enumerate(representatives)
        for _ in rs
    ]

    dss = pairwise_distances(x, rss)
    for i in range(len(x)):
        labels[i] = indexed_rs[dss[i].argmin()]

    return labels


def cure_representatives(x: np.ndarray):
    # Checked initial clustering
    while True:  # retry mechanism
        try:
            sample_num = CURE_SAMPLE(len(x))
            sample, remainder = random_data_split(x, sample_num)
            sample_labels = _cure_initial_clustering(sample)
            if sample_labels.max() >= 1:  # sanity check
                break
            else:
                print("Inappropriate initial clusters, retrying")
        except ValueError:
            print("Failed EPS calculation, retrying")

    sample_partition = lambda k: sample[sample_labels == k]
    centroid = lambda x: np.average(x, axis=0)

    def calc_borderline(x: np.ndarray, desired: int):
        cent = centroid(x)
        # centroid <-> point distances
        cent_point_dist: list[tuple[np.ndarray, float]] = sorted(
            zip(x, pairwise_distances(x, [cent]).flatten()),
            key=lambda p: p[1]
        )

        # furthest points from centroid
        borderline: list[np.ndarray] = [cent_point_dist.pop()[0]]

        half = int(len(cent_point_dist) / 2)
        # an educated guess to reduce the number of points
        cent_point_dist = cent_point_dist[half:]

        while len(borderline) < desired:
            def repres_distance(point):  # point <-> borderline "distances"
                return pairwise_distances(borderline, [point]).sum()

            ds = list(map(lambda pair: (pair[0], repres_distance(pair[0])), cent_point_dist))
            ds = sorted(ds, key=lambda p: p[1])
            borderline.append(ds[-1][0])

        for vec in borderline:
            vec_towards_centroid = cent - vec
            yield vec + vec_towards_centroid / 4

    def representatives(x) -> np.ndarray:
        borderline_num, randoms_num = CURE_REPRES(len(x))
        randoms, _ = random_data_split(x, randoms_num)
        borderline = list(calc_borderline(x, borderline_num))
        return np.concatenate([randoms, borderline])

    rs = [
        representatives(sample_partition(k))
        for k in range(sample_labels.max() + 1)
    ]

    return rs


## Splitting users
Before clustering, users are split into 5 categories. Afterwards, the CURE clustering is run.

In [11]:
def split_users_by_grade(df):
    split1 = df[df["user_grade"] < 4]
    split2 = df[df["user_grade"] == 5]
    split3 = df[df["user_grade"] == 6]
    split4 = df[df["user_grade"] == 7]
    split5 = df[df["user_grade"] > 7]
    return [split1, split2, split3, split4, split5], ["split1", "split2", "split3", "split4", "split5"]


def cure_clustering(df: pd.DataFrame):
    data = df.drop(columns=["uuid"]).to_numpy()
    cure_repres = cure_representatives(data)
    cluster_labels = cure_classify(data, cure_repres)
    n_clusters = cluster_labels.max() + 1
    partition = lambda k: data[cluster_labels == k]
    similarities = [pairwise_distances(partition(k)) for k in range(n_clusters)]
    sim_users = [df["uuid"][cluster_labels == k] for k in range(n_clusters)]
    return cluster_labels, similarities, sim_users


USE_USER_USER_SIMILARITY = True
all_split_labels = []
all_split_similarities = []
all_split_sim_users = []

log("Splitting users...")
dfs, labels = split_users_by_grade(X)
del X

log('Cluster each instance of the split...')
for df, label in zip(dfs, labels):
    log("Getting clusters for {} users...".format(label))
    cluster_labels, similarities, sim_users = cure_clustering(df)
    all_split_labels.append(cluster_labels)
    all_split_similarities.append(similarities)
    all_split_sim_users.append(sim_users)

Splitting users...
Cluster each instance of the split...
Getting clusters for split1 users...
Getting clusters for split2 users...
Getting clusters for split3 users...
Getting clusters for split4 users...
Getting clusters for split5 users...


## Evaluate clusters
Now, we evaluate the clusters based on Davies-Bouldin score and centroid differences.

In [14]:
from sklearn.metrics import davies_bouldin_score


def evaluate_clusterings(combined):
    for df, split, labels in combined:
        df = df.drop(columns=["uuid", "first_login_date_TW"])
        data = df.to_numpy()
        log(f"Split: {split} | Davies-Bouldin score: ", davies_bouldin_score(data, labels))
        centroids = [np.average(data[labels == k], axis=0) for k in range(labels.max() + 1)]

        def interesting_columns(x: np.ndarray):
            return list(filter(lambda pair: pair[1] > 0.07, zip(df.columns, x)))

        c0 = centroids[0]
        log(f"Centroid differences (for {len(centroids)} clusters):")
        for i in range(1, min(4, len(centroids))):
            c1 = centroids[i]
            diff = interesting_columns(c0 - c1)
            diff = sorted(diff, key=lambda p: p[1], reverse=True)[:3]
            log(diff)
        log('---')

log('Evaluate clusters in each split')
log('---')
evaluate_clusterings(zip(dfs, labels, all_split_labels))

Evaluate clusters in each split
---
Split: split1 | Davies-Bouldin score:  20.03571091366961
Centroid differences (for 5 clusters):
[('problems_attempted', 0.8307715974445031), ('has_student_cnt', 0.32342093215925327), ('gender_unspecified', 0.08641757932175942)]
[('problems_attempted', 12.31790686026934), ('avg_learning_stage', 0.5623079382115996), ('has_student_cnt', 0.4670455544888016)]
[('problems_attempted', 5.9774523263262775), ('avg_learning_stage', 0.6530265253198464), ('has_student_cnt', 0.6261284619686445)]
---
Split: split2 | Davies-Bouldin score:  6.580712555735007
Centroid differences (for 5 clusters):
[('problems_attempted', 17.489206333761487), ('avg_learning_stage', 0.5153645812063183), ('has_student_cnt', 0.4990898298518053)]
[('problems_attempted', 25.456553252496217), ('avg_learning_stage', 0.8029607965847395), ('points', 0.7268861069679096)]
[('problems_attempted', 23.776854747985773), ('avg_learning_stage', 0.8077654176017228), ('points', 0.5112431032542564)]
---
S

## Recommender system
Below is the code used for the recommender system

In [15]:
from tqdm import tqdm
from collections import defaultdict

def construct_util_matrix(users,problems,test_index):
    unique_problems = problems['upid'].unique()
    M = np.empty((len(users),len(unique_problems)))
    M_test = np.zeros_like(M)
    for i, userID in enumerate(tqdm(users['uuid'],desc="Func: construct_util_matrix()")):
        user_problems = problems.loc[problems['uuid'] == userID]
        if not user_problems.empty:
            max_time = max(user_problems['total_sec_taken'])
            user_prob_idx = []
            user_difficulties = []
            for j in user_problems.index.to_list():
                problem_ID = user_problems.loc[[j],'upid'].values[0]
                problem_index = np.where(unique_problems == problem_ID)[0][0]
                user_prob_idx.append(problem_index)
                time = user_problems.loc[[j],'total_sec_taken'].values[0]
                is_correct = user_problems.loc[[j],'is_correct'].values[0]
                user_difficulties.append(get_difficulty(time,max_time,is_correct))
                M[i, problem_index] = user_difficulties[-1]

            #standardize difficulty values between [0,1]
            min_ = np.min(user_difficulties)
            max_ = np.max(user_difficulties)
            M[i,user_prob_idx] = (M[i,user_prob_idx] - min_) / (max_)
            # Save 'true' difficulty measures and remove from utility matrix
            for idx, prob_idx in enumerate(user_problems.index.to_list()):
                if prob_idx in test_index:
                    M_idx = user_prob_idx[idx]
                    M_test[i, M_idx] = M[i, M_idx]
                    M[i, M_idx] = 0.0
        else:
            #M = np.delete(M, i, axis=0)
            pass
    return M, M_test, unique_problems

def get_difficulty(time,max_time,is_correct,alpha=0.8):
    """
    :param time: Time (sec) spent on a given problem for a given student
    :param max_time: Maximum time (sec) a given student has used on any problem
    :param is_correct: boolean if the student solved the given problem (1) or not (0)
    :param alpha: time importance weight
    :defined in func beta: correctness importance weights (Note weights should sum to one)
    :return: Relative difficulty of a problem for a given student
    """
    beta = 1-alpha
    difficulty = 1-((max_time - time) / max_time)*alpha - beta*is_correct
    return difficulty

def split_data(problems):
    # Sort problems by timestamp - make sure that we only consider problems for the relevant users!
    pr_user = problems.sort_values(by='timestamp_TW',ascending=False)
    # Find out how many problems each user have attempted
    prob_per_user = pr_user.groupby('uuid').size()
    # Find 80 % quantile of how many problems each user have attempted
    cutoff_problems = np.sort(prob_per_user.to_numpy())[int(len(prob_per_user)*0.8)]
    # Index of all users that have attempted 'cutoff_problems' or more problems
    user_idx = prob_per_user[prob_per_user>=cutoff_problems].index
    # Find out how many users have attempted each problem
    solved_per_prob = pr_user.groupby('upid').size()
    # Find 80 % quantile of how many users have attempted each problem
    cutoff_users = np.sort(solved_per_prob.to_numpy())[int(len(solved_per_prob) * 0.8)]
    # Index of all problems where cutoff_users or more users have attempted the problem
    prob_idx = solved_per_prob[solved_per_prob>=cutoff_users].index
    # Filter data frame on both users and problems
    user_sample = pr_user[pr_user['uuid'].isin(user_idx)]
    prob_sample = user_sample[user_sample['upid'].isin(prob_idx)]
    # Sample the last 5 attempted problems for each filtered user
    test_index = prob_sample.groupby('uuid').head(int(cutoff_problems/5)).index.to_list()
    return test_index

def get_utility_matrix_shape(clusters,df_pr,subset=False):
    if subset:
        problems = defaultdict(list)
        for userID in tqdm(clusters['uuid'],desc="Func: get_utility_matrix_shape()"):
            user_info = df_pr.loc[df_pr['uuid'] == userID]
            # for problem in user_info['upid']:
            if user_info['upid'].shape[0]:
                problems[str(user_info['upid'])] = 1
        sub_problems = df_pr['upid'].isin(problems)
        df_pr_sub = df_pr.loc[sub_problems]
        return clusters.shape[0], len(df_pr_sub['upid'].unique())
    else:
        return clusters.shape[0], len(df_pr['upid'].unique())


def generate_utility_matrix_for_one_cluster(clusters,cluster_id,df_u_full,df_pr_full):
    cluster = clusters.loc[clusters['labels'] == cluster_id]
    sub_users = df_u_full['uuid'].isin(cluster['uuid'].to_list())
    df_u_sub = df_u_full.loc[sub_users]

    #problems = []
    # for userID in tqdm(df_u_sub['uuid'],desc="Func: generate_utility_matrix_for_one_cluster()"):
    #     user_info = df_pr_full.loc[df_pr_full['uuid'] == userID]
    #     for problem in user_info['upid']:
    #         problems.append(problem)
    # sub_problems = df_pr_full['upid'].isin(problems)
    # df_pr_sub = df_pr_full.loc[sub_problems]
    df_pr_sub = df_pr_full.loc[df_pr_full['uuid'].isin(df_u_sub['uuid'])]
    test_index = split_data(df_pr_sub)
    M, M_test, unique_prob_ids = construct_util_matrix(df_u_sub, df_pr_sub,test_index)
    return M, M_test, df_u_sub['uuid'], unique_prob_ids


def get_psedu_problem_difficulties(M, M_test,user_user_similarities_matrix,use_user_user_similarity):
    errors = []
    recommendations = np.zeros_like(M)  # Create copy of utility matrix, so that 'predictions' are not used when aggregating
    for user_idx in tqdm(range(np.shape(M)[0]),desc="Func: get_psedu_problem_difficulties()"):
        user_similarity_vec = user_user_similarities_matrix[user_idx,:]
        recommendations[user_idx, :], error = get_psedu_problem_difficulties_for_single_user(user_idx, M, M_test,user_similarity_vec,use_user_user_similarity)
        errors.append(error)
    return recommendations, errors



def get_psedu_problem_difficulties_for_single_user(user_idx, M, M_test,user_sim_vector,use_user_user_similarity=False):
    # Now that we have a utility matrix, we need to fill all empty entries
    #M2 = np.copy(M)  # Create copy of utility matrix, so that 'predictions' are not used when aggregating
    n_user = np.shape(M)[0]
    n_problems = np.shape(M)[1]
    user_recommendations = np.zeros(n_problems)
    unsolved_problems = np.argwhere( M[user_idx, :] == 0.0)
    unsolved_problems = [u[0] for u in unsolved_problems]
    relevant_user_ids = list(range(0,user_idx))+list(range(user_idx+1,n_user))
    relevant_M = M[relevant_user_ids,:][:,unsolved_problems]
    if use_user_user_similarity:
        #We need to turn distance measures into similarity, which is done by subtracting the distance from the maximum distance and normalizing
        relevant_user_sim_vector = user_sim_vector[relevant_user_ids].max()-user_sim_vector[relevant_user_ids]
        relevant_user_sim_vector =  ( relevant_user_sim_vector - np.min(relevant_user_sim_vector) ) / np.std(relevant_user_sim_vector)
        #Next we want want to weight the non-zero inputs, such that the weights are equal to 1, to keep the scale of the difficuly.
        weight_scale =  relevant_user_sim_vector @ (relevant_M > 0)
        user_recommendations[unsolved_problems] = np.dot(relevant_user_sim_vector,relevant_M) / weight_scale #np.sum(relevant_M != 0, axis=0)
    else:
        user_recommendations[unsolved_problems] = np.sum(relevant_M, axis=0) / np.sum(relevant_M != 0, axis=0)

    user_recommendations[np.isnan(user_recommendations)] = 0
    test_idx = M_test[user_idx, :].nonzero()
    errors = np.abs(user_recommendations[test_idx]-M_test[user_idx,test_idx])
    return user_recommendations, errors

def get_recommendation(difficulty_matrix,quantile=0.80,recommendations_to_return = 1):
    if len(difficulty_matrix.shape) == 1:
        difficulty_matrix = np.reshape(difficulty_matrix,(1,-1))

    n_problems = np.min(np.sum(~np.isnan(difficulty_matrix),axis=1))
    sorted_indices = np.argsort(difficulty_matrix,axis=1)

    quantile_idx = int(n_problems*quantile)

    #indices of problems being recommended (based on the initial ordering in difficulty_matrix)
    problem_indices = sorted_indices[:,quantile_idx:quantile_idx+recommendations_to_return]
    #test = difficulty_matrix[:,problem_indices]
    difficulties_of_recommendations = [difficulty_matrix[user_idx, prob_idx] for (user_idx, prob_idx) in enumerate(problem_indices)]
    return difficulties_of_recommendations, problem_indices

## Create utility matrix based on clustering and problem,user pairs

In [16]:
def bind_labels_and_uuid(cluster_labels, sim_users):
    cluster_partitions = [cluster_labels[cluster_labels == i] for i in range(cluster_labels.max() + 1)]
    clusters = pd.DataFrame([np.hstack(cluster_partitions), np.hstack(sim_users)]).T
    clusters.columns = ["labels", "uuid"]
    return clusters


log("Binding clusters labels and uuids for all splits")
all_segment_clusters = [bind_labels_and_uuid(c_labels, s_users) for (c_labels, s_users) in
                        zip(all_split_labels, all_split_sim_users)]


def run_and_evaluate_recommender_system(clusters, df_pr, df_u, user_user_similarities, cluster_id=0,
                                        use_user_user_similarity=False):
    M, M_test, U1_ids, P1_ids = generate_utility_matrix_for_one_cluster(clusters=clusters, df_u_full=df_u,
                                                                        df_pr_full=df_pr, cluster_id=cluster_id)

    cluster_user_user_similarity = user_user_similarities[cluster_id]

    difficulties_for_all_users, errors_all = get_psedu_problem_difficulties(M, M_test, cluster_user_user_similarity,
                                                                            use_user_user_similarity)
    errors = [item[0] for sublist in errors_all for item in sublist if len(item) > 0]
    mean_abs_error = np.mean(errors)
    recommendation_difficulty_for_all_users, recommendation_idx_all = get_recommendation(difficulties_for_all_users)
    num_users = clusters.loc[clusters['labels'] == cluster_id].shape[0]
    print("Mean absolute error of difficulty was {} for cluster {} with {} users and use_similarrity={}".format(
        mean_abs_error, cluster_id, num_users, str(use_user_user_similarity)))
    return mean_abs_error, errors, recommendation_difficulty_for_all_users, recommendation_idx_all, np.mean(
        difficulties_for_all_users[difficulties_for_all_users > 0])


# Run all splits, and all clusters
mean_errors = []
for split_idx, clusters_ in enumerate(all_segment_clusters):
    df_u_split = dfs[split_idx]
    df_p_split = df_pr.loc[df_pr['uuid'].isin(df_u_split['uuid'])]
    similarities_ = all_split_similarities[split_idx]
    for cluster_idx in tqdm(range(len(similarities_)), desc="running cluster"):
        mean_abs_error, errors, recommendation_difficulty_for_all_users, recommendation_idx_all, mean_difficulty = run_and_evaluate_recommender_system(
            clusters_, df_p_split, df_u_split, similarities_, cluster_idx, USE_USER_USER_SIMILARITY)
        mean_errors.append(mean_abs_error)
        with open(f'{DATA_FOLDER}/evaluation/eval_mean_errors_5splits.txt', 'a') as f:
            f.write(
                "split_id: {},split_size: {}, cluster_id: {},cluster_u_size {}, n_errors: {}, mean_error: {}, mean_difficulty: {}, mean_recommendation_difficulty: {}\n".format(
                    split_idx, df_u_split.shape[0], cluster_idx, similarities_[cluster_idx].shape[0],
                    len(errors), np.round(mean_abs_error, 5), np.round(mean_difficulty, 5),
                    np.round(np.mean(recommendation_difficulty_for_all_users), 5)))
        with open(f'{DATA_FOLDER}/evaluation/eval_error_5splitss.txt', 'a') as f:
            f.write("split_id: {}, cluster_id: {}, errors {}\n".format(split_idx, cluster_idx, errors))
print("Mean absolute errors for the different splits {}".format(mean_errors))

Binding clusters labels and uuids for all splits


running cluster:   0%|          | 0/5 [00:02<?, ?it/s]

KeyboardInterrupt



Finally, we see an error rate of ~0.1