**Steps that need to be done for preprocessing:** <br>
1: Remove the students whose number of activities is below a threshold<br>
2: Divide the dataset into train and test sets<br>
3: Create a Q-matrix<br>

In [None]:
import traceback
def prvar(__x):
    """Debug helper: print the text of the calling expression next to its value.

    The source text of the line that called this function is looked up from
    the stack, its first six characters and its last character are removed
    (for a line written as ``prvar(expr)`` this leaves ``expr``), and the
    remainder is printed followed by ``=`` and the value of ``__x``.
    """
    # The stack must be inspected here (not in a helper) so that entry 0 is
    # the caller's frame and entry 1 is this function's own frame.
    caller_frame = traceback.extract_stack(limit=2)[0]
    call_text = caller_frame[3]
    label = call_text[6:][:-1]
    print(label, "=", __x)

import numpy as np
import pandas as pd
import math
import csv
import os
import json

from scipy import sparse

In [None]:
def _read_kdd_file(file_path, kc_col_name):
    '''Load one tab-separated KDD Cup file and keep only the (renamed) columns used downstream.'''
    column_names = {
        'Anon Student Id': 'user_id',
        'Problem Name': 'pb_id',
        'Step Name': 'step_id',
        kc_col_name: 'kc_id',
        'First Transaction Time': 'timestamp',
        'Correct First Attempt': 'correct',
    }
    raw = pd.read_csv(file_path, delimiter='\t').rename(columns=column_names)
    return raw[['user_id', 'pb_id', 'step_id', 'correct', 'timestamp', 'kc_id']]


def _extract_group(data, group_name):
    '''Return the rows of one group ('train' or 'test') as an independent frame sorted by timestamp.'''
    # .copy() detaches the subset from `data`, so later edits never hit a view.
    subset = data[data['group'] == group_name].copy()
    return subset.sort_values(by="timestamp").reset_index(drop=True)


def prepare_kddcup10(folder_name, course_name, train_file, test_file, kc_col_name, min_interactions_per_user, remove_nan_skills, verbose,
                    drop_duplicates=True):
    '''
    Pre-process a KDD Cup 2010 course: read the train and test files, clean the
    interactions, build the item-skill Q-matrix and save everything to
    <folder_name>/<course_name>/processed.

    Processing steps, in order:
    1. read both tab-separated files and tag each row with its group ('train'/'test')
    2. drop rows without a skill (or label them 'NaN' when remove_nan_skills is False)
    3. keep only rows whose outcome is 0 or 1
    4. drop users with fewer than min_interactions_per_user remaining rows
    5. encode items (problem name + ":" + step name) and users as integer ids
    6. build the Q-matrix; a skill cell holding several skills is split on '~~'
    7. parse timestamps, sort by time and (optionally) drop duplicated rows
    8. split into train and test sets and write all outputs to disk

    Arguments:
    folder_name -- path to the folder containing kdd files (algebra05, bridge_algebra06)
    course_name -- name of the course (sub-folder) for which pre-processing is executed
    train_file -- original train_file provided by KDD cup organizers
    test_file -- original test_file provided by KDD cup organizers
    kc_col_name -- name of the skills column in the raw files
    min_interactions_per_user -- minimum number of interactions per student
    remove_nan_skills -- if True, drop rows with no skill; otherwise label them 'NaN'
    verbose -- if True, print the number of samples read and removed at each step
    drop_duplicates -- if True, drop duplicates from dataset

    Outputs (a 6-tuple; six None values when folder_name does not exist):
    data -- preprocessed dataset (pandas DataFrame)
    Q_mat -- corresponding q-matrix (dense items x skills numpy array of 0/1)
    listOfKC -- sorted list of the individual skill names
    dict1_kc -- dictionary mapping each skill name to its column index in Q_mat
    train_set -- rows of data coming from train_file, sorted by timestamp
    test_set -- rows of data coming from test_file, sorted by timestamp
    '''
    if not os.path.exists(folder_name):
        print("The provided path for the data is invalid and the function will not be executed.")
        # Same arity as the normal return so that callers can always unpack six values.
        return None, None, None, None, None, None

    course_dir = os.path.join(folder_name, course_name)

    # Read the students' practice (attempt) history from the train and test
    # files provided by the KDD organizers, then concatenate them.
    df_train = _read_kdd_file(os.path.join(course_dir, train_file), kc_col_name)
    if verbose:
        print("Opened KDD Cup 2010 data. Output: {} samples.".format(df_train.shape[0]))
    df_test = _read_kdd_file(os.path.join(course_dir, test_file), kc_col_name)
    if verbose:
        print("Opened KDD Cup 2010 data. Output: {} samples.".format(df_test.shape[0]))
    data = pd.concat([df_train.assign(group='train'), df_test.assign(group='test')],
                     ignore_index=True)
    del df_train
    del df_test

    # Row count before the next cleaning step; every "Removed" message reports
    # (count before) - (count after) relative to the combined dataset.
    n_samples = data.shape[0]

    # Remove rows with an empty KC value, or give them an explicit 'NaN' label.
    if remove_nan_skills:
        data = data[data["kc_id"].notnull()]
        if verbose:
            print("Removed {} samples with NaN skills.".format(n_samples - data.shape[0]))
        n_samples = data.shape[0]
    else:
        data.loc[data["kc_id"].isnull(), "kc_id"] = 'NaN'

    data = data[data['correct'].isin([0, 1])].copy()  # Remove potential continuous outcomes
    if verbose:
        print("Removed {} samples with non-binary outcomes.".format(n_samples - data.shape[0]))
    n_samples = data.shape[0]
    data['correct'] = data['correct'].astype(np.int32)  # Cast outcome as int32

    data = data.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user)
    if verbose:
        print('Removed {} samples (users with less than {} interactions).'.format(
            n_samples - data.shape[0], min_interactions_per_user))
    n_samples = data.shape[0]

    # An item is a (problem, step) pair.
    data["item_id"] = data["pb_id"] + ":" + data["step_id"]

    # Transform ids into numeric (index into the sorted unique values)
    data["item_id"] = np.unique(data["item_id"], return_inverse=True)[1]
    data["user_id"] = np.unique(data["user_id"], return_inverse=True)[1]

    # Create the sorted list of individual KCs and their column index
    listOfKC = np.unique([kc for kc_raw in data["kc_id"].unique()
                          for kc in kc_raw.split('~~')])
    dict1_kc = {kc: idx for idx, kc in enumerate(listOfKC)}

    # Build Q-matrix. Only distinct (item, skills) pairs are visited: repeated
    # interactions with the same item would set the same cells again.
    Q_mat = np.zeros((data["item_id"].nunique(), len(listOfKC)))
    item_skill_pairs = data[["item_id", "kc_id"]].drop_duplicates()
    for item_idx, kc_raw in item_skill_pairs.itertuples(index=False, name=None):
        for kc in kc_raw.split('~~'):
            Q_mat[item_idx, dict1_kc[kc]] = 1
    if verbose:
        print("Computed q-matrix. Shape: {}.".format(Q_mat.shape))

    data = data[['user_id', 'item_id', 'timestamp', 'correct', 'kc_id', 'group']].copy()

    # Timestamps must be converted to datetime before sorting chronologically.
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data = data.sort_values(by="timestamp").reset_index(drop=True)

    # Remove potential duplicates
    if drop_duplicates:
        data = data.drop_duplicates(subset=['user_id', 'item_id', 'timestamp', 'correct', 'kc_id'])
        data = data.reset_index(drop=True)
        if verbose:
            print("Removed {} duplicated samples.".format(n_samples - data.shape[0]))

    train_set = _extract_group(data, 'train')
    test_set = _extract_group(data, 'test')
    print("Data preprocessing done. Final output: {} samples.".format((data.shape[0])))

    # Save data
    out_dir = os.path.join(course_dir, "processed")
    os.makedirs(out_dir, exist_ok=True)
    sparse.save_npz(os.path.join(out_dir, "q_mat.npz"), sparse.csr_matrix(Q_mat))
    data.to_csv(os.path.join(out_dir, "preprocessed_data.csv"), index=False)
    listOfKC = list(listOfKC)
    # Save train-test data
    train_set.to_csv(os.path.join(out_dir, "train_set.csv"), encoding='utf-8', index=False)
    test_set.to_csv(os.path.join(out_dir, "test_set.csv"), encoding='utf-8', index=False)

    with open(os.path.join(out_dir, 'dict_of_kc.json'), 'w') as fp:
        json.dump(dict1_kc, fp)

    return data, Q_mat, listOfKC, dict1_kc, train_set, test_set

In [None]:
# pre_processed_data, q_mat, listOfKC, dict_of_kc, train_set, test_set = prepare_kddcup10('data/kdd', 'bridge_algebra06', \
#                                                                    'bridge_to_algebra_2006_2007_train.txt', \
#                                                                    'bridge_to_algebra_2006_2007_master.txt',\
#                                                                    'KC(SubSkills)', 5, True, False, True)
