In [1]:
import json
import pickle
import keyword
import ast
from datetime import datetime, timedelta
import warnings


import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.cluster import MeanShift


from manage import jsonAttempts2data, jsonExercises2data
from code2aes import Code2Aes
from aes2vec import learnModel, inferVectors, read_corpus, data2cor


  from pandas.core import (


In [2]:
# Data importations
NC1014 = jsonAttempts2data('Datasets/NewCaledonia_1014.json')
NCExercises = jsonExercises2data('Datasets/NewCaledonia_exercises.json')
NC5690 = jsonAttempts2data('Datasets/NewCaledonia_5690.json')

In [3]:
# Dictionary that maps the encoded exercise to its real name
exercice_name_dict = {}
list_exo = list(NCExercises.values())
for exo in list_exo:
    # key : encoded name, value : real name of exercise
    exercice_name_dict[exo["exo_name"]] = exo["funcname"] 
list_exo = list(set(exercice_name_dict.values()))

In [4]:
# Import the attemps' embedding
embedding_data = list(np.load('Datasets/results.npy'))
embedding_data_correction = list(np.load('Datasets/results_corr.npy'))

In [5]:
# Load data trajectories

with open('Datasets\data_visualisation.pkl', 'rb') as fichier:
    data_visualisation = pickle.load(fichier)
trajec_code = data_visualisation[4]
trajec = data_visualisation[0]

In [6]:
def get_source_code(data, encoded = True):
 
    """
    data : list of attemps
    encoded : if exo name are encoded
    Return a dictionary where the keys represent each student.
    For each student, the corresponding value is a dictionary where:
        - The keys are exercises
        - The values are lists of attempts for each exercise
    """
    codes = {}
    for attemps in data:
        user = attemps["user"]
        exercise = attemps["exercise_name"]
        if encoded:
            exercise = exercice_name_dict[exercise]
        if user not in codes:
            codes[user] = {}
        if exercise not in codes[user]:
            codes[user][exercise] = []
        codes[user][exercise].append(attemps["upload"])
    return codes
        

In [7]:
codes = get_source_code(NC5690)

In [8]:
def MeanShift_(data):
    """
    Return : list of labels for each attemp
    """
    mean_shift = MeanShift() 
    clusters = mean_shift.fit_predict(data)
    return clusters

In [9]:
def clustering(trajec, exercise):
    """
    Return : list of labels for each attemp
    """
    embbedings = trajec[exercise]
    cluster = MeanShift_(embbedings)
    return cluster

In [10]:
want_clustering = False # Put at True if you want to do your own clustering
if want_clustering:
    warnings.filterwarnings('ignore')
    cluster_exercise = {}
    used = []
    for i, correction in tqdm(enumerate(NCExercises)):
        exercise = exercice_name_dict[correction]
        if exercise not in used:
            used.append(exercise)
            cluster = clustering(trajec, exercise)
            cluster_exercise[exercise] = cluster
    with open('Datasets/cluster_meanshift.pkl', 'wb') as f:
        pickle.dump(cluster_exercise, f)
    
else:
    with open('Datasets/cluster_meanshift.pkl', 'rb') as f:
        cluster_exercise = pickle.load(f) 

In [11]:
def find_name_node(node):
    """
    Get the name of the node in an AST
    """
    name = node.__class__.__name__.lower()
    if node == "range":
        return node
    else:
        return name

In [12]:
class NodeVisitor(ast.NodeVisitor):
    def __init__(self):
        self.keywords = []
        self.operations = []

    def visit(self, node):
        if isinstance(node, ast.stmt):
            self.keywords.append(node)
            self.operations.append(self.get_operation(node))
        self.generic_visit(node)

    def get_operation(self, node):
        if isinstance(node, ast.If):
            if isinstance(node.test, ast.Compare):
                op = node.test.ops[0]
                return op
        if isinstance(node, ast.For):
            if isinstance(node.iter, ast.Call) and isinstance(node.iter.func, ast.Name) and node.iter.func.id == 'range':
                return "range"
            elif isinstance(node.iter, ast.Str):
                return "str"
            else:
                return node.iter
        elif isinstance(node,ast.While):
            return node.test
        elif isinstance(node, ast.Assign):
            if node.targets:
                target = node.targets[0]
                if hasattr(node, "value"):
                    return node.value
        else:
            return None
def get_ast_keywords_and_operations(code):
    tree = ast.parse(code)
    visitor = NodeVisitor()
    visitor.visit(tree)
    return visitor.keywords, visitor.operations

print("Let's see an example of how our function work : ")
print("\n")
keywords, operations = get_ast_keywords_and_operations(codes["userdId_15"]["nbSyllabes"][-3])

for idx, keyword in enumerate(keywords):
    print(f"Keyword node : {find_name_node(keyword)}")
    print(f"Associated operation : {find_name_node(operations[idx])}")
    print()


Let's see an example of how our function work : 


Keyword node : functiondef
Associated operation : nonetype

Keyword node : if
Associated operation : eq

Keyword node : assign
Associated operation : constant

Keyword node : assign
Associated operation : constant

Keyword node : assign
Associated operation : list

Keyword node : assign
Associated operation : constant

Keyword node : for
Associated operation : range

Keyword node : if
Associated operation : nonetype

Keyword node : assign
Associated operation : binop

Keyword node : assign
Associated operation : subscript

Keyword node : if
Associated operation : in

Keyword node : assign
Associated operation : binop

Keyword node : if
Associated operation : notin

Keyword node : assign
Associated operation : binop

Keyword node : return
Associated operation : nonetype



In [13]:
def get_attemps_by_cluster(trajec_code, labels_emb):
    """
    codes : dictionary
    labels_emb : dictionary
    return : dictionary. Each key corresponds to an exercise.
    Each exercise is a dictionary indicating which cluster it belongs to.
    """
    attemps_by_cluster = {}
    for exercise in labels_emb:
        attemps_by_cluster[exercise] = {}
        attemps = trajec_code[exercise]
        labels = labels_emb[exercise]
        if len(labels) != len(attemps):
            print(len(labels),len(attemps))
        for i, attemp in enumerate(attemps):
            label = labels[i]
            if label not in attemps_by_cluster[exercise]:
                attemps_by_cluster[exercise][label] = []
            attemps_by_cluster[exercise][label].append(attemp)
    return attemps_by_cluster

In [14]:
attemps_by_cluster = get_attemps_by_cluster(trajec_code, cluster_exercise)

In [15]:
def get_keyword_operation(attemps_by_cluster):
    """
    attemps_by_cluster : dictionnary
    Return : dictionnary of dictionnary for each exercise
    Each dictonnary of exercise got as keys the cluster where the attemps belong and
    as values the number of time the pair keyword operation is in the cluster
    """
    keyword_operation = {}
    for exercise in attemps_by_cluster:
        keyword_operation[exercise] = {}
        for cluster in attemps_by_cluster[exercise]:
            if cluster not in keyword_operation[exercise]:
                keyword_operation[exercise][cluster] = {}
            attemps = attemps_by_cluster[exercise][cluster]
            for attemp in attemps:   
                keywords, operations = get_ast_keywords_and_operations(attemp)
                for keyword, operation in zip(keywords,operations):
                    keyword, operation = find_name_node(keyword), find_name_node(operation)
                    if (keyword, operation) not in keyword_operation[exercise][cluster]:
                        keyword_operation[exercise][cluster][(keyword,operation)] = 0
                    keyword_operation[exercise][cluster][(keyword,operation)] += 1
    return keyword_operation

In [16]:
keyword_operation = get_keyword_operation(attemps_by_cluster)



In [17]:
def get_keyword_operation_exercise(keyword_operation):
    """
    return a dictionnary of list of all the pair keyword operation that exist for each exercise
    without consideration of the cluster
    """
    keyword_ope = {}
    for exercise in keyword_operation:
        keyword_ope[exercise] = []
        for cluster in keyword_operation[exercise]:
            keyword_ope[exercise].append(keyword_operation[exercise][cluster])
    return keyword_ope

In [18]:
keyword_ope = get_keyword_operation_exercise(keyword_operation)

In [19]:
def compare_keywords(keywords_operation1, keywords_operation2):
    """
    Methode used to compare two sets of pair keyword operation
    They compare the difference between these two sets
    """
    set_keywords1, set_keywords2 = set(keywords_operation1), set(keywords_operation2.keys())
    set_compare = set_keywords1 - set_keywords2
    return set_compare

In [20]:
def caracterisation_cluster_(keyword_ope):
    """
    - keyword_ope : dictionnary
    Return a dictionnary with exercise as keys
    For each exercise, we got a set of what pair define the cluster
    """
    caracterisation_cluster = {}
    for exercise in keyword_ope:
        caracterisation_cluster[exercise] = []
        for j, key_ope in enumerate(keyword_ope[exercise]):
            unique_key_ope = set(key_ope.keys())
            for i in range(j+1,len(keyword_ope[exercise])):
                set_key_ope = compare_keywords(unique_key_ope, keyword_ope[exercise][i])
            caracterisation_cluster[exercise].append(set_key_ope)
    return caracterisation_cluster

In [21]:
caracterisation_cluster =  caracterisation_cluster_(keyword_ope)

Example for an exericise : "nbSyllabes"

In [22]:
for i, set_clus in enumerate(caracterisation_cluster["nbSyllabes"]):
    print(f"For cluster {i}, the keywords/operations that characterize it the most:")
    print(set_clus)
    print("\n")

For cluster 0, the keywords/operations that characterize it the most:
{('augassign', 'nonetype'), ('for', 'range')}


For cluster 1, the keywords/operations that characterize it the most:
{('assign', 'subscript'), ('assign', 'list'), ('if', 'notin'), ('for', 'range')}


For cluster 2, the keywords/operations that characterize it the most:
{('augassign', 'nonetype'), ('if', 'notin'), ('for', 'range')}


For cluster 3, the keywords/operations that characterize it the most:
{('augassign', 'nonetype'), ('if', 'notin'), ('for', 'range')}


For cluster 4, the keywords/operations that characterize it the most:
{('augassign', 'nonetype'), ('if', 'notin'), ('for', 'range')}




# Caracterisation of small trajectory
For the n-th attemp of an student in a exercise, a small trajectory is define by the change between the n-th and n+1-th attemp

In [23]:
def get_set_keyop(keywords1, operations1, keywords2, operations2):
        key_op1,key_op2 = [], []
        for idx, keyword in enumerate(keywords1):
            keyword = find_name_node(keyword)
            operation = find_name_node(operations1[idx])
            key_op1.append((keyword,operation))
        for idx, keyword in enumerate(keywords2):
            keyword = find_name_node(keyword)
            operation = find_name_node(operations2[idx])
            key_op2.append((keyword,operation))
        key_op1, key_op2 = set(key_op1), set(key_op2)
        return key_op1, key_op2
    
def caracterisation(keywords1, operations1, keywords2, operations2):
        key_op1, key_op2 = get_set_keyop(keywords1, operations1, keywords2, operations2)
        carac_traj_rajout = key_op2 - key_op1
        carac_traj_suppr = key_op1 - key_op2
        carac_traj = {"Add " : carac_traj_rajout, "Delete" : carac_traj_suppr}
        return carac_traj



def caracterise_small_trajectory(student,exercise, codes):
    for i in range(len(codes[student][exercise])-1):
        keywords1, operations1 = get_ast_keywords_and_operations(codes[student][exercise][i])
        keywords2, operations2 = get_ast_keywords_and_operations(codes[student][exercise][i+1])
        print(f"Trajectory number {i}")
        print(caracterisation(keywords1, operations1, keywords2, operations2))
        print()
    

For example, student 15 and exercise nbSyllabes : 

In [24]:
caracterise_small_trajectory("userdId_15","nbSyllabes",codes)

Trajectory number 0
{'Add ': {('if', 'notin')}, 'Delete': set()}

Trajectory number 1
{'Add ': set(), 'Delete': {('if', 'notin')}}

Trajectory number 2
{'Add ': {('if', 'notin')}, 'Delete': {('assign', 'subscript')}}

Trajectory number 3
{'Add ': {('assign', 'subscript')}, 'Delete': {('if', 'notin')}}

Trajectory number 4
{'Add ': {('if', 'notin')}, 'Delete': set()}

Trajectory number 5
{'Add ': set(), 'Delete': set()}

Trajectory number 6
{'Add ': set(), 'Delete': set()}

Trajectory number 7
{'Add ': set(), 'Delete': {('if', 'eq'), ('if', 'notin'), ('assign', 'subscript'), ('if', 'in'), ('assign', 'list')}}

Trajectory number 8
{'Add ': {('if', 'eq'), ('if', 'notin'), ('assign', 'subscript'), ('if', 'in'), ('assign', 'list')}, 'Delete': set()}

Trajectory number 9
{'Add ': set(), 'Delete': set()}

Trajectory number 10
{'Add ': set(), 'Delete': {('assign', 'subscript'), ('if', 'eq'), ('if', 'in'), ('assign', 'list')}}

Trajectory number 11
{'Add ': set(), 'Delete': set()}

