In [1]:
import pickle
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap
import pacmap
import Levenshtein

from manage import jsonAttempts2data, jsonExercises2data
from code2aes import Code2Aes
from aes2vec import learnModel, inferVectors, read_corpus, data2cor


  from pandas.core import (





In this notebook, we compare how well PCA, t-SNE, and PaCMAP preserve the structure of the embeddings when reducing them to two dimensions.
To do this, we compute the similarity (squared cosine) between pairs of points in the original embedding, compute the same similarities in each reduced embedding, and sum the absolute differences. A lower score means the reduced representation is closer to the original one.

First, let's import some data (trajectories...)

In [2]:
# Import the data: two attempt files and one exercise file, parsed by the
# project helpers from `manage`.
# NOTE(review): the structure of the returned objects is defined in `manage`
# and is not visible here.
NC1014 = jsonAttempts2data('Datasets/NewCaledonia_1014.json')
NCExercises = jsonExercises2data('Datasets/NewCaledonia_exercises.json')
NC5690 = jsonAttempts2data('Datasets/NewCaledonia_5690.json')

In [3]:
# Load the precomputed visualisation data.
# The path uses a forward slash like the other dataset paths: the previous
# 'Datasets\data_visualisation.pkl' contained the invalid escape sequence '\d'
# and only resolved on Windows.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('Datasets/data_visualisation.pkl', 'rb') as fichier:
    data_visualisation = pickle.load(fichier)
# Index 0 holds the trajectory embeddings used below (one entry per exercise).
trajec_emb = data_visualisation[0]

{'semestreValide': ['def semestreValide(ue1,ue2):\n    if ue1>10:\n      if ue2<10:\n        if (ue1+ue2)/2>10:\n          res=True\n        else:\n          res=False\n      else:\n        res=True\n    else:\n      res=False\n    return res',
  'def semestreValide(ue1,ue2):\n    if ue1>=10:\n      if ue2<10:\n        if (ue1+ue2)/2>=10:\n          res=True\n        else:\n          res=False\n      else:\n        res=True\n    else:\n      res=False\n    return res',
  'def semestreValide(ue1,ue2):\n    res = False\n    if ue1 >= 10 and ue2 >= 10:\n        res = True\n    elif (ue1 > 10 and ue2 < 10) and (ue1+ue2)/2 >= 10:\n        res = True\n    \n\n    return res',
  'def semestreValide(ue1,ue2):\n  if ue1>10 :\n    a=ue1+ue2\n    if a>=20 :\n      res=True\n  else :\n    res=False\n  return res',
  'def semestreValide(ue1,ue2):\n  if ue1>=10 :\n    a=ue1+ue2\n    if a>=20 :\n      res=True\n  else :\n    res=False\n  return res',
  'def semestreValide(ue1,ue2):\n  if ue1 < 10 :\n

In [4]:
# similarity measure 
def cos2(x,y):
    """
    Squared cosine similarity between two vectors.

    Return : float between 0 and 1
    1 means x and y point along the same line (same or opposite direction),
    0 means they are orthogonal.
    Note: if either vector has a zero norm, the division below is by zero and
    the result is not a meaningful similarity.
    """
    dot_product = np.dot(x, y)
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    cosine = dot_product / norm_product
    return cosine ** 2

In [5]:
def distance_embedding(trajec):
    """
    Compute the cos² similarity of every unordered pair of points, per exercise.

    trajec: dictionary mapping each exercise to an indexable sequence of points.
    Return: dictionary where keys are exercises and values are lists of lists
    of similarities. The i-th list holds the similarity of point i with each
    point that comes after it (j > i), so each pair is computed once and the
    last point has no list of its own.
    """
    pairwise_scores = {}
    for exercise in trajec:
        points = trajec[exercise]
        n_points = len(points)
        # Upper triangle only: row i covers points i+1 .. n_points-1.
        pairwise_scores[exercise] = [
            [cos2(points[i], points[j]) for j in range(i + 1, n_points)]
            for i in range(n_points - 1)
        ]
    return pairwise_scores

Let's reduce the embeddings to two dimensions with t-SNE, PCA, and PaCMAP.

In [6]:
def get_reduced_data(trajec_emb):
    """
    Project the embeddings of every exercise to 2 dimensions with t-SNE, PCA
    and PaCMAP.

    trajec_emb: dictionary mapping each exercise to its list of embeddings
    (converted to a single array with np.array).
    Return: dictionary {"t_sne": {...}, "PCA": {...}, "pacmap": {...}} where
    each inner dictionary maps an exercise to its reduced 2-D coordinates.
    """
    trajectory_reduced = {"t_sne" : {}, "PCA" : {}, "pacmap" : {}}
    # Silence library warnings only while the reducers run. catch_warnings
    # restores the previous filters on exit, so the rest of the notebook keeps
    # its normal warning behaviour (a bare filterwarnings('ignore') would
    # change it for the whole session).
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for exo in tqdm(trajec_emb.keys()):
            list_emb = trajec_emb[exo]
            data_array = np.array(list_emb)
            n_samples = data_array.shape[0]
            # Cap at 30 and keep the value below the number of samples;
            # it is also reused as the neighbour count for PaCMAP.
            perplexity = min(n_samples - 1, 30)
            # Reduced data with TSNE
            embedding_TSNE = TSNE(n_components=2, perplexity=perplexity, random_state=42)
            X_TSNE = embedding_TSNE.fit_transform(data_array)
            trajectory_reduced["t_sne"][exo] = X_TSNE
            # Reduced data with PCA
            embedding_PCA = PCA(n_components=2, random_state=42)
            X_PCA = embedding_PCA.fit_transform(data_array)
            trajectory_reduced["PCA"][exo] = X_PCA
            # Reduced data with pacmap
            embedding_pacmap = pacmap.PaCMAP(n_components=2, n_neighbors=perplexity, random_state=42)
            X_pacmap = embedding_pacmap.fit_transform(data_array)
            trajectory_reduced["pacmap"][exo] = X_pacmap
    return trajectory_reduced

In [7]:
def score(trajectory_reduced):
    """
    Compute the pairwise cos² similarities of the reduced points, per algorithm.

    trajectory_reduced: dictionary mapping each algorithm name to a dictionary
    {exercise: reduced points}.
    Return: dictionary mapping each algorithm name to the result of
    distance_embedding on its reduced points.
    """
    return {
        algo: distance_embedding(trajectory_reduced[algo])
        for algo in tqdm(trajectory_reduced)
    }

In [8]:
def compare_visu(score_method, score_embbeding):
    """
    Measure how far each algorithm's similarities are from the original ones.

    score_method: dictionary {algo: {exercise: list of lists of similarities}}
    computed on the reduced points.
    score_embbeding: dictionary {exercise: list of lists of similarities}
    computed on the original embeddings.
    Return: dictionary {algo: {exercise: sum of absolute differences}}; a lower
    value means the reduced similarities are closer to the original ones.
    """
    gaps = {}
    for algo, algo_scores in score_method.items():
        per_exercise = {}
        for exercise, reference_rows in score_embbeding.items():
            reduced_rows = algo_scores[exercise]
            total = 0
            for row_index, reference_row in enumerate(reference_rows):
                difference = np.array(reference_row) - np.array(reduced_rows[row_index])
                total += sum(abs(difference))
            per_exercise[exercise] = total
        gaps[algo] = per_exercise
    return gaps

In [9]:
# Pairwise cos² similarities in the original embedding space (the reference).
score_emb = distance_embedding(trajec_emb)
# 2-D projections of every exercise with t-SNE, PCA and PaCMAP.
trajectory_reduced = get_reduced_data(trajec_emb)
# Pairwise cos² similarities in each reduced space.
score_algo = score(trajectory_reduced)
# Per algorithm and exercise: summed absolute gap to the reference similarities.
score_compare = compare_visu(score_algo, score_emb)

  0%|          | 0/56 [00:00<?, ?it/s]

  File "C:\Users\Stagiaire\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Report, for each algorithm, the mean over exercises of its gap score.
# The loop variables deliberately avoid the name `score`: rebinding it here
# would overwrite the `score` function and break a re-run of the cell that
# calls it.
for algo in score_compare:
    exercise_scores = score_compare[algo]
    mean_score = np.mean(list(exercise_scores.values()))
    print(f"{algo} got a mean score of {mean_score}")

t_sne got a mean score of 3299.2605556479734
PCA got a mean score of 3130.7491527263014
pacmap got a mean score of 3458.157161818885


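Let's now compare t-SNE and PCA against the source code: for each attempt, we rank all attempts by Levenshtein distance between their source codes, and check how well the ranking by distance in each 2-D projection matches it.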

In [11]:
def euclidean_distance(x, y):
    """
    Euclidean distance between two 2-D points.

    Only the first two coordinates of x and y are used.
    """
    delta_first = x[0] - y[0]
    delta_second = x[1] - y[1]
    return np.sqrt(delta_first ** 2 + delta_second ** 2)

In [12]:
def closest_point(trajectory_reduced):
    """
    Compute, per exercise, the distance from every reduced point to every
    point (itself included) in the PCA and in the t-SNE projections.

    trajectory_reduced: dictionary with at least the keys "PCA" and "t_sne",
    each mapping an exercise to its reduced coordinates.
    Return: (pca_distances, tsne_distances), two dictionaries of the form
    {exercise: {point index: list of distances to points 0..n-1}}.
    """
    pca_distances = {}
    tsne_distances = {}

    for exo in tqdm(trajectory_reduced["PCA"]):
        tsne_points = trajectory_reduced["t_sne"][exo]
        pca_points = trajectory_reduced["PCA"][exo]
        # The number of points is taken from the t-SNE projection.
        n_points = len(tsne_points)

        pca_rows = {}
        tsne_rows = {}
        for i in range(n_points):
            pca_rows[i] = [
                euclidean_distance(pca_points[i], pca_points[j])
                for j in range(n_points)
            ]
            tsne_rows[i] = [
                euclidean_distance(tsne_points[i], tsne_points[j])
                for j in range(n_points)
            ]
        pca_distances[exo] = pca_rows
        tsne_distances[exo] = tsne_rows
    return pca_distances, tsne_distances

In [13]:
def closet_point_lev(trajectory_reduced, data_visualisation):
    """
    Compute, per exercise, the Levenshtein distance from every source code to
    every source code (itself included).

    trajectory_reduced: only the exercise keys of trajectory_reduced["PCA"]
    are used, to select which exercises to process.
    data_visualisation: sequence whose element 4 maps each exercise to its
    list of source codes.
    Return: dictionary {exercise: {code index: list of distances to codes 0..n-1}}.
    """
    source_code = data_visualisation[4]
    lev_distances = {}
    for exo in tqdm(trajectory_reduced["PCA"]):
        codes = source_code[exo]
        lev_distances[exo] = {
            i: [Levenshtein.distance(code, other_code) for other_code in codes]
            for i, code in enumerate(codes)
        }
    return lev_distances

In [14]:
def define_closest_point(closest_point):
    """
    Turn lists of distances into neighbour rankings.

    closest_point: dictionary {exercise: {point index: list of distances}},
    where position k of a list is the distance to point k.
    Return: dictionary with the same keys, where each list of distances is
    replaced by the list of point indices sorted from closest to farthest.
    Ties keep their original index order (the sort is stable). An empty list
    of distances gives an empty ranking.
    """
    closest_point_index = {}
    for exo, distances_by_point in closest_point.items():
        closest_point_index[exo] = {
            # Sort the indices directly by their distance; this avoids the
            # zip/unzip round trip, which fails on an empty list.
            index_tentative: sorted(
                range(len(list_distance)),
                key=lambda k, distances=list_distance: distances[k],
            )
            for index_tentative, list_distance in distances_by_point.items()
        }
    return closest_point_index

In [15]:
def count_permutations(reference_list, target_list):
    """
    Count the swaps needed to reorder target_list into reference_list.

    The elements are followed along the cycles of the permutation that maps
    each position of target_list to the position of the same value in
    reference_list; a cycle of n elements contributes n - 1 swaps.

    reference_list: list giving the expected order of the values.
    target_list: list of the same values in another order.
    Return: int, total number of swaps over all cycles.
    """
    position_in_reference = {}
    for position, value in enumerate(reference_list):
        position_in_reference[value] = position

    size = len(target_list)
    seen = [False] * size
    total_swaps = 0

    for start in range(size):
        if seen[start]:
            continue
        if position_in_reference[target_list[start]] == start:
            # Already at its reference position: nothing to swap
            continue

        # Walk the cycle that starts at this position
        cycle_length = 0
        cursor = start
        while not seen[cursor]:
            seen[cursor] = True
            cursor = position_in_reference[target_list[cursor]]
            cycle_length += 1

        # The walk visits at least the start position, so cycle_length >= 1
        total_swaps += cycle_length - 1

    return total_swaps

In [16]:
def permuation_algo(closest_point_pca, closest_point_tsne, closest_point_lev):
    """
    Sum, per exercise, the swap counts between the Levenshtein ranking and the
    PCA / t-SNE rankings of every point.

    closest_point_pca, closest_point_tsne, closest_point_lev: dictionaries
    {exercise: {point index: ranking list}}; the Levenshtein ranking is the
    reference passed to count_permutations.
    Return: (permu_tsne, permu_pca), two dictionaries {exercise: total swaps}.
    Note the order of the returned pair: t-SNE first, then PCA.
    """
    totals_tsne = {}
    totals_pca = {}
    for exo in closest_point_lev:
        swaps_tsne = 0
        swaps_pca = 0
        for i in closest_point_pca[exo]:
            lev_ranking = closest_point_lev[exo][i]
            tsne_ranking = closest_point_tsne[exo][i]
            pca_ranking = closest_point_pca[exo][i]
            swaps_pca += count_permutations(lev_ranking, pca_ranking)
            swaps_tsne += count_permutations(lev_ranking, tsne_ranking)
        totals_tsne[exo] = swaps_tsne
        totals_pca[exo] = swaps_pca
    return totals_tsne, totals_pca

In [17]:
# Step 1: raw distances from every point to every point, in the PCA and t-SNE
# projections and between source codes (Levenshtein).
closest_point_pca, closest_point_tsne = closest_point(trajectory_reduced)
closest_point_lev = closet_point_lev(trajectory_reduced, data_visualisation)

# Step 2: the same names are rebound from distance lists to neighbour rankings
# (point indices sorted by increasing distance).
closest_point_pca = define_closest_point(closest_point_pca)
closest_point_tsne = define_closest_point(closest_point_tsne)
closest_point_lev = define_closest_point(closest_point_lev)


# Step 3: per exercise, total swap count between the Levenshtein ranking and
# each projection's ranking. The function returns t-SNE first, then PCA.
permu_tsne, permu_pca = permuation_algo(closest_point_pca, closest_point_tsne, closest_point_lev)

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

In [18]:
# Average over exercises of the per-exercise swap totals (t-SNE, then PCA).
mean_value_t = np.mean(list(permu_tsne.values()))
mean_value_a = np.mean(list(permu_pca.values()))
print(f"On average, we got {mean_value_t} permutations for tsne agaisnt {mean_value_a} for pca")

On average, we got 15949.05357142857 permutations for tsne agaisnt 15974.25 for pca
