Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.7.2
current_version = 0.7.4
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion .cookiecutterrc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ default_context:
sphinx_theme: "furo"
test_matrix_separate_coverage: "no"
tests_inside_package: "no"
version: "0.7.2"
version: "0.7.4"
version_manager: "bump2version"
website: ""
year_from: "2024"
Expand Down
8 changes: 0 additions & 8 deletions .github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,3 @@ jobs:
with:
project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
coverage-reports: coverage.xml
finish:
needs: lib_test
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
- uses: coverallsapp/github-action@v2
with:
parallel-finished: true
24 changes: 13 additions & 11 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
|SyML-Logo|

.. |SyML-Logo| image:: https://github.com/KillianVar/python-syml/raw/dev/docs/source/img/banner.png

========
Overview
========
=========================
Repo Health Information
=========================

.. start-badges

Expand All @@ -25,11 +21,11 @@ Overview
:alt: GitHub Actions Build Status
:target: https://github.com/KillianVar/python-syml/actions

.. |coverage| image:: https://app.codacy.com/project/badge/Coverage/8d0cc71c89524ea1ab77e9724ab74df9
.. |coverage| image:: https://app.codacy.com/project/badge/Coverage/011ba7fa745b4a3083ea714909699110
:alt: Coverage Build Status
:target: https://app.codacy.com/gh/KillianVar/python-syml/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage

.. |codacy| image:: https://img.shields.io/codacy/grade/c67d6aeb590745b5832fd8e5d3d7717c.svg
.. |codacy| image:: https://app.codacy.com/project/badge/Grade/011ba7fa745b4a3083ea714909699110
:target: https://app.codacy.com/gh/KillianVar/python-syml/dashboard
:alt: Codacy Code Quality Status

Expand All @@ -49,14 +45,20 @@ Overview
:alt: Supported implementations
:target: https://pypi.org/project/syml

.. |commits-since| image:: https://img.shields.io/github/commits-since/KillianVar/python-syml/v0.7.2.svg
.. |commits-since| image:: https://img.shields.io/github/commits-since/KillianVar/python-syml/v0.7.4.svg
:alt: Commits since latest release
:target: https://github.com/KillianVar/python-syml/compare/v0.7.2...main
:target: https://github.com/KillianVar/python-syml/compare/v0.7.4...main



.. end-badges

|SyML-Logo|

.. |SyML-Logo| image:: https://github.com/KillianVar/python-syml/raw/dev/docs/source/img/banner.png



SyML (Systematic Machine Learning) is a library built to make Machine Learning simpler, by using SOTA ML, xAI and
visualization methods.

Expand Down
10 changes: 1 addition & 9 deletions ci/templates/.github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,4 @@ jobs:
env:
TOXPYTHON: '{{ '${{ matrix.toxpython }}' }}'
run: >
tox -e {{ '${{ matrix.tox_env }}' }} -v
finish:
needs: test
if: {{ '${{ always() }}' }}
runs-on: ubuntu-latest
steps:
- uses: coverallsapp/github-action@v2
with:
parallel-finished: true
tox -e {{ '${{ matrix.tox_env }}' }} -v
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
year = "2024"
author = "Killian Varescon"
copyright = f"{year}, {author}"
version = release = "0.7.2"
version = release = "0.7.4"

pygments_style = "trac"
templates_path = ["."]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ authors = [
readme = "README.rst"
license = {file = "LICENSE"}
description = "SyML (Systematic Machine Learning) is a library built to make Machine Learning simpler, by using SOTA ML, xAI and vizualisation methods."
version = "0.7.2"
version = "0.7.4"
requires-python = ">=3.8"
classifiers = [
# complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def read(*names, **kwargs):
re.compile("^.. start-badges.*^.. end-badges", re.M | re.S).sub("", read("README.rst")),
re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", read("CHANGELOG.rst")),
),
url="https://github.com/KillianVar/python-syml",
url="https://github.com/python-SyML/python-syml",
packages=find_packages("src"),
package_dir={"": "src"},
py_modules=[path.stem for path in Path("src").glob("*.py")],
Expand Down
2 changes: 1 addition & 1 deletion src/syml/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.7.2"
__version__ = "0.7.4"

from syml.interract.discovery_dashboard.dashboard import Dashboard

Expand Down
21 changes: 21 additions & 0 deletions src/syml/diagnostool/semantic/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from sklearn.cluster import AgglomerativeClustering


class Clustering:
    """Group label embeddings with agglomerative (hierarchical) clustering."""

    def __init__(self, embeddings, labels):
        # embeddings: one vector per label; labels: the human-readable strings.
        self.embeddings = embeddings
        self.labels = labels

    def init_clustering(self, n_clusters=None, distance_threshold=1.0):
        """Build the sklearn model.

        Pass ``n_clusters=None`` to let ``distance_threshold`` decide how many
        clusters are formed.
        """
        self.clustering_model = AgglomerativeClustering(n_clusters=n_clusters, distance_threshold=distance_threshold)

    def find_cluster(self):
        """Fit the model and return ``{"cluster <id>": [labels...]}``.

        Also stores the raw cluster ids in ``self.labels_cluster`` and the
        grouping in ``self.clustered_sentences``.
        """
        self.clustering_model.fit(self.embeddings)
        self.labels_cluster = self.clustering_model.labels_

        # BUG FIX: the previous membership test compared the raw integer
        # cluster id against keys of the form "cluster <id>", so it was always
        # True, the list was re-created on every iteration, and only the last
        # label per cluster survived.  The result was also never returned.
        clustered_sentences = {}
        for sentence_id, cluster_id in enumerate(self.labels_cluster):
            clustered_sentences.setdefault(f"cluster {cluster_id}", []).append(self.labels[sentence_id])

        self.clustered_sentences = clustered_sentences
        return clustered_sentences
90 changes: 0 additions & 90 deletions src/syml/diagnostool/semantic/label_clustering.py

This file was deleted.

59 changes: 59 additions & 0 deletions src/syml/diagnostool/semantic/label_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from pathlib import Path

import numpy as np
import torch as pt
from sentence_transformers import SentenceTransformer

from .clustering import Clustering
from .utils import get_device


class LabelEmbedder:
    """Embed a list of text labels with a sentence-transformers model.

    Embeddings are cached on disk at ``path`` — a format string containing a
    ``{field_name}`` placeholder — and reused when the cached tensor has one
    row per label.  Reassigning ``labels`` recomputes embeddings, the
    similarity matrix, and the clustering helper.
    """

    def __init__(
        self,
        labels=None,
        embedding_model="all-MiniLM-L6-v2",
        path="",
        field_name="",
    ):
        self._labels = labels
        self.field_name = field_name
        self.path = path
        self.device = get_device()
        self.embedding_model = self.init_model(embedding_model, device=self.device)
        self.embeddings = self.embed(labels=self.labels)
        # Pairwise similarity matrix between all label embeddings.
        self._similarities = self.embedding_model.similarity(self.embeddings, self.embeddings)
        self.clustering = Clustering(self.embeddings, self.labels)

    def init_model(self, embedding_model, device):
        """Load the SentenceTransformer identified by ``embedding_model``."""
        return SentenceTransformer(embedding_model, device=device)

    @property
    def labels(self):
        return self._labels

    @property
    def similarities(self):
        return self._similarities

    @labels.setter
    def labels(self, labels):
        """Replace the labels and recompute embeddings/similarities/clustering."""
        self._labels = labels
        # BUG FIX: invalidate the in-memory cache before re-embedding.
        # ``embed`` compares its argument against ``self.labels`` (already
        # updated above), so without this it always took the cached branch and
        # returned the stale embeddings of the *old* labels.
        self.embeddings = None
        self.embeddings = self.embed(labels=self.labels)
        self._similarities = self.embedding_model.similarity(self.embeddings, self.embeddings)
        self.clustering = Clustering(self.embeddings, self.labels)

    def embed(self, labels=None):
        """Return embeddings for ``labels`` (default: ``self.labels``).

        Loads the on-disk cache when it exists and has one row per label;
        otherwise encodes the labels and (re)writes the cache.
        NOTE(review): the row-count check cannot detect a cache written for
        *different* labels of the same length — confirm cache files are kept
        strictly per field as intended.
        """
        path = Path(self.path.format(field_name=self.field_name))
        if labels is None or np.all(labels == self.labels):
            labels = self.labels
        if (not hasattr(self, "embeddings")) or (self.embeddings is None) or np.any(labels != self.labels):
            if path.exists():
                embeds = pt.load(path)
                if embeds.shape[0] == len(labels):
                    return embeds
            embeds = self.embedding_model.encode(labels)
            pt.save(embeds, self.path.format(field_name=self.field_name))
            return embeds
        else:
            return self.embeddings
20 changes: 20 additions & 0 deletions src/syml/diagnostool/semantic/semantIA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from .label_embedding import LabelEmbedder
from .vizualization import plot_similarities
from .vizualization import plot_umap


class SementIA:
    """High-level entry point for semantic exploration of a set of labels.

    Wraps a ``LabelEmbedder`` and exposes ready-made figures for inspecting
    the embedded labels.
    """

    def __init__(
        self,
        labels,
        path="",
        field_name="",
    ):
        # Keep the raw labels and delegate all embedding work to the embedder.
        self.labels = labels
        self.embedder = LabelEmbedder(labels=self.labels, field_name=field_name, path=path)

    def scatter_labels(self):
        """Return a UMAP scatter plot of the label embeddings."""
        embedder = self.embedder
        return plot_umap(embedder.embeddings, embedder.labels)

    def heatmap_similiarities(self):
        """Return a heatmap figure of pairwise label similarities."""
        embedder = self.embedder
        return plot_similarities(embedder.similarities, embedder.labels)
46 changes: 46 additions & 0 deletions src/syml/diagnostool/semantic/vizualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pandas as pd
import plotly.express as px
import umap


def plot_umap(embeddings, labels, n_components=2, n_neighbors=2, min_dist=0.1, metric="euclidean"):
    """
    Reduce embeddings to 2D or 3D with UMAP and return a Plotly scatter figure.

    Parameters:
        embeddings (array-like): One embedding vector per label.
        labels (list of str): Text shown next to each plotted point.
        n_components (int): Number of dimensions to reduce to (2 or 3).
        n_neighbors (int): The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
        min_dist (float): The effective minimum distance between embedded points.
        metric (str): Distance metric used by UMAP.

    Raises:
        ValueError: If n_components is not 2 or 3.
    """
    # Validate up front so a bad value fails *before* the expensive UMAP fit
    # (previously the error was raised only after fit_transform had run).
    if n_components not in (2, 3):
        raise ValueError("n_components must be 2 or 3")

    # Apply UMAP
    reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist, metric=metric)
    reduced_data = reducer.fit_transform(embeddings)

    # Create a DataFrame for plotting
    axes = ["UMAP1", "UMAP2", "UMAP3"][:n_components]
    df = pd.DataFrame(reduced_data, columns=axes)
    df["Label"] = labels
    if n_components == 2:
        fig = px.scatter(df, x="UMAP1", y="UMAP2", text="Label", title="UMAP 2D Plot")
    else:
        fig = px.scatter_3d(df, x="UMAP1", y="UMAP2", z="UMAP3", text="Label", title="UMAP 3D Plot")

    return fig


def plot_similarities(similarities, labels, colors="RdBu_r"):
    """Render a similarity matrix as a Plotly heatmap with labelled axes."""
    return px.imshow(similarities, x=labels, y=labels, color_continuous_scale=colors)
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import streamlit as st

from syml.diagnostool.semantic.label_clustering import LabelAnalysis
from syml.diagnostool.semantic.semantIA import SementIA
from syml.diagnostool.semantic.utils import df_typo
from syml.interract.page_class import BasePageElement

Expand Down Expand Up @@ -47,6 +47,9 @@ def analysis(self):
n_typos = st.slider("Number of typos", 1, 20, step=1, value=5)
data = df_typo(data, n_typos=n_typos)

label_analysis = LabelAnalysis(labels=data, path="../python-syml/data/embeddings_{field_name}.pt", field_name=to_inspect)
fig = label_analysis.plot_umap()
label_analysis = SementIA(labels=data, path="../python-syml/data/embeddings_{field_name}.pt", field_name=to_inspect)
fig = label_analysis.scatter_labels()
st.plotly_chart(fig)

fig = label_analysis.heatmap_similiarities()
st.plotly_chart(fig)
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ basepython =
py310: {env:TOXPYTHON:python3.10}
py311: {env:TOXPYTHON:python3.11}
py312: {env:TOXPYTHON:python3.12}
{bootstrap,clean,check,report,docs,coveralls}: {env:TOXPYTHON:python3}
{bootstrap,clean,check,report,docs}: {env:TOXPYTHON:python3}
setenv =
PYTHONPATH={toxinidir}/tests
PYTHONUNBUFFERED=yes
Expand Down