Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.7.2
current_version = 0.7.4
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion .cookiecutterrc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ default_context:
sphinx_theme: "furo"
test_matrix_separate_coverage: "no"
tests_inside_package: "no"
version: "0.7.2"
version: "0.7.4"
version_manager: "bump2version"
website: ""
year_from: "2024"
Expand Down
8 changes: 0 additions & 8 deletions .github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,3 @@ jobs:
with:
project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
coverage-reports: coverage.xml
finish:
needs: lib_test
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
- uses: coverallsapp/github-action@v2
with:
parallel-finished: true
24 changes: 13 additions & 11 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
|SyML-Logo|

.. |SyML-Logo| image:: https://github.com/KillianVar/python-syml/raw/dev/docs/source/img/banner.png

========
Overview
========
=========================
Repo Health Information
=========================

.. start-badges

Expand All @@ -25,11 +21,11 @@ Overview
:alt: GitHub Actions Build Status
:target: https://github.com/KillianVar/python-syml/actions

.. |coverage| image:: https://app.codacy.com/project/badge/Coverage/8d0cc71c89524ea1ab77e9724ab74df9
.. |coverage| image:: https://app.codacy.com/project/badge/Coverage/011ba7fa745b4a3083ea714909699110
:alt: Coverage Build Status
:target: https://app.codacy.com/gh/KillianVar/python-syml/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_coverage

.. |codacy| image:: https://img.shields.io/codacy/grade/c67d6aeb590745b5832fd8e5d3d7717c.svg
.. |codacy| image:: https://app.codacy.com/project/badge/Grade/011ba7fa745b4a3083ea714909699110
:target: https://app.codacy.com/gh/KillianVar/python-syml/dashboard
:alt: Codacy Code Quality Status

Expand All @@ -49,14 +45,20 @@ Overview
:alt: Supported implementations
:target: https://pypi.org/project/syml

.. |commits-since| image:: https://img.shields.io/github/commits-since/KillianVar/python-syml/v0.7.2.svg
.. |commits-since| image:: https://img.shields.io/github/commits-since/KillianVar/python-syml/v0.7.4.svg
:alt: Commits since latest release
:target: https://github.com/KillianVar/python-syml/compare/v0.7.2...main
:target: https://github.com/KillianVar/python-syml/compare/v0.7.4...main



.. end-badges

|SyML-Logo|

.. |SyML-Logo| image:: https://github.com/KillianVar/python-syml/raw/dev/docs/source/img/banner.png



SyML (Systematic Machine Learning) is a library built to make Machine Learning simpler, by using SOTA ML, xAI and
visualization methods.

Expand Down
10 changes: 1 addition & 9 deletions ci/templates/.github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,4 @@ jobs:
env:
TOXPYTHON: '{{ '${{ matrix.toxpython }}' }}'
run: >
tox -e {{ '${{ matrix.tox_env }}' }} -v
finish:
needs: test
if: {{ '${{ always() }}' }}
runs-on: ubuntu-latest
steps:
- uses: coverallsapp/github-action@v2
with:
parallel-finished: true
tox -e {{ '${{ matrix.tox_env }}' }} -v
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
year = "2024"
author = "Killian Varescon"
copyright = f"{year}, {author}"
version = release = "0.7.2"
version = release = "0.7.4"

pygments_style = "trac"
templates_path = ["."]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ authors = [
readme = "README.rst"
license = {file = "LICENSE"}
description = "SyML (Systematic Machine Learning) is a library built to make Machine Learning simpler, by using SOTA ML, xAI and vizualisation methods."
version = "0.7.2"
version = "0.7.4"
requires-python = ">=3.8"
classifiers = [
# complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def read(*names, **kwargs):
re.compile("^.. start-badges.*^.. end-badges", re.M | re.S).sub("", read("README.rst")),
re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", read("CHANGELOG.rst")),
),
url="https://github.com/KillianVar/python-syml",
url="https://github.com/python-SyML/python-syml",
packages=find_packages("src"),
package_dir={"": "src"},
py_modules=[path.stem for path in Path("src").glob("*.py")],
Expand Down
2 changes: 1 addition & 1 deletion src/syml/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.7.2"
__version__ = "0.7.4"

from syml.interract.discovery_dashboard.dashboard import Dashboard

Expand Down
21 changes: 21 additions & 0 deletions src/syml/diagnostool/semantic/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from sklearn.cluster import AgglomerativeClustering


class Clustering:
    """Group label embeddings with agglomerative (hierarchical) clustering."""

    def __init__(self, embeddings, labels):
        # embeddings: one vector per label; labels: the human-readable strings.
        self.embeddings = embeddings
        self.labels = labels

    def init_clustering(self, n_clusters=None, distance_threshold=1.0):
        """Build the sklearn model.

        Pass ``n_clusters=None`` to let ``distance_threshold`` decide how many
        clusters are formed.
        """
        self.clustering_model = AgglomerativeClustering(n_clusters=n_clusters, distance_threshold=distance_threshold)

    def find_cluster(self):
        """Fit the model and return ``{"cluster <id>": [labels...]}``.

        Also stores the raw cluster ids in ``self.labels_cluster`` and the
        grouping in ``self.clustered_sentences``.
        """
        self.clustering_model.fit(self.embeddings)
        self.labels_cluster = self.clustering_model.labels_

        # BUG FIX: the previous membership test compared the raw integer
        # cluster id against keys of the form "cluster <id>", so it was always
        # True, the list was re-created on every iteration, and only the last
        # label per cluster survived.  The result was also never returned.
        clustered_sentences = {}
        for sentence_id, cluster_id in enumerate(self.labels_cluster):
            clustered_sentences.setdefault(f"cluster {cluster_id}", []).append(self.labels[sentence_id])

        self.clustered_sentences = clustered_sentences
        return clustered_sentences
90 changes: 0 additions & 90 deletions src/syml/diagnostool/semantic/label_clustering.py

This file was deleted.

59 changes: 59 additions & 0 deletions src/syml/diagnostool/semantic/label_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from pathlib import Path

import numpy as np
import torch as pt
from sentence_transformers import SentenceTransformer

from .clustering import Clustering
from .utils import get_device


class LabelEmbedder:
    """Embed a list of text labels with a sentence-transformers model.

    Embeddings are cached on disk at ``path`` — a format string containing a
    ``{field_name}`` placeholder — and reused when the cached tensor has one
    row per label.  Reassigning ``labels`` recomputes embeddings, the
    similarity matrix, and the clustering helper.
    """

    def __init__(
        self,
        labels=None,
        embedding_model="all-MiniLM-L6-v2",
        path="",
        field_name="",
    ):
        self._labels = labels
        self.field_name = field_name
        self.path = path
        self.device = get_device()
        self.embedding_model = self.init_model(embedding_model, device=self.device)
        self.embeddings = self.embed(labels=self.labels)
        # Pairwise similarity matrix between all label embeddings.
        self._similarities = self.embedding_model.similarity(self.embeddings, self.embeddings)
        self.clustering = Clustering(self.embeddings, self.labels)

    def init_model(self, embedding_model, device):
        """Load the SentenceTransformer identified by ``embedding_model``."""
        return SentenceTransformer(embedding_model, device=device)

    @property
    def labels(self):
        return self._labels

    @property
    def similarities(self):
        return self._similarities

    @labels.setter
    def labels(self, labels):
        """Replace the labels and recompute embeddings/similarities/clustering."""
        self._labels = labels
        # BUG FIX: invalidate the in-memory cache before re-embedding.
        # ``embed`` compares its argument against ``self.labels`` (already
        # updated above), so without this it always took the cached branch and
        # returned the stale embeddings of the *old* labels.
        self.embeddings = None
        self.embeddings = self.embed(labels=self.labels)
        self._similarities = self.embedding_model.similarity(self.embeddings, self.embeddings)
        self.clustering = Clustering(self.embeddings, self.labels)

    def embed(self, labels=None):
        """Return embeddings for ``labels`` (default: ``self.labels``).

        Loads the on-disk cache when it exists and has one row per label;
        otherwise encodes the labels and (re)writes the cache.
        NOTE(review): the row-count check cannot detect a cache written for
        *different* labels of the same length — confirm cache files are kept
        strictly per field as intended.
        """
        path = Path(self.path.format(field_name=self.field_name))
        if labels is None or np.all(labels == self.labels):
            labels = self.labels
        if (not hasattr(self, "embeddings")) or (self.embeddings is None) or np.any(labels != self.labels):
            if path.exists():
                embeds = pt.load(path)
                if embeds.shape[0] == len(labels):
                    return embeds
            embeds = self.embedding_model.encode(labels)
            pt.save(embeds, self.path.format(field_name=self.field_name))
            return embeds
        else:
            return self.embeddings
20 changes: 20 additions & 0 deletions src/syml/diagnostool/semantic/semantIA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from .label_embedding import LabelEmbedder
from .vizualization import plot_similarities
from .vizualization import plot_umap


class SementIA:
    """High-level entry point for semantic exploration of a set of labels.

    Wraps a ``LabelEmbedder`` and exposes ready-made figures for inspecting
    the embedded labels.
    """

    def __init__(
        self,
        labels,
        path="",
        field_name="",
    ):
        # Keep the raw labels and delegate all embedding work to the embedder.
        self.labels = labels
        self.embedder = LabelEmbedder(labels=self.labels, field_name=field_name, path=path)

    def scatter_labels(self):
        """Return a UMAP scatter plot of the label embeddings."""
        embedder = self.embedder
        return plot_umap(embedder.embeddings, embedder.labels)

    def heatmap_similiarities(self):
        """Return a heatmap figure of pairwise label similarities."""
        embedder = self.embedder
        return plot_similarities(embedder.similarities, embedder.labels)
46 changes: 46 additions & 0 deletions src/syml/diagnostool/semantic/vizualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pandas as pd
import plotly.express as px
import umap


def plot_umap(embeddings, labels, n_components=2, n_neighbors=2, min_dist=0.1, metric="euclidean"):
    """
    Reduce embeddings to 2D or 3D with UMAP and return a Plotly scatter figure.

    Parameters:
        embeddings (array-like): One embedding vector per label.
        labels (list of str): Text shown next to each plotted point.
        n_components (int): Number of dimensions to reduce to (2 or 3).
        n_neighbors (int): The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
        min_dist (float): The effective minimum distance between embedded points.
        metric (str): Distance metric used by UMAP.

    Raises:
        ValueError: If n_components is not 2 or 3.
    """
    # Validate up front so a bad value fails *before* the expensive UMAP fit
    # (previously the error was raised only after fit_transform had run).
    if n_components not in (2, 3):
        raise ValueError("n_components must be 2 or 3")

    # Apply UMAP
    reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist, metric=metric)
    reduced_data = reducer.fit_transform(embeddings)

    # Create a DataFrame for plotting
    axes = ["UMAP1", "UMAP2", "UMAP3"][:n_components]
    df = pd.DataFrame(reduced_data, columns=axes)
    df["Label"] = labels
    if n_components == 2:
        fig = px.scatter(df, x="UMAP1", y="UMAP2", text="Label", title="UMAP 2D Plot")
    else:
        fig = px.scatter_3d(df, x="UMAP1", y="UMAP2", z="UMAP3", text="Label", title="UMAP 3D Plot")

    return fig


def plot_similarities(similarities, labels, colors="RdBu_r"):
    """Render a similarity matrix as a Plotly heatmap with labelled axes."""
    return px.imshow(similarities, x=labels, y=labels, color_continuous_scale=colors)
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import streamlit as st

from syml.diagnostool.semantic.label_clustering import LabelAnalysis
from syml.diagnostool.semantic.semantIA import SementIA
from syml.diagnostool.semantic.utils import df_typo
from syml.interract.page_class import BasePageElement

Expand Down Expand Up @@ -47,6 +47,9 @@ def analysis(self):
n_typos = st.slider("Number of typos", 1, 20, step=1, value=5)
data = df_typo(data, n_typos=n_typos)

label_analysis = LabelAnalysis(labels=data, path="../python-syml/data/embeddings_{field_name}.pt", field_name=to_inspect)
fig = label_analysis.plot_umap()
label_analysis = SementIA(labels=data, path="../python-syml/data/embeddings_{field_name}.pt", field_name=to_inspect)
fig = label_analysis.scatter_labels()
st.plotly_chart(fig)

fig = label_analysis.heatmap_similiarities()
st.plotly_chart(fig)
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ basepython =
py310: {env:TOXPYTHON:python3.10}
py311: {env:TOXPYTHON:python3.11}
py312: {env:TOXPYTHON:python3.12}
{bootstrap,clean,check,report,docs,coveralls}: {env:TOXPYTHON:python3}
{bootstrap,clean,check,report,docs}: {env:TOXPYTHON:python3}
setenv =
PYTHONPATH={toxinidir}/tests
PYTHONUNBUFFERED=yes
Expand Down