Loading the data file and combining the transcripts into paragraphs, removing short sentences and filler noise from the text

In [6]:
import re
import pandas as pd

# Interjections / filler words used to detect low-content transcript sentences.
# Kept as a set, so every entry only needs to be listed once.
filler_words = {
    "ja", "nein", "hm", "mhm", "aha", "ach", "ach so", "ok", "ne",
    "gut", "und", "eben", "nicht", "wieder", "schon", "naja",
    "wieso", "wieso nicht",
}

# Minimum thresholds a sentence must reach to be kept
min_words, min_unique_words, min_characters = 2, 2, 5

def has_repeated_filler_patterns(sentence, filler_words):
    """
    Report whether a filler word is immediately repeated in ``sentence``.

    A repetition is the same filler occurring twice in a row, separated only
    by whitespace and/or commas (e.g. "ja, ja" or "hm hm"). Matching is
    case-insensitive so that a capitalised sentence start such as "Ja, ja"
    is detected as well.

    Args:
        sentence: The text to inspect.
        filler_words: Iterable of filler words/phrases. Empty entries are ignored.

    Returns:
        True if a repeated filler is found; False otherwise, including when
        no filler words are supplied.
    """
    # Longest first (ties alphabetical) so the generated pattern is deterministic
    # even though the fillers usually arrive as an unordered set.
    candidates = sorted({word for word in (filler_words or ()) if word}, key=lambda w: (-len(w), w))
    if not candidates:
        # An empty alternation would match around any separator and flag every sentence.
        return False

    alternation = '|'.join(re.escape(word) for word in candidates)
    # A single `[\s,]+` separator matches exactly what the former `(?:[\s,]+)+` did,
    # but without the nested quantifier that backtracks exponentially on long
    # runs of spaces/commas.
    pattern = r'\b(' + alternation + r')\b[\s,]+\1\b'

    return re.search(pattern, sentence, flags=re.IGNORECASE) is not None

def filter_short_and_filler_sentences(text, filler_words, min_words, min_unique_words, min_characters):
    """
    Remove sentences that are too short or that repeat a filler word.

    ``text`` is cut at every '. '. A piece is kept only if it has at least
    ``min_words`` whitespace-separated tokens, at least ``min_unique_words``
    distinct tokens, at least ``min_characters`` characters, and is not
    flagged by ``has_repeated_filler_patterns``. The surviving pieces are
    joined back together with '. ' in their original order.
    """
    def is_informative(candidate):
        # Cheap length checks first; the regex check only runs for candidates that pass them.
        tokens = candidate.split()
        if len(tokens) < min_words:
            return False
        if len(set(tokens)) < min_unique_words:
            return False
        if len(candidate) < min_characters:
            return False
        return not has_repeated_filler_patterns(candidate, filler_words)

    return '. '.join(piece for piece in text.split('. ') if is_informative(piece))

# File path
file_path = 'Interviews_Dataset.xlsx'

# Loading the Excel file; sheet_name=None makes pandas return a dict {sheet name: DataFrame}
excel_data = pd.read_excel(file_path, sheet_name=None)

# Combining the sentences to paragraphs and applying the filtering
combined_paragraphs = {}

for sheet_name, df in excel_data.items():
    # Drop the 'Timecode' and 'Sprecher' columns
    df = df.drop(columns=['Timecode', 'Sprecher'])

    # Empty cells are read as NaN. They must be removed *before* the string
    # conversion, otherwise astype(str) turns each of them into the literal
    # text "nan", which would end up inside the paragraph.
    transcript = df['Transkript'].dropna().astype(str)

    # Convert the 'Transkript' column to a single string
    paragraph = ' '.join(transcript.tolist())

    # Filter short sentences and apply filler pattern filtering
    filtered_paragraph = filter_short_and_filler_sentences(paragraph, filler_words, min_words, min_unique_words, min_characters)

    # Save the filtered paragraph by sheet name in the dictionary
    combined_paragraphs[sheet_name] = filtered_paragraph

# The `combined_paragraphs` dictionary now contains the filtered paragraphs for each sheet in memory.
# You can now use `combined_paragraphs` as needed in your code.

print("Filtered paragraphs have been processed and stored in memory.")


Filtered paragraphs have been processed and stored in memory.


Feeding the preprocessed data into the embedding model to generate the embeddings

In [None]:
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Load the model Jina Embeddings
# NOTE(review): trust_remote_code=True executes Python code shipped with the
# model repository; keep it only for repositories you trust.
model_name = 'jinaai/jina-embeddings-v2-base-de'
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Concatenate all filtered paragraphs into one large text string
combined_text = ' '.join(combined_paragraphs.values())

# Split the concatenated text into sentences; blank fragments (e.g. produced by
# a trailing '. ') carry no content and are not embedded
sentences = [sentence for sentence in combined_text.split('. ') if sentence.strip()]

batch_size = 32
embeddings = []

# Process sentences in batches and compute embeddings
for i in range(0, len(sentences), batch_size):
    batch_sentences = sentences[i:i+batch_size]
    inputs = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs).last_hidden_state
        # Mean-pool over real tokens only. A plain mean over the sequence axis
        # would also average the padding positions, so a sentence's embedding
        # would depend on how long the other sentences in its batch are.
        mask = inputs['attention_mask'].unsqueeze(-1).to(outputs.dtype)
        token_sum = (outputs * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        batch_embeddings = (token_sum / token_count).cpu().numpy()
        embeddings.extend(batch_embeddings)

# Convert the embeddings list to a NumPy array
embeddings = np.array(embeddings)

print(f"Generated {len(embeddings)} embeddings.")


Reducing the dimensionality of the embeddings for clustering

In [10]:
import umap.umap_ as umap

# UMAP is stochastic: without a fixed seed every run yields a different
# projection and therefore different clusters downstream. Pinning
# random_state makes the reduction repeatable.
umap_model = umap.UMAP(
    n_neighbors=20,
    n_components=10,
    min_dist=0.1,
    metric='cosine',
    low_memory=True,
    random_state=42,
)
umap_embeddings = umap_model.fit_transform(embeddings)

Clustering the reduced embeddings and saving the result to a JSON file

In [12]:
import json
import numpy as np
import hdbscan

# Density-based clustering (HDBSCAN) on the UMAP-reduced vectors
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=15,
    cluster_selection_epsilon=0.4,
    metric='euclidean',
    cluster_selection_method='eom',
)
clusters = clusterer.fit_predict(umap_embeddings)

# Label -1 is HDBSCAN's noise label and is not counted as a cluster
distinct_labels = np.unique(clusters)
num_clusters = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
print(f"Number of clusters: {num_clusters}")

# One record per label (ascending, noise included) holding its member sentences in corpus order
clusters_list = []
for cluster in distinct_labels:
    member_positions = np.flatnonzero(clusters == cluster)
    clusters_list.append({
        # plain Python int so the record is JSON-serialisable
        "cluster_id": int(cluster),
        "sentences": [sentences[position] for position in member_positions],
    })

# Persist the records for the topic-generation step
output_json_file = 'clusters.json'
with open(output_json_file, 'w', encoding='utf-8') as f:
    json.dump(clusters_list, f, ensure_ascii=False, indent=4)

print(f"Clusters have been saved to '{output_json_file}'")

Number of clusters: 6
Clusters have been saved to 'clusters.json'


Loading the LLM model

In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Tiny logging helper so progress and failures are easy to spot in the cell output
def debug_message(message):
    """Print ``message`` to stdout behind a "[DEBUG] " prefix."""
    print("[DEBUG] {}".format(message))

# Load the model and tokenizer once at the beginning
import os  # only needed here, to read the access token from the environment

# SECURITY: never hard-code the Hugging Face access token in the notebook.
# It is read from the HF_TOKEN environment variable instead; when the variable
# is unset, None is passed and the credentials stored by `huggingface-cli login`
# are used. Any token that was previously committed in this file must be
# treated as leaked and revoked in the Hugging Face account settings.
llm_name = "meta-llama/Meta-Llama-3-8B"
hf_token = os.environ.get("HF_TOKEN")

try:
    debug_message("Loading model and tokenizer...")
    model = AutoModelForCausalLM.from_pretrained(llm_name, token=hf_token)
    tokenizer = AutoTokenizer.from_pretrained(llm_name, token=hf_token)
    debug_message("Model and tokenizer loaded successfully.")
except Exception as e:
    # Log for context, then re-raise: nothing below can work without the model
    debug_message(f"Error loading model or tokenizer: {str(e)}")
    raise


Prompt function that generates a topic name for each cluster

In [None]:

def generate_topic(sentences):
    """
    Ask the loaded language model for a short topic name describing ``sentences``.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        sentences: Iterable of sentence strings belonging to one cluster.

    Returns:
        The text generated by the model after the prompt (at most 10 new
        tokens), stripped of surrounding whitespace. If anything goes wrong
        the error is logged via ``debug_message`` and the string
        "Topic generation failed" is returned instead.
    """
    try:
        # Combine sentences into a single string
        combined_sentences = " ".join(sentences)
        
        # Create a prompt for the model
        prompt = f"""Analysiere die folgenden Sätze und generiere einen präzisen, aussagekräftigen Themennamen von maximal 5 Worten. \
              Der Themenname sollte den Kerninhalt erfassen und spezifisch sein. \
                Sätze: {combined_sentences} Thema:"""
        
        # Tokenize input (on the same device as the model) and generate the topic
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=10, num_return_sequences=1)
        
        # A decoder-only model returns prompt + continuation. Decode only the
        # continuation; otherwise the "topic" would contain the whole prompt
        # including every sentence of the cluster.
        prompt_length = inputs["input_ids"].shape[1]
        generated_tokens = outputs[0][prompt_length:]
        return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    
    except Exception as e:
        # Best effort: one failing cluster must not abort the whole run
        debug_message(f"Error during topic generation: {str(e)}")
        return "Topic generation failed"




Loading the JSON file and feeding the clustered data into the LLM

In [None]:
# Read back the cluster records written by the clustering step
input_json_file = 'clusters.json'
try:
    debug_message(f"Loading clusters from {input_json_file}...")
    with open(input_json_file, encoding='utf-8') as f:
        clusters_list = json.load(f)
except Exception as e:
    # Log for context, then propagate: the steps below need the clusters
    debug_message(f"Error loading clusters: {e!s}")
    raise
else:
    debug_message("Clusters loaded successfully.")

# Collects every cluster record that received a generated topic
clusters_with_topics = []

def process_clusters_sequentially(max_sentences=500):
    """
    Generate a topic for every eligible cluster in ``clusters_list``.

    The noise cluster (id -1) and clusters with more than ``max_sentences``
    sentences are skipped. Each processed cluster dict gets a "topic" key
    added in place and is appended to the module-level
    ``clusters_with_topics`` list. A failure in one cluster is logged and the
    loop continues with the next cluster.

    Args:
        max_sentences: Largest cluster size (in sentences) that is still sent
            to the model. Defaults to 500.
    """
    for cluster in clusters_list:
        # Resolve a printable id *before* the try block so the error handler can
        # always name the cluster, even when the failure is the id lookup itself
        # (previously that raised a NameError or reported the previous cluster's id).
        cluster_id = cluster.get("cluster_id", "<unknown>") if isinstance(cluster, dict) else "<unknown>"
        try:
            # Extract cluster information
            cluster_id = cluster["cluster_id"]
            sentences = cluster["sentences"]
            
            # Skip the cluster with ID -1 (noise) and clusters that are too large
            if cluster_id == -1:
                debug_message(f"Skipping noise cluster {cluster_id}")
                continue
            if len(sentences) > max_sentences:
                debug_message(f"Skipping cluster {cluster_id} due to too many sentences ({len(sentences)}).")
                continue
            
            # Debug: Show the cluster info
            debug_message(f"Processing cluster {cluster_id} with {len(sentences)} sentences.")
            
            # Generate a topic for the cluster
            topic = generate_topic(sentences)
            
            # Add the generated topic to the cluster data
            cluster["topic"] = topic
            
            # Append the updated cluster to the results list
            clusters_with_topics.append(cluster)
            
            # Clear memory after processing each cluster
            torch.cuda.empty_cache()  # If you're using a GPU, clear the cache
            
            debug_message(f"Processed cluster {cluster_id}, Topic: {topic}")
        
        except Exception as e:
            debug_message(f"An error occurred while processing cluster {cluster_id}: {str(e)}")

# Run topic generation over every loaded cluster
try:
    process_clusters_sequentially()
except Exception as e:
    # Log for context, then propagate
    debug_message(f"Error during cluster processing: {e!s}")
    raise

# Write the enriched cluster records to a new JSON file.
# A failure here is only logged (not re-raised); the results stay available in memory.
output_json_file = 'clusters_with_topics.json'
try:
    debug_message(f"Saving clusters with topics to {output_json_file}...")
    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(clusters_with_topics, f, ensure_ascii=False, indent=4)
except Exception as e:
    debug_message(f"Error saving clusters: {e!s}")
else:
    debug_message("Clusters with topics saved successfully.")


Visualisation of clustered data in the main text corpus

In [None]:
import random
from collections import Counter

# Requires 'clusters_with_topics' from the topic-generation step.
# sentence_cluster_map: sentence text -> cluster id (a sentence that occurs in
#                       several clusters keeps the id of the last one seen)
# topic_map:            cluster id -> topic label (falls back to "Cluster <id>")
sentence_cluster_map = {}
topic_map = {}

for cluster in clusters_with_topics:
    cluster_id = cluster["cluster_id"]
    topic = cluster.get("topic", f"Cluster {cluster_id}")
    topic_map[cluster_id] = topic
    sentence_cluster_map.update(dict.fromkeys(cluster["sentences"], cluster_id))

def generate_random_color():
    """Return a random colour as a lowercase '#rrggbb' hex string."""
    red = random.randint(0, 255)
    green = random.randint(0, 255)
    blue = random.randint(0, 255)
    return "#{:02x}{:02x}{:02x}".format(red, green, blue)

# Colour table: noise (-1) is fixed to black, every other cluster gets a random colour
cluster_colors = {-1: "#000000"}  # Noise points in black
cluster_colors.update(
    (cluster, generate_random_color())
    for cluster in set(sentence_cluster_map.values())
    if cluster != -1
)

def color_sentence(sentence, cluster):
    """
    Wrap ``sentence`` in a coloured ``<span>`` tagged with its cluster id.

    Args:
        sentence: The sentence text (plain text, not HTML).
        cluster: Cluster id used to pick the colour from ``cluster_colors``.

    Returns:
        An HTML ``<span class="sentence">`` string. The ``data-cluster``
        attribute carries the id recorded in ``sentence_cluster_map`` (or
        ``cluster`` if the sentence is not in the map); an unknown cluster is
        drawn in black.
    """
    from html import escape as escape_html  # local import keeps the cell's import list untouched

    color = cluster_colors.get(cluster, "#000000")
    cluster_id = sentence_cluster_map.get(sentence, cluster)
    # Escape the text: transcript sentences may contain '<', '>' or '&',
    # which would otherwise be interpreted as markup and break the page.
    return f'<span class="sentence" data-cluster="{cluster_id}" style="color:{color}">{escape_html(sentence)}</span>'

# Render every mapped sentence as a coloured span (in map order) and join them into one text
colored_text = [
    color_sentence(sentence, cluster)
    for sentence, cluster in sentence_cluster_map.items()
]

html_output = '. '.join(colored_text)

# Create the legend with topics
from html import escape as _escape_html  # topics are model-generated text and must not be emitted as raw HTML

cluster_counts = Counter(sentence_cluster_map.values())

# One <li> per cluster, ordered by cluster id; collected in a list and joined once
legend_items = []
for cluster, color in sorted(cluster_colors.items()):
    count = cluster_counts[cluster]
    if cluster == -1:
        legend_items.append(f'<li style="color:{color}">Noise: {count} sentences</li>')
    else:
        # Escape the label so characters such as '<' or '&' in a generated
        # topic cannot break (or inject into) the page markup.
        topic = _escape_html(str(topic_map.get(cluster, f"Cluster {cluster}")))
        legend_items.append(
            f'<li class="legend-item" data-cluster="{cluster}" style="color:{color}; cursor:pointer;">{topic}: {count} sentences</li>'
        )

legend = '<div class="legend"><h2>Cluster Legend</h2><ul>' + ''.join(legend_items) + '</ul></div>'

# Add interactivity with JavaScript.
# The script is embedded verbatim into the generated page: clicking an element
# with class "legend-item" gives every ".sentence" element whose data-cluster
# attribute equals the clicked item's data-cluster a yellow background and
# clears the background of all other sentences.
javascript = """
<script>
    document.querySelectorAll('.legend-item').forEach(item => {
        item.addEventListener('click', function() {
            var cluster = this.getAttribute('data-cluster');
            document.querySelectorAll('.sentence').forEach(sentence => {
                if (sentence.getAttribute('data-cluster') === cluster) {
                    sentence.style.backgroundColor = 'yellow';  // Highlight selected cluster sentences
                } else {
                    sentence.style.backgroundColor = '';  // Reset other sentences
                }
            });
        });
    });
</script>
"""

# Save the HTML output to a file.
# The file is written as UTF-8, so the page must declare that encoding;
# without the <meta charset> tag a browser may guess a legacy encoding and
# render umlauts (ä, ö, ü, ß) as garbage.
with open('colored_clusters_with_legend.html', 'w', encoding='utf-8') as f:
    f.write(f"""
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            .legend {{ margin-bottom: 20px; }}
            .legend-item {{ margin-bottom: 5px; }}
            .sentence {{ padding: 2px; }}
        </style>
    </head>
    <body>
        {legend}
        <div class="text">{html_output}</div>
        {javascript}
    </body>
    </html>
    """)
