In [3]:
# First let import the most necessary libs
import pandas as pd
import numpy as np
# Library to import pre-trained model for sentence embeddings
from sentence_transformers import SentenceTransformer
# Calculate similarities between sentences
from sklearn.metrics.pairwise import cosine_similarity
# Visualization library
import seaborn as sns
import matplotlib.pyplot as plt
# package for finding local minimas
from scipy.signal import argrelextrema
import math
import pysbd


In [4]:
seg = pysbd.Segmenter(language="en", clean=False)

# First lets load the transcript
with open('../data/podcast_transcript.txt') as f:
    doc = f.readlines()
    f.close()
# We need to split whole text into sentences first.
sentences = seg.segment(doc[0])

print('Loaded Text String')


Loaded Text String


In [5]:
# Loading a model - don't try it at home, it might take some time - it is 420 mb
model = SentenceTransformer('all-mpnet-base-v2')


In [6]:
# Get the length of each sentence
sentece_length = [len(each) for each in sentences]
# Determine longest outlier
long = np.mean(sentece_length) + np.std(sentece_length) *2
# Determine shortest outlier
short = np.mean(sentece_length) - np.std(sentece_length) *2
# Shorten long sentences
text = ''
for each in sentences:
    if len(each) > long:
        # let's replace all the commas with dots
        comma_splitted = each.replace(',', '.')
    else:
        text+= f'{each}. '
sentences = text.split('. ')
# Now let's concatenate short ones
text = ''
for each in sentences:
    if len(each) < short:
        text+= f'{each} '
    else:
        text+= f'{each}. '


In [7]:
# Split text into sentences
sentences = text.split('. ')
# Embed sentences
embeddings = model.encode(sentences)
print(embeddings.shape)


(1030, 768)


In [22]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.stats import entropy

# Assuming 'embeddings' is a NumPy array of shape (num_sentences, 768)
# Each row of 'embeddings' represents a 768-dimensional embedding

# Step 1: Normalize the embeddings
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
normalized_embeddings = embeddings / norms

# Step 2: Estimate density using k-nearest neighbors
knn = NearestNeighbors(n_neighbors=5)  # You can adjust the number of neighbors
knn.fit(normalized_embeddings)
distances, indices = knn.kneighbors(normalized_embeddings)

# Use the distances to estimate density. Here we use the distance to the k-th nearest neighbor as a proxy for density.
densities = 1 / (distances[:, -1] + 1e-10)  # Add a small constant to avoid division by zero

# Step 3: Normalize the densities to get a probability distribution
probabilities = densities / np.sum(densities)

# Step 4: Calculate the entropy of the distribution
diversity_entropy = entropy(probabilities)

print("Diversity Entropy:", diversity_entropy)

# Step 5: Select a subset of sentences to maximize entropy
# Here, we'll simply sort the sentences by their estimated density and select the top-N
selected_indices = np.argsort(-densities)
selected_sentences = embeddings[selected_indices]

print("Selected Sentences Indices:", selected_indices)
len(selected_indices)


Diversity Entropy: 6.93228488330834
Selected Sentences Indices: [ 443  857  132 ... 1019  737  486]


1030

In [26]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.ndimage import gaussian_filter1d

# Assuming 'embeddings' is a NumPy array of shape (num_sentences, embedding_dim)
# Each row of 'embeddings' represents the embedding of a sentence
# 'sentences' is a list of strings, where each string is a sentence

# Step 1: Normalize the embeddings
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
normalized_embeddings = embeddings / norms

# Step 2: Estimate density using k-nearest neighbors
knn = NearestNeighbors(n_neighbors=10)  # Adjust the number of neighbors as needed
knn.fit(normalized_embeddings)
distances, indices = knn.kneighbors(normalized_embeddings)

# Use the distances to estimate density
densities = 1 / (np.mean(distances, axis=1) + 1e-10)  # Using mean distance to neighbors

# Step 3: Smooth the density estimates (optional)
smoothed_densities = gaussian_filter1d(densities, sigma=5)  # Adjust sigma as needed

# Step 4: Identify valleys in the density as potential chapter boundaries
valley_indices = np.where(smoothed_densities < np.mean(smoothed_densities))[0]

# Step 5: Segment text into chapters based on valley indices
chapters = []
start_idx = 0
for end_idx in valley_indices:
    chapter = " ".join(sentences[start_idx:end_idx])
    chapters.append(chapter)
    start_idx = end_idx
chapters.append(" ".join(sentences[start_idx:]))  # Add the last chapter

# Step 6: Print the chapters
for i, chapter in enumerate(chapters, start=1):
    print(f"Chapter {i}:\n{chapter}\n{'-'*50}")


Chapter 1:

--------------------------------------------------
Chapter 2:
In the year 1625, an Italian nobleman named Pietro de la Valet went on a tour of the Middle East
--------------------------------------------------
Chapter 3:
 De la Valle was a prolific traveler
--------------------------------------------------
Chapter 4:
 He journeyed around Asia, North Africa and even India
--------------------------------------------------
Chapter 5:
 He married an Assyrian Christian princess in Damascus and now the two of them traveled together, journeying by horseback and camel, accompanied by local guides
--------------------------------------------------
Chapter 6:
 At this time, travel in this region couldn't have been more dangerous
--------------------------------------------------
Chapter 7:
 The Ottoman and Persian empires were at war, fighting over who would rule in Baghdad
--------------------------------------------------
Chapter 8:
 And meanwhile, local bandits took advantage of

In [27]:
valley_indices


array([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
         22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
         33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,
         44,   45,   46,   47,   48,   49,   50,   51,   52,   53,   54,
         55,   56,   57,   58,   59,   60,   61,   62,   63,   64,   65,
         66,   67,   68,   69,   70,   71,   72,   73,   74,   75,   76,
         77,   78,   79,   80,   81,   82,   83,   84,   85,   86,   87,
         88,   89,   90,   91,   92,   93,  157,  158,  159,  160,  161,
        162,  163,  164,  165,  166,  167,  168,  169,  170,  171,  172,
        173,  174,  175,  176,  177,  178,  179,  180,  181,  182,  183,
        184,  185,  186,  187,  188,  189,  190,  191,  192,  193,  194,
        223,  224,  225,  226,  227,  228,  229,  230,  231,  243,  244,
        245,  251,  252,  253,  254,  255,  256,  2

In [21]:
sentences


['In the year 1625, an Italian nobleman named Pietro de la Valet went on a tour of the Middle East',
 ' De la Valle was a prolific traveler',
 ' He journeyed around Asia, North Africa and even India',
 ' He married an Assyrian Christian princess in Damascus and now the two of them traveled together, journeying by horseback and camel, accompanied by local guides',
 " At this time, travel in this region couldn't have been more dangerous",
 ' The Ottoman and Persian empires were at war, fighting over who would rule in Baghdad',
 ' And meanwhile, local bandits took advantage of the chaos to prey on travelers',
 ' In those days, lions even roamed in these hills',
 ' Due to these various dangers, dela Valets guides were constantly on edge',
 ' It was June 18, 1625 when they spotted a distant group of tribesmen on the horizon',
 ' Their guides decided that they might be in danger and began to search for a place to hide',
 ' In the distance, they spotted the looming mass of a series of enormou