In [1]:
!pip install ruptures

Collecting ruptures
  Downloading ruptures-1.1.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: ruptures
Successfully installed ruptures-1.1.7


In [32]:
from pathlib import Path

import nltk
import numpy as np
import ruptures as rpt  # our package
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize
from ruptures.base import BaseCost
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LogNorm# Start writing code here...

In [24]:
180/100

1.8

In [25]:
nltk.download("stopwords")
STOPWORD_SET = set(
    stopwords.words("english")
)  # set of stopwords of the English language
PUNCTUATION_SET = set("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/poloniki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
def preprocess(list_of_sentences: list) -> list:
    """Preprocess each sentence (remove punctuation, stopwords, then stemming.)"""
    transformed = list()
    for sentence in list_of_sentences:
        ps = PorterStemmer()
        list_of_words = regexp_tokenize(text=sentence.lower(), pattern="\w+")
        list_of_words = [
            ps.stem(word) for word in list_of_words if word not in STOPWORD_SET
        ]
        transformed.append(" ".join(list_of_words))
    return transformed

In [27]:
def draw_square_on_ax(start, end, ax, linewidth=0.8):
    """Draw a square on the given ax object."""
    ax.vlines(
        x=[start - 0.5, end - 0.5],
        ymin=start - 0.5,
        ymax=end - 0.5,
        linewidth=linewidth,
    )
    ax.hlines(
        y=[start - 0.5, end - 0.5],
        xmin=start - 0.5,
        xmax=end - 0.5,
        linewidth=linewidth,
    )
    return ax

In [28]:
with open('../quint/data/sample_texts/joe_rogan_tr.txt') as f:
    doc = f.readlines()


In [29]:
doc = doc[0].replace("?", ".")

In [30]:
original_text = doc.split('.')


In [31]:
# transform text
transformed_text = preprocess(original_text)
# print original and transformed
ind = 3
print("Original sentence:")
print(f"\t{original_text[ind]}")
print()
print("Transformed:")
print(f"\t{transformed_text[ind]}")

Original sentence:
	 And things like that

Transformed:
	thing like


In [33]:
# Once the text is preprocessed, each sentence is transformed into a vector of word counts.
#vectorizer = CountVectorizer(analyzer="word")
vectorizer = TfidfVectorizer(min_df=5, max_df=95, ngram_range=(1, 3), stop_words="english")

vectorized_text = vectorizer.fit_transform(transformed_text)

msg = f"There are {len(vectorizer.get_feature_names())} different words in the corpus, e.g. {vectorizer.get_feature_names()[20:30]}."
print(msg)

There are 522 different words in the corpus, e.g. ['american', 'american peopl', 'anoth', 'answer', 'anybodi', 'anymor', 'anyth', 'app', 'appear', 'appli'].




In [34]:
class CosineCost(BaseCost):
    """Cost derived from the cosine similarity."""

    # The 2 following attributes must be specified for compatibility.
    model = "custom_cosine"
    min_size = 2

    def fit(self, signal):
        """Set the internal parameter."""
        self.signal = signal
        self.gram = cosine_similarity(signal, dense_output=False)
        return self

    def error(self, start, end) -> float:
        """Return the approximation cost on the segment [start:end].

        Args:
            start (int): start of the segment
            end (int): end of the segment
        Returns:
            segment cost
        Raises:
            NotEnoughPoints: when the segment is too short (less than `min_size` samples).
        """
        if end - start < self.min_size:
            raise NotEnoughPoints
        sub_gram = self.gram[start:end, start:end]
        val = sub_gram.diagonal().sum()
        val -= sub_gram.sum() / (end - start)
        return val

In [47]:
#n_bkps = 9  # there are 9 change points (10 text segments)

#algo = rpt.Dynp(custom_cost=CosineCost(), min_size=2, jump=1).fit(vectorized_text)
algo = rpt.Pelt(custom_cost=CosineCost(), min_size=3, jump=1).fit(vectorized_text)

#predicted_bkps = algo.predict(n_bkps=n_bkps)
predicted_bkps = algo.predict(pen=1.5)

#print(f"True change points are\t\t{TRUE_BKPS}.")
print(f"Detected change points are\t{predicted_bkps}.")

Detected change points are	[48, 51, 125, 130, 221, 224, 255, 258, 345, 348, 440, 447, 470, 474, 481, 522, 526, 619, 622, 688, 691, 706, 740, 746, 777, 780, 816, 819, 839, 870, 873, 904, 911, 917, 947, 952, 981, 990, 994, 1070, 1088, 1101, 1150, 1156, 1219, 1222, 1228, 1233, 1315, 1319, 1377, 1395, 1398, 1435, 1440, 1489, 1492, 1497, 1501, 1519, 1522, 1526, 1555, 1558, 1585, 1589, 1661, 1675, 1720, 1731, 1745, 1748, 1762, 1765, 1809, 1812, 1834, 1842, 1851, 1859, 1879, 1882, 1889].


In [36]:
print(f"Detected change points are\t{predicted_bkps}.")

Detected change points are	[740, 743, 1228, 1233, 1591, 1720, 1731, 1851, 1859, 1889].


In [37]:
true_segment_list = rpt.utils.pairwise([0] + TRUE_BKPS)
predicted_segment_list = rpt.utils.pairwise([0] + predicted_bkps)

for (n_paragraph, (true_segment, predicted_segment)) in enumerate(
    zip(true_segment_list, predicted_segment_list), start=1
):
    print(f"Paragraph n°{n_paragraph:02d}")
    start_true, end_true = true_segment
    start_pred, end_pred = predicted_segment

    start = min(start_true, start_pred)
    end = max(end_true, end_pred)
    msg = " ".join(
        f"{ind+1:02d}" if (start_true <= ind < end_true) else "  "
        for ind in range(start, end)
    )
    print(f"(true)\t{msg}")
    msg = " ".join(
        f"{ind+1:02d}" if (start_pred <= ind < end_pred) else "  "
        for ind in range(start, end)
    )
    print(f"(pred)\t{msg}")
    print()

NameError: name 'TRUE_BKPS' is not defined

In [None]:
df['Topics']

In [48]:
for (start, end) in rpt.utils.pairwise([0] + predicted_bkps):
    excerpt = original_text[start:end]
    for (n_line, sentence) in enumerate(excerpt, start=start + 1):
        sentence = sentence.strip("\n")
        print(f"{n_line:>2}: {sentence}")
    print()

 1: Okay
 2:  You're very professional
 3:  People are like, how do you live
 4:  And things like that
 5:  They're taking money from the russians
 6:  And of course the answer is no, but I do this for a living, like I speak
 7:  I don't have a YouTube channel where I'm joe rogan, but I give speeches at universities and things like that
 8:  I do a lot of interviews
 9:  We're recording now, right
10:  My own setup
11:  Is it possible that you could do a YouTube channel
12:  Would that work
13:  Yeah, if you introduce me so I get followers
14:  Yeah, we could do that, dude, I'm all in
15:  That could absolutely happen
16:  Do you want to do that
17:  Is that something you want to do
18:  No
19:  I mean, this is a big question
20:  So I came on because I had just written a book called permanent record, which is the story of my life, because that's what publishers make you do when you're writing your first book
21:  But it's more than that, because I didn't just want to talk about me
22: