In [1]:
!pip install ruptures

Collecting ruptures
  Downloading ruptures-1.1.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: ruptures
Successfully installed ruptures-1.1.7


In [2]:
from pathlib import Path

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import ruptures as rpt  # our package
from matplotlib.colors import LogNorm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize
from ruptures.base import BaseCost
from ruptures.exceptions import NotEnoughPoints
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download("stopwords")

# English stopwords, kept in a set for fast membership tests.
STOPWORD_SET = {*stopwords.words("english")}

# Individual ASCII punctuation characters.
PUNCTUATION_SET = {*"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def preprocess(list_of_sentences: list) -> list:
    """Preprocess each sentence (lowercase, tokenize, remove stopwords, then stem).

    Each sentence is lowercased and split into runs of word characters (so
    punctuation never ends up in a token), tokens found in `STOPWORD_SET` are
    dropped, and the remaining tokens are stemmed with a Porter stemmer.

    Args:
        list_of_sentences (list): sentences (strings) to preprocess.

    Returns:
        list: one string per input sentence, made of the space-joined stems.
        A sentence containing only stopwords yields an empty string.
    """
    # The stemmer does not depend on the sentence: build it once, not per iteration.
    ps = PorterStemmer()
    transformed = []
    for sentence in list_of_sentences:
        # Raw string: "\w" inside a plain literal is an invalid escape sequence.
        list_of_words = regexp_tokenize(text=sentence.lower(), pattern=r"\w+")
        list_of_words = [
            ps.stem(word) for word in list_of_words if word not in STOPWORD_SET
        ]
        transformed.append(" ".join(list_of_words))
    return transformed

In [5]:
def draw_square_on_ax(start, end, ax, linewidth=0.8):
    """Outline a square on `ax` and return `ax`.

    Two vertical and two horizontal lines are drawn at `start - 0.5` and
    `end - 0.5`, each of them spanning from `start - 0.5` to `end - 0.5`.

    Args:
        start: first index of the square.
        end: last index of the square.
        ax: object exposing `vlines` and `hlines` (e.g. a matplotlib Axes).
        linewidth: width of the four lines.

    Returns:
        The `ax` object that was passed in.
    """
    # Both corners are shifted by half a unit on each axis.
    low = start - 0.5
    high = end - 0.5

    ax.vlines(x=[low, high], ymin=low, ymax=high, linewidth=linewidth)
    ax.hlines(y=[low, high], xmin=low, xmax=high, linewidth=linewidth)
    return ax

In [4]:
# Load the transcript and split it into sentences.
with open('../quint/data/sample_texts/joe_rogan_tr.txt') as f:
    doc = f.readlines()
# Only the first line of the file is used (the transcript is presumably stored
# on a single line -- TODO confirm). Question marks are turned into periods so
# that both kinds of sentence end are split the same way.
doc = doc[0].replace("?", ".")
# `str.split` returns a list, so it can only be applied once: split directly on
# ". " (period + space) instead of splitting twice.
sentences = doc.split(". ")
# Trim surrounding whitespace and drop empty fragments (e.g. produced by "?. "),
# which would otherwise become all-zero count vectors.
original_text = [sentence.strip() for sentence in sentences if sentence.strip()]

FileNotFoundError: [Errno 2] No such file or directory: "../quint/data/sample_texts/joe_rogan_tr.txt'"

In [7]:
# Run the preprocessing on every sentence of the transcript.
transformed_text = preprocess(original_text)

# Show one sentence before and after preprocessing.
ind = 3  # index of the sentence to display
print(
    "Original sentence:",
    f"\t{original_text[ind]}",
    "",
    "Transformed:",
    f"\t{transformed_text[ind]}",
    sep="\n",
)

Original sentence:
	They're taking money from the russians

Transformed:
	take money russian


In [8]:
# Once the text is preprocessed, each sentence is transformed into a vector of word counts.
vectorizer = CountVectorizer(analyzer="word")
vectorized_text = vectorizer.fit_transform(transformed_text)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2:
# use `get_feature_names_out` when available and only fall back on old releases.
# The vocabulary is fetched once instead of once per use.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Convert to plain `str` so the sample prints as a regular list whichever API was used.
sample_words = [str(word) for word in feature_names[20:30]]
msg = f"There are {len(feature_names)} different words in the corpus, e.g. {sample_words}."
print(msg)

There are 2203 different words in the corpus, e.g. ['2012', '2013', '2016', '2019', '20th', '230', '24', '260', '2776', '30'].


In [9]:
class CosineCost(BaseCost):
    """Cost derived from the cosine similarity.

    `fit` stores the pairwise cosine similarities between all samples of the
    signal; `error` then computes the cost of a segment from the corresponding
    square sub-block of that similarity matrix.
    """

    # The 2 following attributes must be specified for compatibility.
    model = "custom_cosine"
    min_size = 2

    def fit(self, signal):
        """Set the internal parameter.

        Args:
            signal: samples to segment, one row per sample (here, the matrix of
                word counts, one row per sentence).

        Returns:
            self
        """
        self.signal = signal
        # Pairwise cosine similarities between all rows of the signal.
        self.gram = cosine_similarity(signal, dense_output=False)
        return self

    def error(self, start, end) -> float:
        """Return the approximation cost on the segment [start:end].

        The cost is the sum of the diagonal of the similarity sub-matrix minus
        the sum of all its entries divided by the segment length.

        Args:
            start (int): start of the segment
            end (int): end of the segment
        Returns:
            segment cost
        Raises:
            NotEnoughPoints: when the segment is too short (less than `min_size` samples).
        """
        n_samples = end - start
        if n_samples < self.min_size:
            # `NotEnoughPoints` is provided by `ruptures.exceptions`.
            raise NotEnoughPoints(
                f"Segment [{start}:{end}] has fewer than {self.min_size} samples."
            )
        sub_gram = self.gram[start:end, start:end]
        val = sub_gram.diagonal().sum()
        val -= sub_gram.sum() / n_samples
        return val

In [10]:
n_bkps = 9  # number of change points to detect (i.e. 10 text segments)

# Dynamic programming search with the custom cosine cost; every index is an
# admissible change point (jump=1) and segments hold at least 2 sentences.
algo = rpt.Dynp(custom_cost=CosineCost(), min_size=2, jump=1).fit(vectorized_text)
predicted_bkps = algo.predict(n_bkps=n_bkps)

# No ground-truth annotation (`TRUE_BKPS`) is defined in the visible cells of this
# notebook: only report it when one has been provided, instead of failing with a
# NameError after the expensive computation above.
if "TRUE_BKPS" in globals():
    print(f"True change points are\t\t{TRUE_BKPS}.")
print(f"Detected change points are\t{predicted_bkps}.")

KeyboardInterrupt: 

In [18]:
# Display the detected change points again (same message as the last line of the detection cell).
print(f"Detected change points are\t{predicted_bkps}.")

Detected change points are	[504, 507, 579, 751, 754, 1370, 1463, 1474, 1601, 1611].


In [None]:
# Side-by-side display of the true and predicted segments, one paragraph at a time.
# `TRUE_BKPS` (ground-truth change points) is not defined in the visible cells of
# this notebook, so the comparison is skipped unless an annotation is available.
if "TRUE_BKPS" not in globals():
    print("TRUE_BKPS is not defined: skipping the true/predicted comparison.")
else:
    # Turn the lists of breakpoints into (start, end) pairs.
    true_segment_list = rpt.utils.pairwise([0] + TRUE_BKPS)
    predicted_segment_list = rpt.utils.pairwise([0] + predicted_bkps)

    for (n_paragraph, (true_segment, predicted_segment)) in enumerate(
        zip(true_segment_list, predicted_segment_list), start=1
    ):
        print(f"Paragraph n°{n_paragraph:02d}")
        start_true, end_true = true_segment
        start_pred, end_pred = predicted_segment

        # Common index range covering both the true and the predicted segment.
        start = min(start_true, start_pred)
        end = max(end_true, end_pred)
        # 1-based sentence numbers inside the segment, blanks outside of it,
        # so that both rows line up column by column.
        msg = " ".join(
            f"{ind+1:02d}" if (start_true <= ind < end_true) else "  "
            for ind in range(start, end)
        )
        print(f"(true)\t{msg}")
        msg = " ".join(
            f"{ind+1:02d}" if (start_pred <= ind < end_pred) else "  "
            for ind in range(start, end)
        )
        print(f"(pred)\t{msg}")
        print()

In [20]:
# Print the sentences of every predicted segment, numbered from 1, with a blank
# line after each segment.
segment_bounds = rpt.utils.pairwise([0] + predicted_bkps)
for start, end in segment_bounds:
    numbered_sentences = enumerate(original_text[start:end], start=start + 1)
    for n_line, raw_sentence in numbered_sentences:
        # Remove leading/trailing newline characters before display.
        sentence = raw_sentence.strip("\n")
        print(f"{n_line:>2}: {sentence}")
    print()

620: What we were pitching was a private cloud system
621: Everybody knows about cloud computing now
622: It's like why your gmail account is available wherever you go
623: It's why facebook has this massive system of records for everyone everywhere
624: The government wanted to have these kind of capabilities
625: To dell ended up getting beat out by amazon
626: Some people aren't familiar with this
627: Many of them are, but amazon runs a secret cloud system for the government
628: I forget what they've rebranded it now, but there's this massive connection between industry and government in the classified space that just goes deeper and deeper and deeper and deeper
629: But at this point, I had misgivings because of what I'd seen in japan about government
630: But I was just trying to get by
631: I was trying to ignore the conflicts
632: I was trying to ignore the inconsistencies
633: And I think this is a state that a lot of people in these large institutions, not just in our countr

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=42b72ec6-04a6-4083-9286-1ab690f9f57d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>