In [46]:
from abc import ABC, abstractmethod
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='train')

In [73]:
!wget https://raw.githubusercontent.com/igorbrigadir/stopwords/master/en/alir3z4.txt -O 'stopwords.txt'
!wget https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt -L -O 'popular-words.txt'

--2022-05-15 14:27:25--  https://raw.githubusercontent.com/igorbrigadir/stopwords/master/en/alir3z4.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7678 (7.5K) [text/plain]
Saving to: ‘stopwords.txt’


2022-05-15 14:27:25 (60.5 MB/s) - ‘stopwords.txt’ saved [7678/7678]

--2022-05-15 14:27:25--  https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75153 (73K) [text/plain]
Saving to: ‘popular-words.txt’


2022-05-15 14:27:25 

In [74]:
!ls

popular-words.txt  sample_data	stopwords.txt


In [75]:
from typing import List


# https://github.com/igorbrigadir/stopwords/blob/master/en/alir3z4.txt
with open('stopwords.txt') as stop_words_file:
    STOP_WORDS_ALIR3Z4 = stop_words_file.read().split('\n')

# https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-no-swears.txt
with open('popular-words.txt') as popular_words_file:
    POPULAR_WORDS = popular_words_file.read().split('\n')

POPULAR_TAGS = list(set(POPULAR_WORDS) - set(STOP_WORDS_ALIR3Z4))


def extract_words(text: str, alphabet: str, min_length: int = 3, stop_words: List[str] = None):
    """Split text into word."""
    stop_words = stop_words or []

    # filter symbols
    text = ''.join(
        (c if c in alphabet else ' ')
        for c in text.lower()
    )

    # split to words
    words = text.split()

    # filter words
    return [
        word
        for word in words
        if word not in stop_words and len(word) >= min_length
    ]


class BaseTagger(ABC):
    @abstractmethod
    def get_tags(self, texts: List[str]) -> List[List[str]]:
        """['Text1', 'Text2', ...] -> [['text1_tag1', 'text1_tag2', ...], ...]"""
        ...


class BaseSeparateTagger(BaseTagger, ABC):
    @abstractmethod
    def get_tags_from_text(self, text: str) -> List[str]:
        """'Text' -> ['text_tag1', 'text_tag2', ...]"""
        ...

    def get_tags(self, texts: List[str]) -> List[List[str]]:
        result = []
        for text in texts:
            tags = self.get_tags_from_text(text)
            result.append(tags)
        return result


class MostFrequentWordsTagger(BaseSeparateTagger):
    default_stop_words = STOP_WORDS_ALIR3Z4
    words_alphabet = 'abcdefghijklmnopqrstuvwxyz-\''

    def __init__(self, stop_words: list = None, max_tags_per_text: int = 5):
        self.stop_words = stop_words or self.default_stop_words
        self.max_tags_per_text = max_tags_per_text

    def get_tags_from_text(self, text: str) -> List[str]:
        words = extract_words(text, alphabet=self.words_alphabet, min_length=3, stop_words=self.stop_words)
        words_counter = Counter(words)

        # TODO improve heuristics
        tags = []
        result = words_counter.most_common()
        if len(result) == 0:
            return []

        word, max_count = result[0]
        i = 0
        while result[i][1] == max_count:
            tags.append(result[i][0])
            i += 1

        return tags[:self.max_tags_per_text]

In [76]:
corpus = [
"We are arriving in Rome via cruise ship on June 3, 2023. We are taking the transfer from the cruise ship that leaves at 7am. Our biggest priorities are the Vatican Museum/St Peter's Basilica and the Colloseum. My plan was to book a tour(or skip the line) at the Vatican starting at 9, thinking to stay 3 hours. Then taxi to the Colloseum, again tour or skip the line, tour at 1pm, staying 1.5 hours, possibly visiting the Forum (maybe 30 minutes). Is there any hope of seeing Trevi Fountain and the Pantheon as well? My plan would be to take a taxi from Colloseum to Trevi fountain, and then taxi to our transfer point near the Vatican where we have to be at 5pm, so that we maximize the time we have. Worst case scenario, we can see our 2 priority stops and nix the stop at Trevi fountain if we are running late. But wondering if this plan sounds feasible? And is there any small hope that we might be able to squish in a quick visit to Trevi fountain? Thanks for all and any advice! Jacqueline",
"Crocodile, (order Crocodylia, or Crocodilia), any of 23 species of generally large, ponderous, amphibious animals of lizard-like appearance and carnivorous habit belonging to the reptile order Crocodylia. Crocodiles have powerful jaws with many conical teeth and short legs with clawed webbed toes. They share a unique body form that allows the eyes, ears, and nostrils to be above the water surface while most of the animal is hidden below. The tail is long and massive, and the skin is thick and plated. Crocodiles are a living link with the dinosaur-like reptiles of prehistoric times and are the nearest living relatives of the birds. A large variety of crocodilian fossils have been discovered that date back 200 million years to the Late Triassic Epoch. Fossil evidence also suggests that three major radiations occurred. Only one of the four suborders of crocodiles has survived to modern times. The order Crocodylia includes the “true crocodiles,” alligators, caimans, and gavials.",
"When Netflix last month revealed that it had lost customers for the first time in a decade, you might have expected competing streaming services to be jubilant. In the two weeks after cancelling their subscription to Netflix, 87 per cent of subscribers had not signed up to a rival service, according to the analysis, which is based on around 3mn US internet users. Richard Broughton, director of research at Ampere, said that although there was an increase in churn rates at the start of the year, “there is no strong evidence to suggest that customers are being pulled away due to interest in other [streaming video] services”. The data suggest that a combination of higher inflation and a weakening stock market prompted consumers to tighten their budgets. Many of those that left the streaming service were aged between 18 and 24, or in households with an annual income of less than $15,000, according to the analysis. About 49 per cent of the poorest households surveyed said they had a Netflix subscription in the first quarter, down from about 56.2 per cent in the previous year.",
"Python language is incredibly easy to use and learn for new beginners and newcomers. The python language is one of the most accessible programming languages available because it has simplified syntax and not complicated, which gives more emphasis on natural language. Due to its ease of learning and usage, python codes can be easily written and executed much faster than other programming languages. When Guido van Rossum was creating python in the 1980s, he made sure to design it to be a general-purpose language. One of the main reasons for the popularity of python would be its simplicity in syntax so that it could be easily read and understood even by amateur developers also. One can also quickly experiment by changing the code base of python because it is an interpreted language which makes it even more popular among all kinds of developers.",
]

In [148]:
import numpy as np
import nltk
import string
nltk.download('stopwords')
nltk.download('punkt')


class MostFrequentWordsTagger_improved:
    default_stop_words = set(nltk.corpus.stopwords.words('english') + list(string.punctuation))

    def __init__(self, stop_words: list = None, max_tags_per_text: int = 5):
        self.stop_words = stop_words or self.default_stop_words
        self.max_tags_per_text = max_tags_per_text
        self.vectorizer = TfidfVectorizer()
        self.corpus_vectorized = None

    def get_tags(self, texts: List[str], k: int = 5,
                 thr: float = 0.8, min_tag_len = 4) -> List[List[str]]:
        new_corpus = [' '.join([w for w in nltk.tokenize.word_tokenize(text.lower())
                                if w not in self.stop_words]) for text in texts]
        self.corpus_vectorized = self.vectorizer.fit_transform(new_corpus)
        all_words = self.vectorizer.get_feature_names_out()
        result = []
        for i in range(len(texts)):
            tfidf_vector = self.corpus_vectorized[i].toarray()[0]
            best_k_ixes = np.argpartition(tfidf_vector, -k)[-k:]
            scores = tfidf_vector[best_k_ixes]
            best_score = max(scores)
            mask = scores / best_score > thr
            best_ixes = best_k_ixes[mask]

            tags = [w for w in all_words[best_ixes] if len(w) >= min_tag_len]
            result.append(tags)
        return result

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [149]:
print(MostFrequentWordsTagger().get_tags(corpus))

[['trevi', 'fountain'], ['crocodiles'], ['netflix', 'streaming', 'cent'], ['python']]


In [150]:
print(MostFrequentWordsTagger_improved().get_tags(corpus))

[['fountain', 'trevi'], ['crocodiles'], ['streaming', 'cent', 'netflix'], ['language', 'python']]


We see that with improved tagger we get better results for 4-d document. Also we now don't use stopwords from stopwords.txt but import them from nltk. Also our tagger is more flexible and controllable.