## Loading the dataset

In [None]:
! pip install -U datasets

In [1]:
from datasets import load_dataset

# Load the "20231101.en" configuration of the wikimedia/wikipedia dataset.
dataset = load_dataset(path="wikimedia/wikipedia", name="20231101.en")

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [2]:
# Show the available splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6407814
    })
})

In [3]:
import random

# Fixed seed and sample size so that the same articles are drawn on every run.
SEED = 42
NUM_ARTICLES = 10000

train_split = dataset["train"]
length = len(train_split)

# A dedicated generator keeps the draw reproducible without touching the
# global `random` state. The sample size is capped at the split size because
# `sample` raises ValueError when asked for more items than are available.
rng = random.Random(SEED)
random_indices = rng.sample(range(length), min(NUM_ARTICLES, length))

# Fetch the sampled rows; the text column is extracted from them afterwards.
articles = [train_split[idx] for idx in random_indices]

In [4]:
# Build the list of article texts from the sampled indices rather than from
# the previous contents of `articles`. Rebinding `articles` from itself made
# this cell fail on a second run (its items were already strings); this form
# can be re-run at any point and always yields the raw texts again.
train_split = dataset["train"]
articles = [train_split[idx]["text"] for idx in random_indices]

## Preprocessing

In [None]:
# Preprocessing steps:
# - remove stop words
# - remove punctuation
# - convert the articles to lower case

In [21]:
import nltk
# Download the NLTK "stopwords" package that the preprocessing step relies on.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
# English stop words; used below to filter words out of the articles.
stop = stopwords.words("english")

In [6]:
import string

# The 32 ASCII punctuation characters: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# (taken from the standard library so no hand-written escapes are needed).
PUNCTUATION = string.punctuation
# Translation table that deletes every punctuation character in a single pass.
_punctuation_table = str.maketrans("", "", PUNCTUATION)
# Set membership is constant-time; scanning the `stop` list for every word is not.
stop_words = set(stop)


def clean_article(text):
    """Return `text` without punctuation, lower-cased and without stop words.

    Punctuation is removed first; the text is then lower-cased, split on
    whitespace, filtered against `stop_words` and re-joined with single spaces.
    """
    text = text.translate(_punctuation_table)
    return ' '.join(word for word in text.lower().split() if word not in stop_words)


# NOTE(review): apostrophes are stripped before the stop-word check, so a
# contraction such as "don't" is compared as "dont" and is only dropped if that
# exact form is in `stop` -- confirm this is acceptable.
for i, article in enumerate(articles):
    # Overwrite each entry in place with its cleaned version.
    articles[i] = clean_article(article)
    

## Vectorization

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: skip terms that occur in more than half of the documents or
# in fewer than five of them, and apply the built-in English stop-word list.
vectorizer = TfidfVectorizer(
    stop_words="english",
    min_df=5,
    max_df=0.5,
)

In [8]:
# Fit the vectorizer on the cleaned articles and build the document-term
# matrix; the displayed shape is (number of articles, vocabulary size).
articles_tfidf = vectorizer.fit_transform(articles)
articles_tfidf.shape

(10000, 31101)

## Extracting keywords

In [38]:
# Find the feature names
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['00', '000', '001', ..., 'świętokrzyskie', 'μm', 'на'],
      dtype=object)

In [41]:
# Spot check: which term sits at column 6657?
# NOTE(review): the index-to-term mapping comes from the fitted vocabulary, so
# this index is only meaningful for the particular sample of articles drawn.
feature_names[6657]

'city'

In [44]:
# TF-IDF weight of the term "city" in the first article. The column is looked
# up by term rather than hard-coded, because column numbers change whenever
# the vocabulary is fitted on a different sample of articles.
city_column = vectorizer.vocabulary_["city"]
articles_tfidf[0, city_column]

0.052133096235709296

In [52]:
from tqdm import tqdm

# Maximum number of keywords kept per article.
NUM_KEYWORDS = 50
keywords = []
for doc_index in tqdm(range(len(articles))):
    # Take the document's row once and read its stored (column, score) pairs
    # directly. Fetching every score with `articles_tfidf[doc_index, x]` does a
    # separate sparse-matrix lookup per term, which is slow.
    row = articles_tfidf[doc_index].tocoo()
    tfidf_scores = [(col, score) for col, score in zip(row.col, row.data) if score != 0]

    # Highest score first; the sort is stable, so ties keep their stored order.
    sorted_tfidf_scores = sorted(tfidf_scores, key=lambda pair: pair[1], reverse=True)

    # Space-separated string of the NUM_KEYWORDS best-scoring terms.
    top_keywords = " ".join(feature_names[col] for col, _ in sorted_tfidf_scores[:NUM_KEYWORDS])

    keywords.append(top_keywords)
    

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:42<00:00, 232.85it/s]


In [54]:
import pandas as pd

# Pair every cleaned article with its keyword string, one row per article.
df = pd.DataFrame(data={"article": articles, "keywords": keywords})

In [55]:
# Preview the first rows of the article/keyword table.
df.head()

Unnamed: 0,article,keywords
0,estadio municipal de balboa multipurpose stadi...,panama balboa stadium multipurpose venues foot...
1,dawn debut studio album american singersongwri...,albums dawn signifies rca album records produc...
2,albert burnett born 10 october 1955 scottish f...,juniors scottish hampden dumbarton albert falk...
3,ckfffm first nations community radio station o...,radio quebec 1041 mhz antenna fm station 2020 ...
4,noblemen indian english language drama film di...,shay noblemen film ali haji murali films india...
