In [2]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2025.9.1-cp311-cp311-win_amd64.whl.metadata (41 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2025.9.1-cp311-cp311-win_amd64.whl (276 kB)
Using cached click-8.2.1-py3-none-any.whl (102 kB)
Installing collected packages: regex, click, nltk
Successfully installed click-8.2.1 nltk-3.9.1 regex-2025.9.1



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from huggingface_hub import hf_hub_download

import nltk 
# Fetch the NLTK tokenizer resources into the local nltk_data directory.
# Presumably these are the models word_tokenize relies on — TODO confirm
# whether both 'punkt' and 'punkt_tab' are needed for the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

from collections import Counter
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\2019c\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\2019c\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
# Download the dataset of Medium articles from
# https://huggingface.co/datasets/fabiochiu/medium-articles
medium_articles_path = hf_hub_download(
    "fabiochiu/medium-articles",
    repo_type="dataset",
    filename="medium_articles.csv",
)

# There are 192,368 articles in total, but only the first 10,000 are kept to
# make computations faster. Passing `nrows` stops parsing after that many rows
# instead of loading the whole CSV and then discarding most of it.
N_ARTICLES = 10_000
df_articles = pd.read_csv(medium_articles_path, nrows=N_ARTICLES)

df_articles.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [8]:
# Lowercase each article, split it into tokens, and build one Counter per
# article that maps every token to its number of occurrences in that article.
texts_lowercase = df_articles["text"].str.lower()
texts_lowercase_tokenized = texts_lowercase.apply(word_tokenize)
token_counters = [Counter(tokens) for tokens in texts_lowercase_tokenized]

# Show the tokens that occur at least 10 times in the first article.
first_article_counts = token_counters[0]
frequent_tokens = {
    tok: cnt for tok, cnt in first_article_counts.items() if cnt >= 10
}
print(frequent_tokens)

{'and': 32, ',': 52, 'we': 15, 'to': 30, 'for': 13, '.': 39, '’': 23, 'be': 13, 'that': 14, 'of': 23, '“': 12, 'the': 31, 'a': 16, '”': 11, 'i': 25, 'can': 16, 'it': 17, 's': 10}


In [9]:
# Tokenize the query. The article texts are lowercased before tokenization,
# so the query is lowercased too; otherwise a capitalized query word
# (e.g. "NLP") could never match any counted token.
query = "data science nlp"
query_tokens = word_tokenize(query.lower())

In [10]:
# Compute a matching score for each text with respect to the query.
def get_scores(query_tokens, token_counters):
    """Score every text by how often the query tokens occur in it.

    The score of a text is the sum, over all query tokens, of the number of
    times that token occurs in the text. A token that appears several times
    in the query is counted once per appearance.

    Args:
        query_tokens: Iterable of query tokens.
        token_counters: Iterable of mappings from token to occurrence count,
            one per text (e.g. ``collections.Counter`` objects; plain dicts
            are accepted too).

    Returns:
        A list with one integer score per mapping in ``token_counters``, in
        the same order.
    """
    # Materialize once so that a one-shot iterable (e.g. a generator) can be
    # reused for every text.
    query_tokens = list(query_tokens)
    return [
        # .get() returns 0 for absent tokens on both Counter and plain dict,
        # whereas indexing a plain dict would raise KeyError.
        sum(token_counter.get(query_token, 0) for query_token in query_tokens)
        for token_counter in token_counters
    ]

# Score every article against the tokenized query.
scores = get_scores(query_tokens, token_counters)

In [11]:
# Retrieve the top_n articles with the highest scores and show them.

def show_best_results(df_articles, scores, top_n=10):
    """Print rank, score and title of the best-scoring articles.

    Args:
        df_articles: DataFrame with a "title" column; row positions must line
            up with the positions in ``scores``.
        scores: Sequence with one score per article.
        top_n: Maximum number of articles to print.
    """
    # Positions sorted from highest to lowest score.
    ranked_positions = np.argsort(scores)[::-1]
    for rank, article_pos in enumerate(ranked_positions[:top_n], start=1):
        article_title = df_articles.iloc[article_pos]["title"]
        print(f"{rank} [score = {scores[article_pos]}]: {article_title}")

# Print the highest-scoring articles (top_n is left at its default of 10).
show_best_results(df_articles, scores)

1 [score = 186]: The Top Online Data Science Courses for 2019
2 [score = 132]: How Much Do You Know About Your Data And Is Your Product Ready To Benefit From Data Science?
3 [score = 122]: Under the Hood of K-Nearest Neighbors (KNN) and Popular Model Validation Techniques
4 [score = 122]: Streaming Real-time data to AWS Elasticsearch using Kinesis Firehose
5 [score = 121]: Financial Times Data Platform: From zero to hero
6 [score = 118]: No data governance, no data intelligence!
7 [score = 107]: Data Science for Everyone: Getting To Know Your Data — Part 1
8 [score = 102]: Data Science, the Good, the Bad, and the… Future
9 [score = 102]: A Layman’s Guide to Data Science: How to Become a (Good) Data Scientist
10 [score = 98]: Data Science Minimum: 10 Essential Skills You Need to Know to Start Doing Data Science


In [12]:
# Try a different query. It is lowercased before tokenization because the
# article tokens were counted in lowercase; a capitalized query word would
# otherwise never match.
query = "how to learn data science"
query_tokens = word_tokenize(query.lower())
scores = get_scores(query_tokens, token_counters)
show_best_results(df_articles, scores)

1 [score = 589]: How to Make Your First $10,000 as a Freelance Writer
2 [score = 583]: Russ Roberts and Tyler on COVID-19 (Ep. 90 — BONUS)
3 [score = 526]: The Big Disruption
4 [score = 461]: Sam Altman on Loving Community, Hating Coworking, and the Hunt for Talent (Ep. 61 — Live)
5 [score = 394]: Paul Romer on a Culture of Science and Working Hard (Ep. 96)
6 [score = 392]: Nicholas Bloom on Management, Productivity, and Scientific Progress (Ep. 102)
7 [score = 349]: SXSW 2019 Ultimate Guide to the Panels, Popups and Parties
8 [score = 341]: Glen Weyl on Fighting COVID-19 and the Role of the Academic Expert (Ep. 94 — BONUS)
9 [score = 319]: The Top Online Data Science Courses for 2019
10 [score = 308]: A Deep Conceptual Guide to Mutual Information


Unfortunately, this time the results are not as good as before. Why?

That’s because the query contains tokens like “how” and “to”, which are very frequent in most of the articles in the dataset. As a consequence, the articles that contain the most occurrences of these tokens are ranked first, and tokens like “data” and “science” have less influence on the results.

In [13]:
# removing stopwords
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import string

# Stored as a set: the stopwords are only used for membership tests, which
# run once per token of the corpus, and a set lookup is O(1) whereas a list
# lookup scans the whole list.
english_stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\2019c\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


All ASCII punctuation characters can be found in the string `string.punctuation`.



In [14]:
# Inspect the punctuation characters defined by the standard library.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
# Lowercase and tokenize every article, then drop stopwords and punctuation.
texts_lowercase = df_articles["text"].str.lower()
texts_lowercase_tokenized = texts_lowercase.apply(word_tokenize)

# Sets give O(1) membership tests; the filter below runs once per token of
# the whole corpus.
stopword_set = set(english_stopwords)
punctuation_chars = set(string.punctuation)

# A token is treated as punctuation when every one of its characters is an
# ASCII punctuation character. Testing `token in string.punctuation` instead
# is a substring test, which lets multi-character punctuation tokens such as
# "...", "--", "``" or "''" slip through.
texts_lowercase_tokenized_no_sw = texts_lowercase_tokenized.apply(
    lambda token_list: [
        token
        for token in token_list
        if token not in stopword_set
        and not all(char in punctuation_chars for char in token)
    ]
)

In [16]:
# Rebuild the per-article token counts from the filtered token lists.
token_counters = [Counter(tokens) for tokens in texts_lowercase_tokenized_no_sw]

# Show the tokens that occur at least 6 times in the first article.
first_article_counts = token_counters[0]
print({tok: cnt for tok, cnt in first_article_counts.items() if cnt >= 6})

{'’': 23, '“': 12, 'us': 6, '”': 11, 'life': 6}


So, let’s try the "how to learn data science" query again and see the results.



In [17]:
# Tokenize the query and remove stopwords and punctuation.
query = "how to learn data science"
# Lowercase first: the article tokens were counted in lowercase, so a
# capitalized query word would otherwise never match.
query_tokens = word_tokenize(query.lower())
# A token counts as punctuation when all of its characters are punctuation
# characters; `token in string.punctuation` would be a substring test and
# would keep tokens such as "..." or "--".
query_tokens_no_sw = [
    token
    for token in query_tokens
    if token not in english_stopwords
    and not all(char in string.punctuation for char in token)
]
print(f"Tokenized query without stopwords: {query_tokens_no_sw}")
print()

Tokenized query without stopwords: ['learn', 'data', 'science']



In [18]:
# Show the best results. Score with the stopword-free query tokens computed
# in the previous step, not the unfiltered `query_tokens`.
scores = get_scores(query_tokens_no_sw, token_counters)
show_best_results(df_articles, scores)

1 [score = 200]: The Top Online Data Science Courses for 2019
2 [score = 132]: How Much Do You Know About Your Data And Is Your Product Ready To Benefit From Data Science?
3 [score = 124]: Under the Hood of K-Nearest Neighbors (KNN) and Popular Model Validation Techniques
4 [score = 123]: Streaming Real-time data to AWS Elasticsearch using Kinesis Firehose
5 [score = 121]: Financial Times Data Platform: From zero to hero
6 [score = 119]: No data governance, no data intelligence!
7 [score = 107]: A Layman’s Guide to Data Science: How to Become a (Good) Data Scientist
8 [score = 107]: Data Science for Everyone: Getting To Know Your Data — Part 1
9 [score = 104]: Data Science, the Good, the Bad, and the… Future
10 [score = 99]: Data Science Minimum: 10 Essential Skills You Need to Know to Start Doing Data Science


# Code Exercises

In [1]:
#reimplement the search engine logic using countvectorizer class from sklearn
from sklearn.feature_extraction.text import CountVectorizer


In [6]:
# Display the first rows of the articles DataFrame again as a reminder of its columns.
df_articles.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [7]:
# Build the document-term count matrix for the articles, using
# scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df_articles["text"])
# One row per article, one column per vocabulary term.
X.shape

(10000, 109729)

In [8]:
# Encode the query with the vocabulary fitted on the articles, so that the
# query vector has the same number of columns as X.
query = "how to learn data science"
query_vector = vectorizer.transform([query])
query_vector.shape

(1, 109729)

In [24]:
# Dot product of the query vector with every article row: for each article,
# the sum over shared terms of (count in query) * (count in article).
query_match_counts = query_vector.dot(X.T).toarray()[0]

# Row positions of the 10 best-scoring articles, highest score first.
ranked_positions = query_match_counts.argsort()[::-1]
indices = ranked_positions[:10]
indices

array([4209, 9507, 4756, 3946, 3792, 9310, 5916, 2081, 4151, 3707])

In [25]:
# `indices` holds row positions produced by argsort, so select by position
# with .iloc. Label-based .loc only gives the same rows while the DataFrame
# keeps its default 0..n-1 index.
df_articles.iloc[indices]

Unnamed: 0,title,text,url,authors,timestamp,tags
4209,The Top Online Data Science Courses for 2019,After over 80+ hours of watching course videos...,https://medium.com/free-code-camp/top-7-online...,[],2019-05-02 20:18:44.339000+00:00,"['Artificial Intelligence', 'Machine Learning'..."
9507,How Much Do You Know About Your Data And Is Yo...,How Much Do You Know About Your Data And Is Yo...,https://medium.com/datadriveninvestor/four-hur...,['Wilson Wong'],2020-10-19 05:01:43.511000+00:00,"['Big Data', 'Innovation Management', 'Return ..."
4756,Under the Hood of K-Nearest Neighbors (KNN) an...,Under the Hood of K-Nearest Neighbors (KNN) an...,https://medium.com/swlh/under-the-hood-of-k-ne...,['Boulevard Consulting'],2020-09-03 18:51:10.446000+00:00,"['Iris', 'Python', 'K Nearest Neighbors', 'Cro..."
3946,Streaming Real-time data to AWS Elasticsearch ...,Explore how we can deliver real-time data usin...,https://medium.com/swlh/streaming-real-time-da...,['Janitha Tennakoon'],2020-06-03 10:16:44.374000+00:00,"['Big Data', 'Kinesis', 'AWS', 'Elasticsearch'..."
3792,Financial Times Data Platform: From zero to hero,Financial Times Data Platform: From zero to he...,https://medium.com/ft-product-technology/finan...,['Mihail Petkov'],2020-12-02 09:59:40.123000+00:00,"['Financial Times', 'Analytics', 'Engineering'..."
9310,"No data governance, no data intelligence!",Photo by Amanda Jones on Unsplash\n\nAny big t...,https://towardsdatascience.com/no-data-governa...,['Hassan Lâasri'],2019-11-12 20:41:37.264000+00:00,"['Big Data', 'Data Analytics', 'Data Strategy'..."
5916,"Data Science, the Good, the Bad, and the… Future",How often do you think you’re touched by data ...,https://medium.com/kitepython/data-science-the...,['Kirit Thadaka'],2019-08-07 17:15:53.980000+00:00,"['Privacy', 'Responsible Data Science', 'Pytho..."
2081,A Layman’s Guide to Data Science: How to Becom...,How simple is Data Science?\n\nSometimes when ...,https://medium.com/sciforce/a-laymans-guide-to...,[],2020-01-06 15:43:36.312000+00:00,"['Programming', 'Machine Learning', 'Data Scie..."
4151,Data Science for Everyone: Getting To Know You...,Data: Formulating the Concepts\n\nDefinitions\...,https://medium.com/towards-artificial-intellig...,['Sumudu Tennakoon'],2020-12-24 01:03:36.911000+00:00,"['Data Science', 'Machine Learning', 'Data Sci..."
3707,Data science… without any data?!,Data science… without any data?!\n\nWhy it’s i...,https://towardsdatascience.com/data-science-wi...,['Cassie Kozyrkov'],2020-11-13 14:56:17.278000+00:00,"['Data Science', 'Technology', 'Data Engineeri..."


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build the bag-of-words document-term matrix for the articles
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df_articles["text"])

# Encode the query with the vocabulary fitted on the articles
query = "how to learn data science"
query_vector = vectorizer.transform([query])

# Cosine similarity between the query and every article, as a 1-D array
# with one value per article
similarities = cosine_similarity(query_vector, X).ravel()

# Row positions of the 10 most similar articles, most similar first
ranked_positions = similarities.argsort()[::-1]
top_indices = ranked_positions[:10]

# Show the matching rows
top_articles = df_articles.iloc[top_indices]
top_articles


Unnamed: 0,title,text,url,authors,timestamp,tags
5055,Data Science Scholarships-Full-list Compilations.,Data Science has been described as one of the ...,https://medium.com/total-data-science/data-sci...,[],2020-11-14 01:44:52.109000+00:00,"['Machine Learning', 'Data Science', 'Artifici..."
5223,5 Steps to Become a Data Scientist,Data Scientist image. Source: www.quora.com\n\...,https://medium.com/towards-artificial-intellig...,['Benjamin Obi Tayo Ph.D.'],2020-06-11 17:21:11.092000+00:00,"['Artificial Intelligence', 'Data Science', 'F..."
5916,"Data Science, the Good, the Bad, and the… Future",How often do you think you’re touched by data ...,https://medium.com/kitepython/data-science-the...,['Kirit Thadaka'],2019-08-07 17:15:53.980000+00:00,"['Privacy', 'Responsible Data Science', 'Pytho..."
3707,Data science… without any data?!,Data science… without any data?!\n\nWhy it’s i...,https://towardsdatascience.com/data-science-wi...,['Cassie Kozyrkov'],2020-11-13 14:56:17.278000+00:00,"['Data Science', 'Technology', 'Data Engineeri..."
4209,The Top Online Data Science Courses for 2019,After over 80+ hours of watching course videos...,https://medium.com/free-code-camp/top-7-online...,[],2019-05-02 20:18:44.339000+00:00,"['Artificial Intelligence', 'Machine Learning'..."
6723,Becoming a Data scientist: which path to take?,Becoming a Data scientist: which path to take?...,https://towardsdatascience.com/becoming-a-data...,['Chan Naseeb'],2020-06-08 13:58:10.917000+00:00,"['Deep Learning', 'Artificial Intelligence', '..."
3609,Roadmap to Becoming a Successful Data Scientist,1) Python Basics\n\nIn order to become a data ...,https://medium.com/dataseries/roadmap-to-becom...,['Mustufa Ansari'],2020-08-28 09:10:51.156000+00:00,"['Machine Learning', 'Artificial Intelligence'..."
2081,A Layman’s Guide to Data Science: How to Becom...,How simple is Data Science?\n\nSometimes when ...,https://medium.com/sciforce/a-laymans-guide-to...,[],2020-01-06 15:43:36.312000+00:00,"['Programming', 'Machine Learning', 'Data Scie..."
5241,The Difference Between Theory and Theorem and ...,The Difference Between Theory and Theorem and ...,https://everydayjunglist.medium.com/the-differ...,['Daniel Demarco'],2018-11-01 02:27:31.363000+00:00,"['Philosophy Of Science', 'Philosophy', 'Data ..."
6950,Freelance your way into Data Science now,Freelance your way into Data Science now\n\nHo...,https://medium.com/data-science-rush/freelance...,['Przemek Chojecki'],2019-12-09 18:38:31.365000+00:00,"['Work', 'Python', 'Data Science', 'Freelancin..."
