In [7]:
!pip3 install -r requirements.txt

Collecting nltk==3.9.1 (from -r requirements.txt (line 2))
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting matplotlib==3.10.6 (from -r requirements.txt (line 3))
  Downloading matplotlib-3.10.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting click (from nltk==3.9.1->-r requirements.txt (line 2))
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting joblib (from nltk==3.9.1->-r requirements.txt (line 2))
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk==3.9.1->-r requirements.txt (line 2))
  Downloading regex-2025.9.18-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting contourpy>=1.0.1 (from matplotlib==3.10.6->-r requirements.txt (line 3))
  Downloading contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib==3.10.6->-r r

In [19]:
import os
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from collections import Counter
import matplotlib.pyplot as plt
import random

# Dataset load (Hugging Face)

In [5]:
# Folder (under the current working directory) that receives the dataset files
local_path = os.path.join(os.getcwd(), "local_wikipedia_data")

# Load the 20231101 English Wikipedia config, using local_path as the cache dir
ds = load_dataset("wikimedia/wikipedia", "20231101.en", cache_dir=local_path)

print(f"Dataset successfully downloaded/loaded to: {local_path}")


Downloading data: 100%|██████████| 41/41 [17:32<00:00, 25.67s/files]
Generating train split: 100%|██████████| 6407814/6407814 [04:31<00:00, 23604.73 examples/s]


Dataset successfully downloaded/loaded to: /home/san22chit/Documents/IIITH/Sem3/IRE/Assignments/IndexingAndRetrieval/local_wikipedia_data


# Part 1

## Preprocessing (Hugging Face Dataset)

In [16]:
# Build-once helpers shared by preprocess():
#   punct_table - translation table that deletes every ASCII punctuation character
#   stemmer     - Porter stemmer instance reused for every token
punct_table = {ord(ch): None for ch in string.punctuation}
stemmer = PorterStemmer()

# NLTK data needed below: the tokenizer tables and the stopword lists
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stopwords kept in a set for constant-time membership checks
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Turn raw text into a list of stemmed, lowercase, alphabetic tokens.

    Steps, in order: lowercase the text, delete ASCII punctuation via
    ``punct_table``, tokenize with ``word_tokenize``, keep only purely
    alphabetic tokens that are not in ``stop_words``, and stem each
    surviving token with ``stemmer``.

    Args:
        text: The document text to process.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    # NOTE(review): punctuation is removed before the stopword lookup, so a
    # contraction such as "don't" is looked up as "dont" - confirm that this
    # is the intended filtering behaviour.
    cleaned = text.lower().translate(punct_table)

    stems = []
    for token in word_tokenize(cleaned):
        if token.isalpha() and token not in stop_words:
            stems.append(stemmer.stem(token))
    return stems


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/san22chit/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/san22chit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def get_word_counts(split, preprocess_func=None, max_docs=10000):
    """Count token occurrences over the leading documents of a dataset split.

    Documents are read in order from the notebook-level ``ds`` dataset and
    the ``'text'`` field of each one is tokenized.

    Args:
        split: Key of the split to read from ``ds`` (e.g. ``'train'``).
        preprocess_func: Optional callable ``text -> iterable of tokens``.
            When omitted (or falsy), ``word_tokenize`` is applied to the raw text.
        max_docs: Maximum number of documents to read. ``None`` reads the
            whole split; zero or a negative value reads nothing.

    Returns:
        collections.Counter: Mapping of token -> number of occurrences.
    """
    word_count = Counter()

    # Check the limit *before* touching the dataset: the earlier loop only
    # tested it after consuming a document, so max_docs=0 still counted one.
    if max_docs is not None and max_docs <= 0:
        return word_count

    # Choose the tokenizer once instead of re-testing it for every document.
    tokenize = preprocess_func if preprocess_func else word_tokenize

    for seen, item in enumerate(ds[split], start=1):
        word_count.update(tokenize(item['text']))
        if max_docs is not None and seen >= max_docs:
            break
    return word_count

def plot_word_freq(counter, title, filename, top_n=30):
    """Save a bar chart of the most frequent words to an image file.

    Args:
        counter: ``collections.Counter`` mapping word -> count.
        title: Title drawn above the chart.
        filename: Output path handed to ``savefig``.
        top_n: How many of the most frequent words to plot.

    Raises:
        ValueError: If there is nothing to plot, i.e. ``counter`` is empty or
            ``top_n`` selects no entries.
    """
    most_common = counter.most_common(top_n)
    if not most_common:
        # Fail with an explicit message instead of the opaque tuple-unpacking
        # error that zip(*[]) would trigger below.
        raise ValueError(
            f"nothing to plot for {title!r}: counter is empty or top_n={top_n!r} selects no words"
        )
    words, counts = zip(*most_common)

    fig, ax = plt.subplots(figsize=(14, 7))
    try:
        ax.bar(words, counts)
        ax.set_title(title)
        ax.set_xlabel("Word")
        ax.set_ylabel("Count")
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()
        fig.savefig(filename)
    finally:
        # Always release the figure, even when saving fails, so repeated
        # calls do not accumulate open figures in the kernel.
        plt.close(fig)

In [None]:
# Slice of the corpus to analyse
split = 'train'
max_docs = 50000  # adjust as needed

# Baseline: tokens straight from word_tokenize, no cleaning applied
print("Counting words without preprocessing...")
original_counts = get_word_counts(split, None, max_docs)
plot_word_freq(
    original_counts,
    title="Word Frequency Before Preprocessing",
    filename="freq_before.png",
)

# Same documents, this time passed through preprocess()
print("Counting words with preprocessing...")
processed_counts = get_word_counts(split, preprocess, max_docs)
plot_word_freq(
    processed_counts,
    title="Word Frequency After Preprocessing",
    filename="freq_after.png",
)

print("Plots saved as freq_before.png and freq_after.png")

Counting words without preprocessing...
Counting words with preprocessing...
Plots saved as freq_before.png and freq_after.png
