In [39]:
!pip3 install -r requirements.txt

Collecting elasticsearch==8.11.0 (from -r requirements.txt (line 4))
  Using cached elasticsearch-8.11.0-py3-none-any.whl.metadata (5.1 kB)
Using cached elasticsearch-8.11.0-py3-none-any.whl (412 kB)
Installing collected packages: elasticsearch
Successfully installed elasticsearch-8.11.0


In [1]:
import os
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from collections import Counter
import matplotlib.pyplot as plt
import random
from datasets import load_dataset
from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Dataset load (Hugging Face)

In [2]:
# Define the local directory where you want to save the data
local_path = os.path.join(os.getcwd(), "local_wikipedia_data")

# 1. Download and save the data to the specified local_path
ds = load_dataset(
    "wikimedia/wikipedia",
    "20231101.en",
    cache_dir=local_path  
)

print(f"Dataset successfully downloaded/loaded to: {local_path}")


Dataset successfully downloaded/loaded to: /home/san22chit/Documents/IIITH/Sem3/IRE/Assignments/IndexingAndRetrieval/local_wikipedia_data


# Part 1

## Preprocessing (Hugging Face Dataset)

In [3]:
# Download NLTK resources if not already present
nltk.download('punkt_tab')
nltk.download('stopwords')


# Get English stopwords and initialize stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
punct_table = str.maketrans('', '', string.punctuation)

def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(punct_table)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and stem
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words and word.isalpha()]
    return tokens


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/san22chit/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/san22chit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def get_word_counts(split, preprocess_func=None, max_docs=10000):
    word_count = Counter()
    for idx, item in enumerate(ds[split]):
        text = item['text']
        if preprocess_func:
            tokens = preprocess_func(text)
        else:
            tokens = word_tokenize(text)
        word_count.update(tokens)
        if idx+1 >= max_docs:
            break
    return word_count

def plot_word_freq(counter, title, filename, top_n=30):
    most_common = counter.most_common(top_n)
    words, counts = zip(*most_common)
    plt.figure(figsize=(14,7))
    plt.bar(words, counts)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

In [5]:
split = 'train'
max_docs = 10000  # adjust as needed

# Original (no preprocessing)
print("Counting words without preprocessing...")
original_counts = get_word_counts(split, preprocess_func=None, max_docs=max_docs)
plot_word_freq(original_counts, "Word Frequency Before Preprocessing", "freq_before.png")

# Preprocessed
print("Counting words with preprocessing...")
processed_counts = get_word_counts(split, preprocess_func=preprocess, max_docs=max_docs)
plot_word_freq(processed_counts, "Word Frequency After Preprocessing", "freq_after.png")

print("Plots saved as freq_before.png and freq_after.png")

Counting words without preprocessing...
Counting words with preprocessing...
Plots saved as freq_before.png and freq_after.png


In [6]:
# Complete the word frequency analysis (add this after your current code)
import requests
import json
from pathlib import Path

# Wait for the frequency counting to complete, then run this:
print("Word frequency analysis completed!")
print(f"Original vocabulary size: {len(original_counts)}")
print(f"Processed vocabulary size: {len(processed_counts)}")

# Show top words before and after preprocessing
print("\nTop 10 words before preprocessing:")
for word, count in original_counts.most_common(10):
    print(f"  {word}: {count}")

print("\nTop 10 words after preprocessing:")
for word, count in processed_counts.most_common(10):
    print(f"  {word}: {count}")

# Create directory for results
Path("results").mkdir(exist_ok=True)

# Save the word frequency data
with open("results/original_word_freq.json", "w") as f:
    json.dump(dict(original_counts.most_common(1000)), f)

with open("results/processed_word_freq.json", "w") as f:
    json.dump(dict(processed_counts.most_common(1000)), f)

print("Word frequency data saved to results/ directory")

Word frequency analysis completed!
Original vocabulary size: 449902
Processed vocabulary size: 275975

Top 10 words before preprocessing:
  ,: 915597
  the: 870181
  .: 614617
  of: 523116
  and: 423815
  in: 337785
  to: 287387
  a: 258722
  ): 184559
  (: 184501

Top 10 words after preprocessing:
  also: 38234
  use: 36985
  one: 29762
  first: 29353
  state: 27724
  new: 24421
  includ: 24019
  year: 23205
  american: 22633
  time: 21475
Word frequency data saved to results/ directory


## Elastic Search Indexing

In [10]:
!docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.11.0

docker: Error response from daemon: Conflict. The container name "/elasticsearch" is already in use by container "b0329fdcf3494fc3c0b96d0edb31a51534e9a5941e452ec768f446a16842b767". You have to remove (or rename) that container to be able to reuse that name.

Run 'docker run --help' for more information
