In [1]:
from collections import defaultdict
import glob

# Step 1 – Mapper Function
def mapper(file_path, stopwords, min_length=5):
    """Emit a ``(word, 1)`` pair for every qualifying word in a text file.

    Each line is split on whitespace; every token is lower-cased and has
    leading/trailing punctuation removed (punctuation inside a token, such
    as the apostrophe in ``cat's``, is kept).  A token is emitted only when
    it is non-empty, is not in *stopwords*, and is strictly longer than
    *min_length* characters.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        stopwords: Container of lower-case words to ignore.
        min_length: A word must have MORE than this many characters to be
            emitted.  Defaults to 5, so only words of 6+ characters pass.

    Returns:
        A list of ``(word, 1)`` tuples in order of appearance; repeated
        words appear once per occurrence.
    """
    # Characters trimmed from both ends of each token.
    punctuation = ",.?!:;\"'()[]{}"

    mapped = []
    with open(file_path, 'r', encoding="utf-8") as f:
        for line in f:
            # split() with no argument already discards surrounding whitespace.
            for token in line.split():
                word = token.lower().strip(punctuation)
                # The emptiness check matters for tokens made only of
                # punctuation, which strip down to "".
                if word and word not in stopwords and len(word) > min_length:
                    mapped.append((word, 1))
    return mapped

# Step 2 – Reducer Function
def reducer(mapped_data):
    """Sum the counts of each word across all mapped pairs.

    Args:
        mapped_data: Iterable of ``(word, count)`` pairs.

    Returns:
        A ``defaultdict(int)`` mapping each word to the total of its counts.
    """
    totals = defaultdict(int)
    for key, amount in mapped_data:
        # A missing key starts at 0 thanks to the int default factory.
        totals[key] = totals[key] + amount
    return totals

if __name__ == "__main__":
    # Words the mapper drops regardless of how often they occur.
    stopwords = {"the", "and", "of", "to", "a", "in", "is", "it", "that", "for", "on", "with", "as", "by", "at"}

    # Map phase: run the mapper over every .txt file in the current directory.
    # glob returns matches in a filesystem-dependent order, so the list is
    # sorted to make the processing order (and therefore the tie-breaking
    # among equally rare words below) the same on every run.
    mapped_data = []
    for file_path in sorted(glob.glob("*.txt")):
        mapped_data.extend(mapper(file_path, stopwords))

    # Reduce phase: collapse the (word, 1) pairs into per-word totals.
    reduced_data = reducer(mapped_data)

    # Ascending sort by count; sorted() is stable, so words with equal
    # counts stay in first-seen order.  Keep the five rarest.
    least_frequent = sorted(reduced_data.items(), key=lambda x: x[1])[:5]
    print("\nTop 5 Least Frequent Words:")
    for word, freq in least_frequent:
        print(f"{word}: {freq}")



Top 5 Least Frequent Words:
barked: 1
chased: 1
hadoop_0: 1000
hadoop_1: 1000
hadoop_2: 1000
