<a href="https://colab.research.google.com/github/richapatel93/Data-Mining-Class/blob/main/Mapreducer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install openjdk-8-jdk -y
!apt-get install spark -y
!pip install -q findspark


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-8-jdk is already the newest version (8u422-b05-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate package spark


In [None]:
import re
from pyspark import SparkContext

# Check if a SparkContext already exists and stop it if so
try:
    sc.stop()
except:
    pass

# Create a new SparkContext
sc = SparkContext("local", "Longest Words Example")

# Upload the text file
from google.colab import files
uploaded = files.upload()

# Load the text file into an RDD
file_name = list(uploaded.keys())[0]
rdd = sc.textFile(file_name)

# Define a function to clean and filter out unwanted characters
def clean_word(word):
    # Remove URLs and non-alphabetic characters
    word = re.sub(r'http\S+', '', word)  # Remove URLs
    word = re.sub(r'[^a-zA-Z]', '', word)  # Keep only alphabetic characters
    return word

# Split the text into words and clean them
words = rdd.flatMap(lambda line: re.findall(r'\b\w+\b', line)) \
           .map(lambda word: clean_word(word.lower())) \
           .filter(lambda word: len(word) > 1)  # Filter out very short words

# Remove duplicates
words = words.distinct()

# Find the length of each word and map them as (word, length)
word_lengths = words.map(lambda word: (word, len(word)))

# Collect all word-length pairs and sort by length (descending) to find the longest words
longest_words = word_lengths.takeOrdered(10, key=lambda x: -x[1])

# Display the top 10 longest words
for word, length in longest_words:
    print(f"Word: {word}, Length: {length}")


Saving the black.txt to the black (3).txt
Word: disnaturalisation, Length: 17
Word: circumnavigators, Length: 16
Word: circumnavigation, Length: 16
Word: chialinchechilin, Length: 16
Word: unenforceability, Length: 16
Word: procrastination, Length: 15
Word: notwithstanding, Length: 15
Word: contemporaneous, Length: 15
Word: perpendicularly, Length: 15
Word: circumnavigated, Length: 15


# Task 2: Finding and Listing the Most Frequent Bigrams

In [33]:
# Create bigrams by combining consecutive words
bigrams = words.zipWithIndex() \
               .map(lambda x: (x[1], x[0])) \
               .join(words.zipWithIndex().map(lambda x: (x[1] - 1, x[0]))) \
               .map(lambda x: (x[1][0], x[1][1])) \
               .filter(lambda bigram: len(bigram[0]) > 1 and len(bigram[1]) > 1)

# Count the frequency of each bigram
bigram_counts = bigrams.map(lambda bigram: (bigram, 1)) \
                       .reduceByKey(lambda a, b: a + b)

# Get the top 10 most frequent bigrams
top_bigrams = bigram_counts.takeOrdered(10, key=lambda x: -x[1])

# Display the top 10 bigrams with their counts
for bigram, count in top_bigrams:
    print(f"Bigram: {' '.join(bigram)}, Count: {count}")


Bigram: of the, Count: 1142
Bigram: to the, Count: 680
Bigram: in the, Count: 530
Bigram: and the, Count: 385
Bigram: on the, Count: 252
Bigram: from the, Count: 237
Bigram: to be, Count: 209
Bigram: that the, Count: 201
Bigram: and they, Count: 201
Bigram: with the, Count: 195


## Task 3: Customized Statistic

In [None]:
import re

# Function to clean words: convert to lowercase and remove non-alphanumeric characters
def clean_word(word):
    return re.sub(r'[^a-zA-Z0-9]', '', word.lower())

# Split the text into words and clean each word
words = rdd.flatMap(lambda line: line.split())

# Clean the words to remove punctuation and make them lowercase
cleaned_words = words.map(clean_word)

# Filter words that start with 'a'
words_starting_with_a = cleaned_words.filter(lambda word: word.startswith('a'))

# Count the occurrences of each cleaned word starting with 'a'
word_counts = words_starting_with_a.map(lambda word: (word, 1)) \
                                   .reduceByKey(lambda a, b: a + b)

# Filter words that appear more than once
frequent_words_with_a = word_counts.filter(lambda x: x[1] > 1)

# Get the top 10 most frequent words starting with 'a' that appear more than once
top_frequent_words_with_a = frequent_words_with_a.takeOrdered(10, key=lambda x: -x[1])

# Display the top frequent words starting with 'a' with their counts
for word, count in top_frequent_words_with_a:
    print(f"Word: {word}, Count: {count}")


Word: and, Count: 4821
Word: a, Count: 1928
Word: as, Count: 718
Word: at, Count: 562
Word: are, Count: 541
Word: all, Count: 364
Word: an, Count: 239
Word: after, Count: 174
Word: also, Count: 172
Word: another, Count: 159
