In [0]:
spark

In [0]:
pip install nltk

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK stopwords if not already downloaded
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Get NLTK English stopwords
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
%fs ls /FileStore/tables/wiki_small.csv

path,name,size,modificationTime
dbfs:/FileStore/tables/wiki_small.csv,wiki_small.csv,985338,1717040693000


In [0]:
sc= spark.sparkContext

In [0]:
#reading the wikipedia data, it contains the wiki page link and the text content of that page

wiki= sc.textFile("/FileStore/tables/wiki_small.csv")

#seeing the entries of the data
wiki.take(3)

Out[3]: ['url,text',
 'https://simple.wikipedia.org/wiki/Carl%20Vinson,Carl Vinson November    June   was an American politician. He served as a Representative from Georgia. He was a Democrat. He was the first person to serve for more than  years in the United States House of Representatives. He was known as The Father of the TwoOcean Navy.Other websites University of Georgia brief biography  University of Georgia Profile of Carl Vinson  Congressional Biography entry in the New Georgia Encyclopedia   United States Navy website  USS CARL VINSON  Mercer University Press Carl Vinson Patriarch of the Armed Forces   Vinson Institute Press Carl Vinson  A Legacy of Public Service   births deathsDeans of the United States House of RepresentativesUnited States representatives from GeorgiaUS Democratic Party politiciansthcentury American politicians',
 'https://simple.wikipedia.org/wiki/Oberhasli%20%28district%29,The district of Oberhasli in the Swiss canton of Bern has  municipalities in an are

In [0]:
wiki.first()

Out[4]: 'url,text'

In [0]:
#there are these many entries
wiki.count()

Out[5]: 1001

In [0]:
#function to clean the text in the data
def clean_text(text):
    text = re.sub(r'\.', ' ', text)
    # Remove punctuation and numbers
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    
    # Remove stopwords
    cleaned_text = ' '.join(word.lower() for word in cleaned_text.split() if word.lower() not in stop_words)
    
    return cleaned_text

In [0]:
#testing the clean text function

clean_text("Carl Vinson November78 $&*##   June   was an American politician. He served as a Representative from Georgia. He was a Democrat. He was the first person to serve for more than  years in the United States House of Representatives.")

Out[7]: 'carl vinson november june american politician served representative georgia democrat first person serve years united states house representatives'

In [0]:
# Applying the clean text function to wiki data
wiki = wiki.map(lambda x: x.split(',')[0] + ',' + clean_text(x.split(',')[1]))
wiki.take(4)

Out[8]: ['url,text',
 'https://simple.wikipedia.org/wiki/Carl%20Vinson,carl vinson november june american politician served representative georgia democrat first person serve years united states house representatives known father twoocean navy websites university georgia brief biography university georgia profile carl vinson congressional biography entry new georgia encyclopedia united states navy website uss carl vinson mercer university press carl vinson patriarch armed forces vinson institute press carl vinson legacy public service births deathsdeans united states house representativesunited states representatives georgiaus democratic party politiciansthcentury american politicians',
 'https://simple.wikipedia.org/wiki/Oberhasli%20%28district%29,district oberhasli swiss canton bern municipalities area km oberhasli',
 'https://simple.wikipedia.org/wiki/Norman%2C%20Arkansas,norman town us state arkansas towns arkansas']

In [0]:
#function to tokenize the data

def tokenize (line):
    url, text = line.split(",")

    return (url, text)

In [0]:
wiki_recs=wiki.map(lambda line: tokenize(line))
#wiki_recs=wiki.map(tokenize)   --- same 

In [0]:
wiki_recs
wiki_recs.take(3)

Out[11]: [('url', 'text'),
 ('https://simple.wikipedia.org/wiki/Carl%20Vinson',
  'carl vinson november june american politician served representative georgia democrat first person serve years united states house representatives known father twoocean navy websites university georgia brief biography university georgia profile carl vinson congressional biography entry new georgia encyclopedia united states navy website uss carl vinson mercer university press carl vinson patriarch armed forces vinson institute press carl vinson legacy public service births deathsdeans united states house representativesunited states representatives georgiaus democratic party politiciansthcentury american politicians'),
 ('https://simple.wikipedia.org/wiki/Oberhasli%20%28district%29',
  'district oberhasli swiss canton bern municipalities area km oberhasli')]

In [0]:
#unigram function
def unigram(url, text, n=1):
    tokens= text.split()
    return [(tok.lower(), url) for tok in tokens ]


In [0]:
def trigrams(url, text):
    tokens = text.split()
    trigrams = [(tokens[i].lower(), tokens[i+1].lower(), tokens[i+2].lower(),url) for i in range(len(tokens)-2)]
    return trigrams


In [0]:
unigram("url1", "This is a test case for this data")

Out[14]: [('this', 'url1'),
 ('is', 'url1'),
 ('a', 'url1'),
 ('test', 'url1'),
 ('case', 'url1'),
 ('for', 'url1'),
 ('this', 'url1'),
 ('data', 'url1')]

In [0]:
trigrams("url1", "This is a test case for this data")

Out[15]: [('this', 'is', 'a', 'url1'),
 ('is', 'a', 'test', 'url1'),
 ('a', 'test', 'case', 'url1'),
 ('test', 'case', 'for', 'url1'),
 ('case', 'for', 'this', 'url1'),
 ('for', 'this', 'data', 'url1')]

In [0]:
wiki_index= wiki_recs.flatMap(lambda rec: trigrams(rec[0], rec[1]))

In [0]:
#wiki_index= wiki_recs.flatMap(lambda rec: bigram(rec[0], rec[1]))
#wiki_index.take(10)

In [0]:
wiki_index.take(10)

Out[18]: [('carl',
  'vinson',
  'november',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('vinson',
  'november',
  'june',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('november',
  'june',
  'american',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('june',
  'american',
  'politician',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('american',
  'politician',
  'served',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('politician',
  'served',
  'representative',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('served',
  'representative',
  'georgia',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('representative',
  'georgia',
  'democrat',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('georgia',
  'democrat',
  'first',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('democrat',
  'first',
  'person',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson')]

In [0]:
wiki_index.count()

Out[19]: 88968

In [0]:
header= wiki_index.first()

In [0]:
wiki_index= wiki_index.filter(lambda rec: rec!=header)
wiki_index.take(2)

Out[21]: [('vinson',
  'november',
  'june',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson'),
 ('november',
  'june',
  'american',
  'https://simple.wikipedia.org/wiki/Carl%20Vinson')]

In [0]:
wiki_index = wiki_index.map(lambda x: (x[0]+' '+x[1]+' '+x[2]   ,x[3]))

In [0]:
wiki_inverted_index= wiki_index.groupByKey()
wiki_inverted_index.take(2)

Out[23]: [('american politician served',
  <pyspark.resultiterable.ResultIterable at 0x7f22629ab9a0>),
 ('representative georgia democrat',
  <pyspark.resultiterable.ResultIterable at 0x7f22629abfd0>)]

In [0]:
#group the URLs by the key (word)
wiki_inverted_index= wiki_index.groupByKey().mapValues(list)

In [0]:
wiki_inverted_index.take(1)

Out[25]: [('american politician served',
  ['https://simple.wikipedia.org/wiki/Carl%20Vinson',
   'https://simple.wikipedia.org/wiki/Greg%20Evers'])]

In [0]:
wiki_inverted_index= wiki_index.groupByKey().mapValues(set)
wiki_inverted_index.take(1)

Out[26]: [('american politician served',
  {'https://simple.wikipedia.org/wiki/Carl%20Vinson',
   'https://simple.wikipedia.org/wiki/Greg%20Evers'})]

In [0]:
wiki_inverted_index= (  wiki_inverted_index
                      .map(lambda rec: (rec[0], list(rec[1])   )) )

In [0]:
wiki_inverted_index.take(20)

Out[28]: [('american politician served',
  ['https://simple.wikipedia.org/wiki/Carl%20Vinson',
   'https://simple.wikipedia.org/wiki/Greg%20Evers']),
 ('representative georgia democrat',
  ['https://simple.wikipedia.org/wiki/Carl%20Vinson']),
 ('georgia democrat first',
  ['https://simple.wikipedia.org/wiki/Carl%20Vinson']),
 ('democrat first person',
  ['https://simple.wikipedia.org/wiki/Carl%20Vinson']),
 ('first person serve', ['https://simple.wikipedia.org/wiki/Carl%20Vinson']),
 ('years united states', ['https://simple.wikipedia.org/wiki/Carl%20Vinson']),
 ('united states house',
  ['https://simple.wikipedia.org/wiki/Cori%20Bush',
   'https://simple.wikipedia.org/wiki/Carl%20Vinson',
   'https://simple.wikipedia.org/wiki/Los%20Angeles%20County%2C%20California']),
 ('house representatives known',
  ['https://simple.wikipedia.org/wiki/Carl%20Vinson']),
 ('navy websites university',
  ['https://simple.wikipedia.org/wiki/Carl%20Vinson']),
 ('websites university georgia',
  ['https://s

In [0]:
filepath=("/FileStore/tables/wiki_small.csv")

In [0]:
wiki_info= wiki_inverted_index.collect()
display(wiki_info)

_1,_2
american politician served,"List(https://simple.wikipedia.org/wiki/Carl%20Vinson, https://simple.wikipedia.org/wiki/Greg%20Evers)"
representative georgia democrat,List(https://simple.wikipedia.org/wiki/Carl%20Vinson)
georgia democrat first,List(https://simple.wikipedia.org/wiki/Carl%20Vinson)
democrat first person,List(https://simple.wikipedia.org/wiki/Carl%20Vinson)
first person serve,List(https://simple.wikipedia.org/wiki/Carl%20Vinson)
years united states,List(https://simple.wikipedia.org/wiki/Carl%20Vinson)
united states house,"List(https://simple.wikipedia.org/wiki/Cori%20Bush, https://simple.wikipedia.org/wiki/Carl%20Vinson, https://simple.wikipedia.org/wiki/Los%20Angeles%20County%2C%20California)"
house representatives known,List(https://simple.wikipedia.org/wiki/Carl%20Vinson)
navy websites university,List(https://simple.wikipedia.org/wiki/Carl%20Vinson)
websites university georgia,List(https://simple.wikipedia.org/wiki/Carl%20Vinson)
