Write a Python program to extract the contents (excluding any tags) from the following two 
websites:
https://en.wikipedia.org/wiki/Artificial_intelligence
https://en.wikipedia.org/wiki/Machine_learning

Save the content in two separate files. Construct a trie from the retrieved content 
using a HashMap / B-Tree / Dictionary. Write a program that demonstrates Predictive Typing 
and Auto-Correct using the trie.

# creating trie node structure

In [None]:
class TrieNode:
    """Single trie node: its outgoing character edges and an end-of-word flag."""

    def __init__(self) -> None:
        # True only when the path from the root to this node spells a stored word.
        self.last = False
        # Maps a single character to the child node reached by following it.
        self.children = {}

# Creating the Trie class, which handles inserting words, suggesting completions and auto-correcting

In [None]:
import nltk
class Trie:
    """Prefix tree of words with prefix completion and edit-distance correction.

    Nodes are ``TrieNode`` objects: ``children`` maps a character to the next
    node and ``last`` marks the end of a stored word.
    """

    def __init__(self):
        self.root = TrieNode()
        # Scratch buffer holding the results of the most recent query; it is
        # reset at the start of every printAuto* call.
        self.word_list = []

    def formTrie(self, keys):
        """Insert every word of the iterable ``keys`` into the trie."""
        for key in keys:
            self.insert(key)

    def insert(self, key):
        """Store ``key``, creating a node for each character not yet present."""
        node = self.root
        for char in key:
            child = node.children.get(char)
            if child is None:
                child = node.children[char] = TrieNode()
            node = child
        node.last = True

    def _walk_prefix(self, key):
        """Follow ``key`` from the root for as long as matching edges exist.

        Returns:
            A ``(node, matched, complete)`` tuple: the deepest node reached,
            the leading part of ``key`` that led to it, and whether every
            character of ``key`` was matched.
        """
        node = self.root
        matched = []
        for char in key:
            child = node.children.get(char)
            if child is None:
                return node, "".join(matched), False
            matched.append(char)
            node = child
        return node, "".join(matched), True

    def suggestionsRec(self, node, word):
        """Append to ``self.word_list`` every stored word at or below ``node``.

        Args:
            node: Node to start from.
            word: The characters on the path from the root to ``node``; it is
                the prefix of every word collected.

        Words are collected in pre-order, visiting children in insertion
        order. An explicit stack is used instead of recursion so that very
        long stored tokens cannot exceed the interpreter's recursion limit.
        """
        pending = [(node, word)]
        while pending:
            current, prefix = pending.pop()
            if current.last:
                self.word_list.append(prefix)
            # Push children reversed so the first-inserted child is popped
            # (and therefore visited) first.
            pending.extend(
                (child, prefix + char)
                for char, child in reversed(list(current.children.items()))
            )

    def printAutoSuggestions(self, key):
        """Print every stored word that starts with ``key``.

        The words found are also left in ``self.word_list``.

        Returns:
            0 if no stored word has ``key`` as a prefix; -1 if ``key`` is
            itself a stored word with no longer continuation (nothing is
            printed); 1 otherwise, after printing the matches.
        """
        self.word_list = []
        node, prefix, complete = self._walk_prefix(key)

        if not complete:
            return 0
        if node.last and not node.children:
            return -1

        self.suggestionsRec(node, prefix)
        for suggestion in self.word_list:
            print(suggestion)
        return 1

    def printAutoCorrect(self, key, max_distance=3):
        """Print stored words that are close to the possibly misspelled ``key``.

        Candidates are the stored words sharing the longest prefix of ``key``
        that exists in the trie (the whole trie when not even the first
        character matches). A candidate is printed when
        ``nltk.edit_distance(key, candidate)`` is at most ``max_distance``.

        Args:
            key: The word to correct.
            max_distance: Largest edit distance still accepted as a
                correction; defaults to 3.
        """
        self.word_list = []
        node, prefix, _ = self._walk_prefix(key)
        self.suggestionsRec(node, prefix)

        for candidate in self.word_list:
            if nltk.edit_distance(key, candidate) <= max_distance:
                print(candidate)

# Reading data from given websites and removing tags and stop words

In [None]:
import re
from bs4 import BeautifulSoup
from urllib import request

In [None]:
url1 = "https://en.wikipedia.org/wiki/Artificial_intelligence"
url2 = "https://en.wikipedia.org/wiki/Machine_learning"

# Download each page inside a `with` block so the HTTP response is closed as
# soon as its body has been read, instead of being left open.
with request.urlopen(url1) as response1:
    html1 = response1.read().decode('utf8')
with request.urlopen(url2) as response2:
    html2 = response2.read().decode('utf8')

# get_text() drops the markup and keeps only the text content of the document.
raw1 = BeautifulSoup(html1, 'html.parser').get_text()
raw2 = BeautifulSoup(html2, 'html.parser').get_text()

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
from nltk.tokenize import word_tokenize


# Fetch the NLTK data packages used by this notebook; the recorded output
# below shows both were already up to date on the original run.
nltk.download('punkt')  # presumably the tokenizer data behind word_tokenize — confirm
nltk.download('wordnet')  # NOTE(review): WordNet is not referenced in the visible code

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Split the plain text of each article into tokens, first page then second.
words1, words2 = word_tokenize(raw1), word_tokenize(raw2)

In [None]:
# Prefer the full pipeline package name: the bare "en" shortcut link is a
# spaCy 2.x feature, so it is kept only as a fallback for installs where the
# named package cannot be found.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.load("en")

# Punctuation tokens are registered as additional stop words so that they are
# filtered out together with the regular English stop words.
additional = ['.', ',', '\'', '"', '?', '{', '}', '[', ']', '(', ')', '<', '>', '!']
nlp.Defaults.stop_words.update(additional)

# Keep only the tokens whose vocabulary entry is not flagged as a stop word.
filtered1 = [token for token in words1 if not nlp.vocab[token].is_stop]
filtered2 = [token for token in words2 if not nlp.vocab[token].is_stop]

# Saving content into 2 different files

In [None]:
# Write one token per line. The encoding is pinned to UTF-8 because, when no
# encoding is given, open() uses the platform default, which may be unable to
# represent non-ASCII characters in the scraped text.
with open("index1.txt", "w", encoding="utf-8") as output:
    output.writelines(str(token) + "\n" for token in filtered1)

with open("index2.txt", "w", encoding="utf-8") as output:
    output.writelines(str(token) + "\n" for token in filtered2)

# Predictive Typing

In [None]:
key = "hel"

# One trie per article, each built from that article's de-duplicated tokens.
t1 = Trie()
t2 = Trie()
t1.formTrie(list(set(filtered1)))
t2.formTrie(list(set(filtered2)))

# Messages for the two non-success status codes of printAutoSuggestions;
# status 1 means the suggestions were already printed, so it has no entry.
_status_messages = {
    -1: "No other strings found with this prefix\n",
    0: "No string found with this prefix\n",
}

print("Aritificial intelligence")
comp1 = t1.printAutoSuggestions(key)
if comp1 in _status_messages:
    print(_status_messages[comp1])

print("\nMachine Learning")
comp2 = t2.printAutoSuggestions(key)
if comp2 in _status_messages:
    print(_status_messages[comp2])

Aritificial intelligence
help
helping
helps
helpful
held

Machine Learning
held
help


# Auto Correct

In [None]:
key = "machineee"

# Run the same misspelled word against each article's trie, printing the
# article title before that trie's corrections.
for _title, _trie in (
    ("Aritificial intelligence", t1),
    ("\nMachine Learning", t2),
):
    print(_title)
    _trie.printAutoCorrect(key)

Aritificial intelligence
machine
machines
machinery

Machine Learning
machine
machines
