In [61]:
import bs4 as bs
import urllib.request
import re
import nltk
import heapq
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [62]:
# Read the data from the web
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scraped_data.read()
parsed_article = bs.BeautifulSoup(article,'lxml')

#Convert the data into string
paragraphs = parsed_article.find_all('p')
article_text = ""
for p in paragraphs:
    article_text += p.text

### Cleaning of the data

In [64]:
def clean_text(raw_text):
    """
    Input: Raw Paragraph/Sentences/Text
    Output: Paragraph which doesn't contain references, special characters, digits and extra spaces
    
    """

    # Removing Square Brackets and Extra Spaces
    raw_text = re.sub(r'\[[0-9]*\]', ' ', raw_text)
    raw_text = re.sub(r'\s+', ' ', raw_text)

    # Removing special characters and digits
    formatted_article_text = re.sub('[^a-zA-Z]', ' ', raw_text )
    formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)
    
    return formatted_article_text

In [65]:
# Checking our function
cleaned_text = clean_text(article_text)
cleaned_text

' Artificial intelligence AI is intelligence demonstrated by machines unlike the natural intelligence displayed by humans and animals Leading AI textbooks define the field as the study of intelligent agents any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals Colloquially the term artificial intelligence is often used to describe machines or computers that mimic cognitive functions that humans associate with the human mind such as learning and problem solving As machines become increasingly capable tasks considered to require intelligence are often removed from the definition of AI a phenomenon known as the AI effect A quip in Tesler s Theorem says AI is whatever hasn t been done yet For instance optical character recognition is frequently excluded from things considered to be AI having become a routine technology Modern machine capabilities generally classified as AI include successfully understanding human speech com

### Sentence Tokenization

In [66]:
sentence_list = nltk.sent_tokenize(article_text)

In [67]:
sentence_list[:5]

['\n\nArtificial intelligence (AI), is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals.',
 'Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.',
 '[3] Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving".',
 '[4]\nAs machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect.',
 '[5] A quip in Tesler\'s Theorem says "AI is whatever hasn\'t been done yet.']

### Word Frequency Count

In [68]:
def word_freq_count(text):
    stopwords = nltk.corpus.stopwords.words('english')

    word_frequencies = {}
    for word in nltk.word_tokenize(text):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    
    return word_frequencies

In [69]:
# Displaying of all the word Frequencies
word_freq_count(formatted_article_text)

{'Artificial': 7,
 'intelligence': 73,
 'AI': 143,
 'demonstrated': 1,
 'machines': 23,
 'unlike': 5,
 'natural': 10,
 'displayed': 2,
 'humans': 25,
 'animals': 2,
 'Leading': 2,
 'textbooks': 1,
 'define': 1,
 'field': 17,
 'study': 6,
 'intelligent': 17,
 'agents': 8,
 'device': 2,
 'perceives': 2,
 'environment': 6,
 'takes': 3,
 'actions': 9,
 'maximize': 4,
 'chance': 4,
 'successfully': 6,
 'achieving': 3,
 'goals': 14,
 'Colloquially': 1,
 'term': 7,
 'artificial': 51,
 'often': 14,
 'used': 15,
 'describe': 3,
 'computers': 10,
 'mimic': 3,
 'cognitive': 5,
 'functions': 2,
 'associate': 1,
 'human': 57,
 'mind': 18,
 'learning': 28,
 'problem': 20,
 'solving': 8,
 'As': 2,
 'become': 7,
 'increasingly': 2,
 'capable': 8,
 'tasks': 5,
 'considered': 9,
 'require': 4,
 'removed': 1,
 'definition': 3,
 'phenomenon': 3,
 'known': 8,
 'effect': 3,
 'A': 29,
 'quip': 1,
 'Tesler': 1,
 'Theorem': 1,
 'says': 1,
 'whatever': 1,
 'done': 1,
 'yet': 1,
 'For': 10,
 'instance': 2,
 'opt

### Weighted Frequency Calculation

In [70]:
def weighted_freq(word_frequencies):
    maximum_frequncy = sum(word_frequencies.values())

    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    
    return word_frequencies

In [71]:
weighted_freq(word_frequencies)

{'Artificial': 0.0011982197877440075,
 'intelligence': 0.012495720643616079,
 'AI': 0.024477918521056158,
 'demonstrated': 0.0001711742553920011,
 'machines': 0.003937007874016025,
 'unlike': 0.0008558712769600055,
 'natural': 0.001711742553920011,
 'displayed': 0.0003423485107840022,
 'humans': 0.004279356384800027,
 'animals': 0.0003423485107840022,
 'Leading': 0.0003423485107840022,
 'textbooks': 0.0001711742553920011,
 'define': 0.0001711742553920011,
 'field': 0.0029099623416640186,
 'study': 0.0010270455323520065,
 'intelligent': 0.0029099623416640186,
 'agents': 0.0013693940431360088,
 'device': 0.0003423485107840022,
 'perceives': 0.0003423485107840022,
 'environment': 0.0010270455323520065,
 'takes': 0.0005135227661760033,
 'actions': 0.0015405682985280098,
 'maximize': 0.0006846970215680044,
 'chance': 0.0006846970215680044,
 'successfully': 0.0010270455323520065,
 'achieving': 0.0005135227661760033,
 'goals': 0.002396439575488015,
 'Colloquially': 0.0001711742553920011,
 'te

In [72]:
def sentence_scores_calc(word_frequencies,tokenized_sentence):
    sentence_scores = {}
    for sent in tokenized_sentence:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    
    return sentence_scores

In [73]:
sentence_scores = sentence_scores_calc(word_frequencies,sentence_list)
summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)
print(summary)



Artificial intelligence (AI), is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals. Musk also funds companies developing artificial intelligence such as DeepMind and Vicarious to "just keep an eye on what's going on with artificial intelligence. A superintelligence, hyperintelligence, or superhuman intelligence is a hypothetical agent that would possess intelligence far surpassing that of the brightest and most gifted human mind. [236] A February 2020 European Union white paper on artificial intelligence advocated for artificial intelligence for economic benefits, including "improving healthcare (e.g. [160][161]
Many of the problems in this article may also require general intelligence, if machines are to solve the problems as well as people do. [93]
The overall research goal of artificial intelligence is to create technology that allows computers and machines to function in an intelligent manner. [210] Research in this area includ