## Practical 7: Web page text analysis with NLTK (word frequency and named entity recognition)

In [2]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk import ne_chunk, pos_tag

# Download the NLTK resources needed for tokenizing, stopword removal,
# POS tagging and named-entity chunking.
# Newer NLTK releases (3.9+) look the tokenizer, tagger and chunker models up
# under different resource names ('punkt_tab', 'averaged_perceptron_tagger_eng',
# 'maxent_ne_chunker_tab'), so both the legacy and the current names are
# requested. nltk.download() only reports an unknown name; it does not raise.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(_resource)

# Step 1: Fetch the Web Page Content
def fetch_webpage_content(url, timeout=10):
    """Download a web page and return the text of its paragraph tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The text of every ``<p>`` element joined with single spaces, or
        ``None`` when the request fails or the status code is not 200.
        A diagnostic message is printed in both failure cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Only network/HTTP-level failures are treated as "could not fetch";
        # programming errors are allowed to surface instead of being hidden.
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: Unable to fetch the webpage. Status code {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract text from all paragraph tags
    return ' '.join(p.get_text() for p in soup.find_all('p'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


In [3]:
def preprocess_text(text):
    """Reduce raw text to a list of lowercase content words.

    The text is split with ``word_tokenize``. Each token is lowercased and
    kept only when it is purely alphabetic and is not an English stopword.
    Token order is preserved.
    """
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # isalpha() drops punctuation, numbers and mixed tokens in one check.
        if lowered.isalpha() and lowered not in english_stopwords:
            kept.append(lowered)
    return kept


In [4]:
def identify_topics(words, num_topics=10):
    """Return the ``num_topics`` most frequent words as (word, count) pairs."""
    return FreqDist(words).most_common(num_topics)

In [5]:
def named_entity_recognition(text):
    """Tokenize and POS-tag ``text``, then return the ``ne_chunk`` result.

    The chunker is called with ``binary=False``.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return ne_chunk(tagged_tokens, binary=False)

In [6]:
if __name__ == '__main__':
    # URL of the web page to analyze
    url = 'https://en.wikipedia.org/wiki/Natural_language_processing'

    # Step 1: Fetch Web Page Content
    page_content = fetch_webpage_content(url)
    if page_content is None:
        # None signals a failed request; the fetch helper has already printed why.
        print("Failed to fetch the web page content.")
    elif not page_content.strip():
        # The request succeeded but no paragraph text was found, so there is
        # nothing to analyze. Reporting this as a fetch failure would be misleading.
        print("The web page was fetched but contains no paragraph text to analyze.")
    else:
        print("Web Page Content Fetched Successfully!")

        # Step 2: Preprocess the Text
        processed_text = preprocess_text(page_content)
        print(f"Processed Text Sample: {processed_text[:20]}")  # Printing a sample of the processed text

        # Step 3: Identify Topics (Word Frequency)
        topics = identify_topics(processed_text)
        print("\nMost Common Topics Based on Word Frequency:")
        for word, freq in topics:
            print(f"{word}: {freq}")

        # Step 4: Named Entity Recognition (NER)
        print("\nNamed Entities in the Web Page:")
        ner_result = named_entity_recognition(page_content)
        # Prints the whole chunk tree (every token), not only the entities.
        ner_result.pprint()


Web Page Content Fetched Successfully!
Processed Text Sample: ['natural', 'language', 'processing', 'nlp', 'subfield', 'computer', 'science', 'especially', 'artificial', 'intelligence', 'primarily', 'concerned', 'providing', 'computers', 'ability', 'process', 'data', 'encoded', 'natural', 'language']

Most Common Topics Based on Word Frequency:
language: 20
nlp: 15
natural: 14
cognitive: 13
processing: 11
linguistics: 10
tasks: 10
approach: 9
statistical: 8
intelligence: 6

Named Entities in the Web Page:
(S
  Natural/JJ
  language/NN
  processing/NN
  (/(
  (ORGANIZATION NLP/NNP)
  )/)
  is/VBZ
  a/DT
  subfield/NN
  of/IN
  computer/NN
  science/NN
  and/CC
  especially/RB
  artificial/JJ
  intelligence/NN
  ./.
  It/PRP
  is/VBZ
  primarily/RB
  concerned/VBN
  with/IN
  providing/VBG
  computers/NNS
  with/IN
  the/DT
  ability/NN
  to/TO
  process/VB
  data/NNS
  encoded/VBN
  in/IN
  natural/JJ
  language/NN
  and/CC
  is/VBZ
  thus/RB
  closely/RB
  related/JJ
  to/TO
  informat

In [7]:
def identify_topics(words, num_topics=10):
    """Return the most frequent words as a list of (word, count) pairs.

    Args:
        words: Iterable of already-preprocessed word tokens.
        num_topics: How many of the most common words to return. Defaults
            to 10, so calling ``identify_topics(words)`` behaves as before.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least frequent.
    """
    fdist = FreqDist(words)
    return fdist.most_common(num_topics)


In [9]:
def named_entity_recognition(text):
    """Run tokenization, POS tagging and ``ne_chunk`` on ``text``.

    Returns whatever ``ne_chunk`` produces for the tagged tokens.
    """
    # Each stage feeds the next: tokens -> (token, tag) pairs -> chunk tree.
    return ne_chunk(pos_tag(word_tokenize(text)))


In [10]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk import ne_chunk, pos_tag

# Download necessary NLTK data
# Newer NLTK releases (3.9+) look the tokenizer, tagger and chunker models up
# under different resource names ('punkt_tab', 'averaged_perceptron_tagger_eng',
# 'maxent_ne_chunker_tab'), so both the legacy and the current names are
# requested. nltk.download() only reports an unknown name; it does not raise.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(_resource)

# Step 1: Fetch the Web Page Content
def fetch_webpage_content(url, timeout=10):
    """Fetch ``url`` and return the combined text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The paragraph texts joined with single spaces, or ``None`` when the
        request fails or the status code is not 200. A diagnostic message is
        printed in both failure cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Only network/HTTP-level failures are treated as "could not fetch";
        # programming errors are allowed to surface instead of being hidden.
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: Unable to fetch the webpage. Status code {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract text from all paragraph tags
    return ' '.join(p.get_text() for p in soup.find_all('p'))

# Step 2: Preprocess the Text
def preprocess_text(text):
    """Tokenize ``text`` and return its lowercase, alphabetic, non-stopword tokens.

    Tokens keep their original order; duplicates are not removed.
    """
    ignored = frozenset(stopwords.words('english'))

    # Lowercase lazily, then filter in one pass.
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return [
        token
        for token in lowered_tokens
        if token.isalpha() and token not in ignored
    ]

# Step 3: Identify Topics (Word Frequency Analysis)
def identify_topics(words, num_topics=10):
    """Return the most frequent words as a list of (word, count) pairs.

    Args:
        words: Iterable of already-preprocessed word tokens.
        num_topics: How many of the most common words to return. Defaults
            to 10, so calling ``identify_topics(words)`` behaves as before.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least frequent.
    """
    fdist = FreqDist(words)
    return fdist.most_common(num_topics)

# Step 4: Named Entity Recognition (NER) using NLTK
def named_entity_recognition(text):
    """Return the ``ne_chunk`` result for the POS-tagged tokens of ``text``."""
    word_tokens = word_tokenize(text)
    tagged_tokens = pos_tag(word_tokens)  # (token, part-of-speech tag) pairs
    return ne_chunk(tagged_tokens)

if __name__ == '__main__':
    # URL of the web page to analyze
    url = 'https://en.wikipedia.org/wiki/Natural_language_processing'

    # Step 1: Fetch Web Page Content
    page_content = fetch_webpage_content(url)

    if page_content is None:
        # None signals a failed request; the fetch helper has already printed why.
        print("Failed to fetch the web page content.")
    elif not page_content.strip():
        # The request succeeded but no paragraph text was found, so there is
        # nothing to analyze. Reporting this as a fetch failure would be misleading.
        print("The web page was fetched but contains no paragraph text to analyze.")
    else:
        print("Web Page Content Fetched Successfully!")

        # Step 2: Preprocess the Text
        processed_text = preprocess_text(page_content)
        print(f"Processed Text Sample: {processed_text[:20]}")  # Printing a sample of the processed text

        # Step 3: Identify Topics (Word Frequency)
        topics = identify_topics(processed_text)
        print("\nMost Common Topics Based on Word Frequency:")
        for word, freq in topics:
            print(f"{word}: {freq}")

        # Step 4: Named Entity Recognition (NER)
        print("\nNamed Entities in the Web Page:")
        ner_result = named_entity_recognition(page_content)
        # Prints the whole chunk tree (every token), not only the entities.
        ner_result.pprint()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Web Page Content Fetched Successfully!
Processed Text Sample: ['natural', 'language', 'processing', 'nlp', 'subfield', 'computer', 'science', 'especially', 'artificial', 'intelligence', 'primarily', 'concerned', 'providing', 'computers', 'ability', 'process', 'data', 'encoded', 'natural', 'language']

Most Common Topics Based on Word Frequency:
language: 20
nlp: 15
natural: 14
cognitive: 13
processing: 11
linguistics: 10
tasks: 10
approach: 9
statistical: 8
intelligence: 6

Named Entities in the Web Page:
(S
  Natural/JJ
  language/NN
  processing/NN
  (/(
  (ORGANIZATION NLP/NNP)
  )/)
  is/VBZ
  a/DT
  subfield/NN
  of/IN
  computer/NN
  science/NN
  and/CC
  especially/RB
  artificial/JJ
  intelligence/NN
  ./.
  It/PRP
  is/VBZ
  primarily/RB
  concerned/VBN
  with/IN
  providing/VBG
  computers/NNS
  with/IN
  the/DT
  ability/NN
  to/TO
  process/VB
  data/NNS
  encoded/VBN
  in/IN
  natural/JJ
  language/NN
  and/CC
  is/VBZ
  thus/RB
  closely/RB
  related/JJ
  to/TO
  informat