**Data Extraction and NLP**

The objective of this assignment is to extract textual data articles from the given URL and perform text analysis to compute variables that are explained below.

In [2]:
!pip install pandas requests beautifulsoup4 nltk textblob syllapy


Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2


In [22]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import re
import syllapy

In [15]:
# Download VADER lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
# Load input data
input_df = pd.read_excel("/content/Input.xlsx")

In [6]:
# Function to extract text from URL
def extract_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        article = soup.find('article')
        text = " ".join([p.get_text() for p in article.find_all('p')])
        return text
    except Exception as e:
        print(f"Error extracting text from {url}: {e}")
        return None

In [7]:
# Function to calculate sentiment scores
def calculate_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    polarity_score = sia.polarity_scores(text)['compound']
    subjectivity_score = TextBlob(text).sentiment.subjectivity
    positive_score = len(re.findall(r'\b(good|great|excellent|positive)\b', text, flags=re.IGNORECASE))
    negative_score = len(re.findall(r'\b(bad|poor|negative)\b', text, flags=re.IGNORECASE))
    return positive_score, negative_score, polarity_score, subjectivity_score

In [8]:
# Function to calculate readability metrics
def calculate_readability(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    total_words = len(words)
    total_sentences = len(sentences)
    avg_sentence_length = total_words / total_sentences
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Count complex words
    complex_words = [word for word in filtered_words if syllapy.count(word) > 2]
    complex_word_count = len(complex_words)
    percentage_complex_words = (complex_word_count / total_words) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_word_length = sum(len(word) for word in filtered_words) / len(filtered_words)

    return avg_sentence_length, percentage_complex_words, fog_index, avg_word_length

In [9]:
# Function to count personal pronouns
def count_personal_pronouns(text):
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, flags=re.IGNORECASE))
    return personal_pronouns


In [10]:
# Function to count syllables per word
def count_syllables_per_word(text):
    words = word_tokenize(text)
    syllable_count = sum(syllapy.count(word) for word in words)
    syllable_per_word = syllable_count / len(words)
    return syllable_per_word

In [11]:
# Function to process each URL and compute variables
def process_url(url_id, url):
    text = extract_text(url)
    if text:
        positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment(text)
        avg_sentence_length, percentage_complex_words, fog_index, avg_word_length = calculate_readability(text)
        personal_pronouns = count_personal_pronouns(text)
        syllable_per_word = count_syllables_per_word(text)

        return {
            "URL_ID": url_id,
            "URL": url,
            "POSITIVE SCORE": positive_score,
            "NEGATIVE SCORE": negative_score,
            "POLARITY SCORE": polarity_score,
            "SUBJECTIVITY SCORE": subjectivity_score,
            "AVG SENTENCE LENGTH": avg_sentence_length,
            "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
            "FOG INDEX": fog_index,
            "AVG NUMBER OF WORDS PER SENTENCE": len(word_tokenize(text)) / len(sent_tokenize(text)),
            "COMPLEX WORD COUNT": len([word for word in word_tokenize(text) if syllapy.count(word) > 2]),
            "WORD COUNT": len(word_tokenize(text)),
            "SYLLABLE PER WORD": syllable_per_word,
            "PERSONAL PRONOUNS": personal_pronouns,
            "AVG WORD LENGTH": avg_word_length
        }
    else:
        return None


In [23]:
# Process each URL and compute variables
output_data = []
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    data = process_url(url_id, url)  # This line is causing the issue
    if data:
        output_data.append(data)

Error extracting text from https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: 'NoneType' object has no attribute 'find_all'
Error extracting text from https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: 'NoneType' object has no attribute 'find_all'


In [24]:
# Create DataFrame for output data
output_df = pd.DataFrame(output_data)

In [25]:
# Save output to Excel file
output_df.to_excel("output.xlsx", index=False)