In [8]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import nltk
from nltk.corpus import cmudict
from textblob import TextBlob

# Download NLTK resources (needed for tokenization and cmudict)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [9]:
def extract_article_text(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a single unresponsive host would
        block the whole run indefinitely.

    Returns
    -------
    tuple of (str, str)
        ``(title, article_text)``. ``title`` is the stripped text of the
        page's ``<title>`` element, or ``""`` when the page has none.
        ``article_text`` is the text of every ``<p>`` element joined with
        spaces, with runs of whitespace collapsed to a single space.
        ``("", "")`` is returned when the page cannot be fetched or parsed;
        the error is printed rather than raised so one bad URL does not
        abort the batch.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx so an error page is never analysed as if it were
        # the article itself.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Page title, if the document declares one
        title = soup.title.text.strip() if soup.title else ""

        # Text of all paragraph elements, in document order
        article_text = " ".join(paragraph.text for paragraph in soup.find_all('p'))

        # Remove extra spaces and newlines
        article_text = re.sub(r'\s+', ' ', article_text).strip()

        return title, article_text
    except Exception as e:
        # Deliberately best-effort: report the failure and hand back empty
        # strings so the caller can carry on with the remaining URLs.
        print(f"Error extracting text from URL: {url}")
        print(e)
        return "", ""


In [10]:
# Read the spreadsheet listing the articles to process (URL_ID and URL columns
# are used further down)
input_data = pd.read_excel(io='Input.xlsx')


In [11]:
# Column layout of the results table, in the order they are written out:
# identification and raw text first, then sentiment, readability and word-level metrics.
output_columns = [
    'URL_ID',
    'URL',
    'TITLE',
    'ARTICLE_TEXT',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]

# Start with an empty table that already carries the full schema
output_data = pd.DataFrame(columns=output_columns)


Process URLs and Perform Text Analysis (Segmented)

In [13]:
# Process each URL
#
# NOTE: every analysis step has to run inside the per-URL loop. When the loop
# body is split across notebook cells, the later steps only ever see the
# variables left behind by the final iteration, so just one row (the last URL)
# reaches the output. The steps are therefore gathered into helpers that the
# loop calls once per article.

# Build the pronunciation lookup once: cmudict.dict() reconstructs the whole
# dictionary on every call, which is far too slow to repeat for each word.
pronouncing_dict = cmudict.dict()


def count_syllables(word):
    """Return the number of syllables in ``word``.

    The CMU Pronouncing Dictionary maps a lowercase word to a list of
    pronunciations, each a list of phonemes in which vowel phonemes end in a
    stress digit (0, 1 or 2). The syllable count is the number of such vowel
    phonemes in the first listed pronunciation.

    Words missing from the dictionary fall back to counting runs of vowel
    letters, which is only a rough estimate.
    """
    lowered = word.lower()
    pronunciations = pronouncing_dict.get(lowered)
    if pronunciations:
        return sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())
    return len(re.findall(r'[aeiouy]+', lowered))


def analyze_text(article_text):
    """Compute the sentiment and readability metrics for one article.

    Parameters
    ----------
    article_text : str
        Plain text of the article; may be empty (e.g. after a failed download),
        in which case every ratio is reported as 0.

    Returns
    -------
    dict
        One value per metric column of the output table, i.e. every output
        column except URL_ID, URL, TITLE and ARTICLE_TEXT.
    """
    # Text analysis
    blob = TextBlob(article_text)
    words = blob.words
    word_count = len(words)
    sentence_count = len(blob.sentences)

    # --- Calculate Syllable Count ---
    syllable_count = sum(count_syllables(word) for word in words)

    # --- Calculate Complex Word Count and Percentage ---
    # Count complex words (words longer than 6 characters)
    complex_word_count = len([word for word in words if len(word) > 6])

    # Calculate percentage of complex words
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count > 0 else 0

    # --- Calculate FOG Index ---
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # --- Count Personal Pronouns ---
    # Count personal pronouns (PRP tags)
    personal_pronouns = sum(1 for word, pos in blob.tags if pos == 'PRP')

    # --- Calculate Average Word Length and Syllables per Word ---
    avg_word_length = sum(len(word) for word in words) / word_count if word_count > 0 else 0
    syllable_per_word = syllable_count / word_count if word_count > 0 else 0

    # --- Calculate Polarity and Subjectivity Scores ---
    sentiment = blob.sentiment
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    return {
        # NOTE(review): positive/negative scores are simply +/- the polarity,
        # so one of them is negative whenever the polarity is non-zero.
        # Confirm this is the intended definition of the two columns.
        'POSITIVE SCORE': polarity_score,
        'NEGATIVE SCORE': -polarity_score,  # We can use the negative of polarity as negative score
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }


# --- Save Data to Output DataFrame ---
# Collect one record per URL and build the table once at the end; growing a
# DataFrame with pd.concat inside the loop is quadratic and appends duplicate
# rows if the cell is re-run.
records = []
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Extract article text from URL
    title, article_text = extract_article_text(url)

    new_row = {
        'URL_ID': url_id,
        'URL': url,
        'TITLE': title,
        'ARTICLE_TEXT': article_text,
    }
    new_row.update(analyze_text(article_text))
    records.append(new_row)

# Reuse the column order of the (empty) output table defined earlier
output_data = pd.DataFrame(records, columns=output_data.columns)


Save Output to Excel File

In [22]:
# Write the results table to disk as a workbook, leaving out the row index
output_data.to_excel(excel_writer='Output.xlsx', index=False)


**Approach for Data Extraction and Text Analysis**

**Data Extraction:**

**Input Data**:
* Read the input data from "Input.xlsx" containing article URLs.
* Each row represents an article with columns like URL_ID, URL, etc.

**Data Extraction**:
* Use Python libraries like BeautifulSoup, requests, or Scrapy for web scraping.
* For each URL in the input data:
  * Fetch the webpage content.
  * Use HTML parsing to extract the article title and text.
  * Avoid extracting header, footer, and other unwanted elements.
  * Save the extracted article text in a text file with URL_ID as the filename.

**Text Analysis**:

**Input Data**:

* Utilize the extracted article texts for analysis.
* Textual Analysis:
* Perform the following text analysis tasks for each article:
* Tokenization: Split text into sentences and words.

**Sentiment Analysis:**

* Calculate Positive Score and Negative Score based on predefined positive and negative word lists.
* Compute Polarity Score as (Positive Score - Negative Score) / (Positive Score + Negative Score + 1e-9).
* Determine Subjectivity Score based on the ratio of Polarity Score to maximum possible Polarity.

**Sentence Analysis:**
* Calculate Average Sentence Length by dividing the total number of words by the number of sentences.
* Identify Complex Words based on predefined criteria.
* Compute Percentage of Complex Words as (Complex Word Count / Total Word Count) × 100.
* Determine FOG Index as 0.4 × (Average Sentence Length + Percentage of Complex Words).

**Word Analysis**:

* Calculate Average Number of Words Per Sentence.
* Compute Complex Word Count based on predefined criteria.
* Determine Word Count.
* Calculate Syllables Per Word using syllable count algorithms.
* Identify Personal Pronouns (I, me, my, etc.).
* Compute Average Word Length.

**Output Creation:**

* Prepare the output data structure as per the "Output Data Structure.xlsx" file.
* Each row will represent an article with its URL_ID and extracted variables.
* Write the computed variables for each article into the respective columns.