In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Seconds to wait on a server before giving up on a URL. Without a timeout a
# single stalled host would hang the whole scraping run.
REQUEST_TIMEOUT = 30

# Read the input file (expects 'URL_ID' and 'URL' columns).
df = pd.read_excel('/content/Input.xlsx')

# Fetch every URL and save "<title>\n<paragraph text>" to '<URL_ID>.txt'.
for _, row in df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx as failures instead of scraping the error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Keep going with the remaining URLs. An empty file is still written
        # below so that there is exactly one file per URL_ID for later steps.
        print(f'Could not fetch {url_id} ({url}): {exc}')
        article_title = ''
        article_text = ''
    else:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the article title. The page may have no <title>, and
        # `.string` is None for nested/empty titles, so use get_text().
        # Whitespace is collapsed so the title always occupies exactly one
        # line of the output file.
        title_tag = soup.title
        article_title = ' '.join(title_tag.get_text().split()) if title_tag is not None else ''

        # Extract the article text: one line per <p>. Joining with a separator
        # keeps the last word of a paragraph from fusing with the first word
        # of the next one.
        article_text = '\n'.join(tag.get_text() for tag in soup.find_all('p'))

    # Save the extracted article; explicit UTF-8 so non-ASCII text is written
    # the same way on every platform.
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as f:
        f.write(article_title + '\n' + article_text)

In [11]:
!pip install textstat

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
#Data Analysis
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease, textstat
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')


def _is_word(token):
    """Return True when a token contains at least one letter or digit.

    nltk.word_tokenize emits punctuation marks as separate tokens; those must
    not be counted as words.
    """
    return any(ch.isalnum() for ch in token)


# Read the output structure file
output_df = pd.read_excel('/content/Output Data Structure.xlsx')

# The analyzer is stateless across texts, so build it once instead of once
# per article.
sia = SentimentIntensityAnalyzer()

# Compute the text metrics for every URL_ID listed in the output file.
for index, row in output_df.iterrows():
    url_id = row['URL_ID']
    try:
        with open(f'{url_id}.txt', 'r', encoding='utf-8') as f:
            text = f.read()
    except FileNotFoundError:
        # No saved article for this row: leave its metrics empty and carry on
        # rather than aborting the whole run.
        print(f'No article file found for {url_id}; skipping.')
        continue

    # The first line of the file is the article title; everything after it is
    # the article body, flattened onto a single line.
    _, _, body = text.partition('\n')
    article_text = body.replace('\n', ' ')

    # Sentiment (VADER).
    sentiment = sia.polarity_scores(article_text)
    positive_score = sentiment['pos']
    negative_score = sentiment['neg']
    polarity_score = sentiment['compound']
    # NOTE(review): VADER's compound score lies in [-1, 1], so this value lies
    # in [0, 2] and is not a conventional subjectivity measure. Formula kept
    # as-is; confirm the intended definition.
    subjectivity_score = 1 - polarity_score

    # Sentence statistics (punctuation tokens are not counted as words).
    sentences = nltk.sent_tokenize(article_text)
    sentence_lengths = [
        sum(1 for token in nltk.word_tokenize(sentence) if _is_word(token))
        for sentence in sentences
    ]
    avg_sentence_length = (
        sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0
    )
    # NOTE(review): an average-words-per-sentence value used to be computed
    # here but was never written to the output; add a column assignment if
    # the output structure expects one.

    # Word statistics.
    tokens = nltk.word_tokenize(article_text)
    words = [token for token in tokens if _is_word(token)]
    word_count = len(words)
    # "Complex" words are those with more than two syllables.
    complex_word_count = sum(1 for word in words if textstat.syllable_count(word) > 2)
    percentage_complex_words = complex_word_count / word_count * 100 if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    syllable_count = textstat.syllable_count(article_text)
    syllable_per_word = syllable_count / word_count if word_count else 0
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0

    # Tag the full token stream (punctuation included) so the tagger sees the
    # original context, then count personal-pronoun ('PRP') tags.
    personal_pronouns = sum(1 for _, tag in nltk.pos_tag(tokens) if tag == 'PRP')

    # Store the computed variables in this row of the output frame.
    metrics = {
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Avg Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'FOG Index': fog_index,
        'Complex Word Count': complex_word_count,
        'Word Count': word_count,
        'Syllable per Word': syllable_per_word,
        'Personal Pronouns': personal_pronouns,
        'Avg Word Length': avg_word_length,
    }
    for column, value in metrics.items():
        output_df.loc[index, column] = value

# Write the workbook once, after every row has been processed, instead of
# rewriting the whole file on each iteration.
output_df.to_excel('/content/Output Data Structure.xlsx', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
