In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import re

# Function to extract text from a URL
def extract_text_from_url(url):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(title, text)`` tuple where ``title`` is the stripped content of
        the page's ``<title>`` tag and ``text`` is the text of every ``<p>``
        tag joined with single spaces. Returns ``(None, None)`` if anything
        goes wrong (network error, HTTP error status, missing ``<title>``);
        the error is printed rather than raised.
    """
    try:
        # A timeout keeps a single unresponsive host from stalling the run.
        response = requests.get(url, timeout=30)
        # Fail on 4xx/5xx instead of analysing the server's error page.
        response.raise_for_status()
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # soup.title is None when the page has no <title>; the resulting
        # AttributeError is reported by the handler below.
        title = soup.title.text.strip()
        # Concatenate the text from all paragraph tags
        text = ' '.join(p.text for p in soup.find_all('p'))
        return title, text
    except Exception as e:
        print(f"An error occurred while extracting text from {url}: {e}")
        return None, None

# Function to perform text analysis
def perform_text_analysis(text):
    """Compute sentiment and readability metrics for a piece of text.

    Args:
        text: The text to analyse.

    Returns:
        A 13-tuple, in this order:
        ``(polarity_score, subjectivity_score, avg_sentence_length,
        percentage_complex_words, fog_index, avg_words_per_sentence,
        complex_word_count, word_count, syllable_per_word,
        personal_pronouns, avg_word_length, positive_score, negative_score)``.

        If the analysis fails for any reason (including empty text, which
        leads to a division by zero), the error is printed and a tuple of 13
        ``None`` values is returned, so callers can always unpack 13 names.
    """
    try:
        # Create a TextBlob object
        blob = TextBlob(text)
        # Sentiment analysis
        polarity_score = blob.sentiment.polarity
        subjectivity_score = blob.sentiment.subjectivity
        # Tokenization
        words = blob.words
        sentences = blob.sentences
        word_count = len(words)
        sentence_count = len(sentences)
        # Average sentence length, counted in whitespace-separated tokens
        avg_sentence_length = sum(len(sentence.split()) for sentence in sentences) / sentence_count
        # Count syllables once per word; reused for the complex-word and
        # syllables-per-word metrics below.
        syllables = [syllable_count(word) for word in words]
        # A word is "complex" when it has more than two syllables
        complex_word_count = sum(1 for n in syllables if n > 2)
        percentage_complex_words = (complex_word_count / word_count) * 100
        # Fog index
        fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
        # Average number of (TextBlob-tokenized) words per sentence
        avg_words_per_sentence = word_count / sentence_count
        # Syllables per word
        syllable_per_word = sum(syllables) / word_count
        # Personal pronouns count
        personal_pronouns = count_personal_pronouns(text)
        # Average word length in characters
        avg_word_length = sum(len(word) for word in words) / word_count
        # Positive / negative score: number of sentences with polarity above / below zero
        positive_score = sum(1 for sentence in sentences if sentence.sentiment.polarity > 0)
        negative_score = sum(1 for sentence in sentences if sentence.sentiment.polarity < 0)
        return polarity_score, subjectivity_score, avg_sentence_length, percentage_complex_words, \
               fog_index, avg_words_per_sentence, complex_word_count, word_count, syllable_per_word, \
               personal_pronouns, avg_word_length, positive_score, negative_score
    except Exception as e:
        print(f"An error occurred during text analysis: {e}")
        # Must match the 13-element success tuple above.
        return (None,) * 13

# Function to count syllables in a word
def syllable_count(word):
    """Estimate the number of syllables in a word.

    The heuristic counts groups of consecutive vowels (``a e i o u y``),
    subtracts one when the word ends in ``e``, and never reports fewer than
    one syllable for a non-empty word.

    Args:
        word: The word to examine; case is ignored.

    Returns:
        The estimated syllable count, or 0 for an empty string.
    """
    if not word:
        # Nothing to count; also avoids indexing word[0] on an empty string.
        return 0
    vowels = 'aeiouy'
    word = word.lower()
    # A leading vowel starts the first vowel group.
    count = 1 if word[0] in vowels else 0
    # Every vowel that follows a non-vowel starts a new vowel group.
    count += sum(
        1
        for previous, current in zip(word, word[1:])
        if current in vowels and previous not in vowels
    )
    # Discount a trailing 'e'.
    if word.endswith('e'):
        count -= 1
    # Every non-empty word has at least one syllable.
    return max(count, 1)

# Function to count personal pronouns in text
def count_personal_pronouns(text):
    """Count first- and second-person pronouns in the text.

    Tokens are matched with the regex ``\\b\\S+\\b`` and compared
    case-insensitively against a fixed pronoun set.

    Args:
        text: The text to scan.

    Returns:
        The number of tokens that are personal pronouns.
    """
    # All entries are lowercase because each token is lowercased before the
    # lookup; an uppercase 'I' entry would never match.
    pronouns = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves',
                'you', 'your', 'yours', 'yourself', 'yourselves'}
    return sum(1 for word in re.findall(r'\b\S+\b', text) if word.lower() in pronouns)

# Load URLs from input file (expects 'URL_ID' and 'URL' columns)
input_file = r'C:\Users\dhima\Desktop\Input.xlsx'
df_urls = pd.read_excel(input_file)

# One row per successfully analysed URL, in the order of output_columns below
output_data = []

# Iterate through each URL
for _, row in df_urls.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    print(f"Processing {url_id}...")

    # Extract text from the URL; skip the row if either part is missing/empty
    title, text = extract_text_from_url(url)
    if not (title and text):
        print(f"Error: Failed to extract text for {url_id}")
        continue

    # Check for failure BEFORE unpacking, so a failed analysis is reported
    # instead of raising on a tuple whose length does not match.
    metrics = perform_text_analysis(text)
    if any(value is None for value in metrics):
        print(f"Error: Text analysis failed for {url_id}")
        continue

    polarity_score, subjectivity_score, avg_sentence_length, percentage_complex_words, \
        fog_index, avg_words_per_sentence, complex_word_count, word_count, syllable_per_word, \
        personal_pronouns, avg_word_length, positive_score, negative_score = metrics

    # Append data to the output list
    output_data.append([
        url_id, url, title, positive_score, negative_score, polarity_score,
        subjectivity_score, avg_sentence_length, percentage_complex_words,
        fog_index, avg_words_per_sentence, complex_word_count, word_count,
        syllable_per_word, personal_pronouns, avg_word_length,
    ])

# Create DataFrame for output
output_columns = ["URL_ID", "URL", "Title", "Positive Score", "Negative Score", "POLARITY SCORE", "SUBJECTIVITY SCORE", "AVG SENTENCE LENGTH",
                  "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX", "AVG NUMBER OF WORDS PER SENTENCE",
                  "COMPLEX WORD COUNT", "WORD COUNT", "SYLLABLE PER WORD", "PERSONAL PRONOUNS",
                  "AVG WORD LENGTH"]
df_output = pd.DataFrame(output_data, columns=output_columns)

# Save output to Excel file (written to the current working directory)
output_file = "Output.xlsx"
df_output.to_excel(output_file, index=False)
print("Output saved successfully.")


Processing blackassign0001...
Processing blackassign0002...
Processing blackassign0003...
Processing blackassign0004...
Processing blackassign0005...
Processing blackassign0006...
Processing blackassign0007...
Processing blackassign0008...
Processing blackassign0009...
Processing blackassign0010...
Processing blackassign0011...
Processing blackassign0012...
Processing blackassign0013...
Processing blackassign0014...
Processing blackassign0015...
Processing blackassign0016...
Processing blackassign0017...
Processing blackassign0018...
Processing blackassign0019...
Processing blackassign0020...
Processing blackassign0021...
Processing blackassign0022...
Processing blackassign0023...
Processing blackassign0024...
Processing blackassign0025...
Processing blackassign0026...
Processing blackassign0027...
Processing blackassign0028...
Processing blackassign0029...
Processing blackassign0030...
Processing blackassign0031...
Processing blackassign0032...
Processing blackassign0033...
Processing