<a href="https://colab.research.google.com/github/rgunasree/gunasree/blob/main/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import urllib.request
from bs4 import BeautifulSoup
import requests

# Define a function to extract text from a URL
def extract_text_from_url(url, timeout=10):
    """Download a page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host, stalling the whole batch.

    Returns:
        The page's text fragments (whitespace-stripped) joined by single
        spaces, or ``None`` when the page could not be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        return ' '.join(soup.stripped_strings)
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort the batch, so
        # the failure is reported and signalled to the caller with None.
        print(f"Error retrieving text from URL: {e}")
        return None

# Define a function to compute derived variables
def _count_syllables(word):
    """Estimate the syllables in a lowercase alphabetic word.

    Heuristic: each run of consecutive vowels (a, e, i, o, u, y) counts as
    one syllable, a trailing "e" is treated as silent, and every word has at
    least one syllable.
    """
    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel
    if word.endswith("e"):
        count -= 1
    return max(count, 1)


def _safe_divide(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def compute_derived_variables(text):
    """Compute sentiment and readability metrics for a piece of text.

    Args:
        text: The text to analyse. ``None``, empty, or whitespace-only input
            is accepted and produces all-zero metrics instead of raising.

    Returns:
        A dict keyed by metric name: 'Positive Score', 'Negative Score',
        'Polarity Score', 'Subjectivity Score', 'Average Sentence Length',
        'Percentage of Complex Words', 'Fog Index',
        'Average Number of Words Per Sentence', 'Complex Word Count',
        'Word Count', 'Syllables Per Word', 'Personal Pronouns' and
        'Average Word Length'. Ratios are 0.0 when there are no words or
        sentences to divide by.
    """
    if text and text.strip():
        # Words are the lowercase, purely alphabetic tokens.
        cleaned_words = [
            token for token in word_tokenize(text.lower()) if token.isalpha()
        ]
        sentences = sent_tokenize(text)
        # Sentence length counts every token (punctuation included).
        sentence_token_total = sum(
            len(word_tokenize(sentence)) for sentence in sentences
        )
        # Run the sentiment analysis once and reuse the result.
        sentiment = TextBlob(text).sentiment
        polarity_score = sentiment.polarity
        subjectivity_score = sentiment.subjectivity
    else:
        # Nothing to analyse (e.g. the page could not be fetched).
        cleaned_words = []
        sentences = []
        sentence_token_total = 0
        polarity_score = 0.0
        subjectivity_score = 0.0

    # Split the signed polarity into non-negative positive/negative parts.
    # The negative score was previously a copy of the subjectivity score.
    positive_score = max(polarity_score, 0.0)
    negative_score = abs(min(polarity_score, 0.0))

    word_count = len(cleaned_words)
    sentence_count = len(sentences)

    syllable_counts = [_count_syllables(word) for word in cleaned_words]

    # Complex words are those with more than two syllables.
    complex_word_count = sum(1 for count in syllable_counts if count > 2)

    # NOTE(review): words are already lowercased, so the country name "US"
    # is counted as the pronoun "us" — confirm whether that is intended.
    personal_pronouns = {'i', 'we', 'my', 'ours', 'us'}
    personal_pronouns_count = sum(
        1 for word in cleaned_words if word in personal_pronouns
    )

    total_characters = sum(len(word) for word in cleaned_words)
    average_word_length = _safe_divide(total_characters, word_count)

    average_sentence_length = _safe_divide(sentence_token_total, sentence_count)
    percentage_of_complex_words = _safe_divide(complex_word_count, word_count) * 100
    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)
    average_words_per_sentence = _safe_divide(word_count, sentence_count)
    syllables_per_word = _safe_divide(sum(syllable_counts), word_count)

    return {
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Average Sentence Length': average_sentence_length,
        'Percentage of Complex Words': percentage_of_complex_words,
        'Fog Index': fog_index,
        'Average Number of Words Per Sentence': average_words_per_sentence,
        'Complex Word Count': complex_word_count,
        'Word Count': word_count,
        'Syllables Per Word': syllables_per_word,
        'Personal Pronouns': personal_pronouns_count,
        'Average Word Length': average_word_length,
    }

# Load the Input.xlsx file; it must provide 'URL_ID' and 'URL' columns.
input_df = pd.read_excel('Input.xlsx')

# Iterate over each URL and compute derived variables
results_list = []
for _, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Extract text from the URL (None signals that the fetch failed)
    text = extract_text_from_url(url)

    if text is None or not text.strip():
        # Keep the row so every input URL appears in the output; the metric
        # columns are left blank because there is nothing to analyse.
        print(f"No text available for URL_ID {url_id}; metrics left blank.")
        derived_variables = {}
    else:
        derived_variables = compute_derived_variables(text)

    derived_variables['URL_ID'] = url_id
    derived_variables['URL'] = url
    results_list.append(derived_variables)

# Create a DataFrame for the results; the column list fixes the output order
# and fills any metric missing from a row with NaN.
results_columns = [
    'URL_ID',
    'URL',
    'Positive Score',
    'Negative Score',
    'Polarity Score',
    'Subjectivity Score',
    'Average Sentence Length',
    'Percentage of Complex Words',
    'Fog Index',
    'Average Number of Words Per Sentence',
    'Complex Word Count',
    'Word Count',
    'Syllables Per Word',
    'Personal Pronouns',
    'Average Word Length',
]
df_results = pd.DataFrame(results_list, columns=results_columns)

# Save the results to an Excel file
df_results.to_excel('Output Data Structure.xlsx', index=False)

print("Results saved to 'Output Data Structure.xlsx'.")

Results saved to 'Output Data Structure.xlsx'.
