In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import nltk
import re
import openpyxl

In [3]:
from concurrent.futures import ThreadPoolExecutor
import os

# Read the spreadsheet that lists the articles to scrape
df = pd.read_excel('Input.xlsx')

# Pull the two columns used by the later cells out as plain Python lists
urls, url_ids = (df[column].tolist() for column in ('URL', 'URL_ID'))

# Function to extract text from a URL
def extract_text(url, url_id, timeout=30):
    """Download one article and save its title and paragraph text to disk.

    The page at ``url`` is fetched and parsed with BeautifulSoup. The text of
    the first ``<h1>`` element becomes the title ('No Title' when the page has
    no ``<h1>``), and the text of every ``<p>`` element is joined with single
    spaces to form the article body. Both are written to
    ``articles/<url_id>.txt`` as UTF-8, with the title on the first line.

    Args:
        url: Address of the page to download.
        url_id: Identifier used as the base name of the output file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Any exception raised while downloading, parsing or writing is
        caught and reported with ``print`` instead of being propagated.
    """
    try:
        # Without a timeout, a server that never answers would block this
        # call (and the worker thread running it) indefinitely.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures rather than saving the error page
        # as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the article title and text (look the heading up only once)
        heading = soup.find('h1')
        title = heading.get_text() if heading is not None else 'No Title'
        paragraphs = soup.find_all('p')
        article_text = ' '.join(para.get_text() for para in paragraphs)

        # Save to a text file
        with open(f'articles/{url_id}.txt', 'w', encoding='utf-8') as file:
            file.write(title + '\n' + article_text)

    except Exception as e:
        print(f"Error extracting {url}: {e}")

# Create a directory to save the articles
# exist_ok=True replaces the separate existence check: the call is a no-op
# when the directory is already there and cannot race with another creator.
os.makedirs('articles', exist_ok=True)

# Use ThreadPoolExecutor to speed up the process
with ThreadPoolExecutor(max_workers=10) as executor:
    # map() pairs urls[i] with url_ids[i] and stops at the shorter list.
    # Wrapping it in list() drains the result iterator, so an exception that
    # escapes a worker is raised here instead of being silently discarded.
    list(executor.map(extract_text, urls, url_ids))


In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

# Load positive and negative words
s)


[nltk_data] Downloading package punkt to C:\Users\Md
[nltk_data]     Rashid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
import re
from textblob import TextBlob
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the sentiment lexicons as sets of whitespace-separated tokens.
# Each file is read inside a `with` block so its handle is closed as soon as
# the contents have been consumed instead of being left open.
with open('positive-words.txt') as lexicon_file:
    positive_words = set(lexicon_file.read().split())
with open('negative-words.txt') as lexicon_file:
    negative_words = set(lexicon_file.read().split())

def analyze_text(text):
    """Compute sentiment and readability metrics for one piece of text.

    Words are the runs of word characters found by a regular expression;
    sentences come from ``nltk.sent_tokenize``. Sentiment scores count the
    words found in the module-level ``positive_words`` / ``negative_words``
    sets.

    Args:
        text: The text to analyse.

    Returns:
        A dict with these keys:
            word_count: number of words found.
            positive_score / negative_score: words present in each lexicon.
            polarity_score: (pos - neg) / (pos + neg + 0.000001).
            subjectivity_score: (pos + neg) / (word_count + 0.000001).
            avg_sentence_length: words per sentence (0 without sentences).
            percentage_of_complex_words: complex words / words, as a
                fraction (0 for empty text).
            fog_index: 0.4 * (avg_sentence_length + that fraction).
            complex_word_count: words with more than two vowel letters.
            syllables_per_word: average vowel letters per word.
            personal_pronouns: matches of I, we, my, ours, us (any case).
            avg_word_length: average characters per word.
    """
    words = re.findall(r'\w+', text)
    word_count = len(words)

    def in_lexicon(word, lexicon):
        # Accept the token as written or lowercased, so a capitalised token
        # such as "Good" still matches a lowercase lexicon entry.
        return word in lexicon or word.lower() in lexicon

    positive_score = sum(1 for word in words if in_lexicon(word, positive_words))
    negative_score = sum(1 for word in words if in_lexicon(word, negative_words))
    # The tiny constant keeps both ratios defined when a denominator is zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    sentences = nltk.sent_tokenize(text)
    avg_sentence_length = word_count / len(sentences) if sentences else 0

    def syllables_per_word(word):
        # Counts every vowel letter; adjacent vowels are counted separately.
        # NOTE(review): this is only a rough stand-in for a syllable count.
        return len([char for char in word if char in 'aeiouAEIOU'])

    syllable_counts = [syllables_per_word(word) for word in words]
    # A word is treated as "complex" when it has more than two vowel letters.
    complex_words_count = sum(1 for count in syllable_counts if count > 2)
    percentage_of_complex_words = complex_words_count / word_count if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

    # NOTE(review): because of re.I this also counts the all-caps token "US";
    # confirm whether that is intended.
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.I))
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0

    return {
        'word_count': word_count,
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score,
        'avg_sentence_length': avg_sentence_length,
        'percentage_of_complex_words': percentage_of_complex_words,
        'fog_index': fog_index,
        'complex_word_count': complex_words_count,
        'syllables_per_word': sum(syllable_counts) / word_count if word_count else 0,
        'personal_pronouns': personal_pronouns,
        'avg_word_length': avg_word_length
    }

# Example usage: metrics = analyze_text("We love this product. It works well.")





[nltk_data] Downloading package punkt to C:\Users\Md
[nltk_data]     Rashid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Md Rashid\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [10]:
# Compute the metrics for every saved article and export them to Excel.
# NOTE(review): the metric helpers called below (positive_score, fog_index, ...)
# are not defined in the cells visible here; they must already exist in the
# kernel when this cell runs - confirm where they come from.
results = []

for url_id in url_ids:
    try:
        # The article files are written with encoding='utf-8', so they must be
        # read back as UTF-8. Decoding them as latin-1 turns every non-ASCII
        # character into several junk characters, which distorts the
        # word-based metrics.
        with open(f'articles/{url_id}.txt', 'r', encoding='utf-8') as file:
            text = file.read()

        pos_score = positive_score(text)
        neg_score = negative_score(text)
        polarity = polarity_score(text)
        subjectivity = subjectivity_score(text)
        avg_sent_len = avg_sentence_length(text)
        percent_complex_words = percentage_complex_words(text)
        fog_idx = fog_index(text)
        avg_words_per_sentence = avg_number_of_words_per_sentence(text)
        complex_word_cnt = complex_word_count(text)
        word_cnt = word_count(text)
        syll_per_word = syllable_per_word(text)
        personal_pronoun_cnt = personal_pronouns(text)
        avg_word_len = avg_word_length(text)

        # One row per article, in the same order as the column list below.
        results.append([url_id, pos_score, neg_score, polarity, subjectivity, avg_sent_len,
                        percent_complex_words, fog_idx, avg_words_per_sentence,
                        complex_word_cnt, word_cnt, syll_per_word, personal_pronoun_cnt,
                        avg_word_len])
    except Exception as e:
        # A missing file or a failing metric skips this article only; the
        # remaining articles are still processed.
        print(f"Error processing {url_id}: {e}")

output_df = pd.DataFrame(results, columns=[
    'URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
    'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
])

output_df.to_excel('Output_Data_Structure.xlsx', index=False)


  output_df.to_excel('Output_Data_Structure.xlsx', index=False)
