In [1]:
!pip install pandas nltk

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import requests
import re
# Fetch the tokenizer models and the stop-word list used by the helper functions.
# Recent NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of
# 'punkt', so both are requested to keep word_tokenize/sent_tokenize working
# on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SHUBHAM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHUBHAM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
def extract_article_text(url, timeout=30):
    """Download a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive host.

    Returns:
        The text of every <p> element joined with newlines, with leading and
        trailing whitespace stripped. An empty string if the page has none.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.

    Note:
        The HTTP status code is not checked, so the paragraphs of an error
        page are returned like those of any other page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove scripts and styles from the HTML before collecting paragraphs.
    for tag in soup(['script', 'style']):
        tag.extract()

    # Join once instead of growing a string paragraph by paragraph.
    paragraphs = [paragraph.get_text() for paragraph in soup.find_all('p')]
    return "\n".join(paragraphs).strip()

In [4]:
def compute_word_count(text):
    """Return the number of tokens word_tokenize produces for ``text``.

    Every token is counted; no filtering is applied here.
    """
    return len(word_tokenize(text))

In [5]:
def compute_unique_word_count(text):
    """Return the number of distinct tokens in ``text``.

    Tokens are compared exactly as produced by word_tokenize, so the
    comparison is case-sensitive.
    """
    distinct_tokens = {token for token in word_tokenize(text)}
    return len(distinct_tokens)

In [6]:
def compute_stopword_count(text):
    """Return how many tokens of ``text`` are English stop words.

    Each token is lower-cased before it is looked up in NLTK's English
    stop-word list; repeated occurrences are each counted.
    """
    tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))
    matches = [token for token in tokens if token.lower() in english_stopwords]
    return len(matches)

In [7]:
def compute_most_common_words(text, n=10):
    """Return the ``n`` most frequent tokens of ``text`` as one string.

    Args:
        text: Text to tokenize with word_tokenize.
        n: How many of the most frequent tokens to include.

    Returns:
        The tokens, most frequent first, joined with ', '.
    """
    frequencies = FreqDist(word_tokenize(text))
    top_tokens = (token for token, _count in frequencies.most_common(n))
    return ', '.join(top_tokens)

In [8]:
def compute_average_word_length(text):
    """Return the mean number of characters per token in ``text``.

    Args:
        text: Text to tokenize with word_tokenize.

    Returns:
        Total token length divided by the number of tokens, or 0.0 when the
        text yields no tokens (for example, an empty string).
    """
    tokens = word_tokenize(text)
    # Guard against dividing by zero when there is nothing to measure.
    if not tokens:
        return 0.0
    return sum(len(token) for token in tokens) / len(tokens)

In [9]:
def compute_average_sentence_length(text):
    """Return the mean number of tokens per sentence in ``text``.

    Args:
        text: Text to split with nltk.sent_tokenize; each sentence is then
            tokenized with word_tokenize.

    Returns:
        Total token count divided by the number of sentences, or 0.0 when the
        text yields no sentences (for example, an empty string).
    """
    sentences = nltk.sent_tokenize(text)
    # Guard against dividing by zero when there is nothing to measure.
    if not sentences:
        return 0.0
    total_tokens = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return total_tokens / len(sentences)

In [10]:
# Load the list of pages to analyse from input.xlsx into a DataFrame.
# The processing loop reads the address of each page from its 'URL' column.
input_file = "input.xlsx"
df = pd.read_excel(input_file)

In [11]:
# Column-oriented accumulator: one independent, initially empty list per
# output column, filled row by row while the URLs are processed.
output_structure = {
    column: []
    for column in (
        'URL_ID',
        'Word_Count',
        'Unique_Word_Count',
        'Stopword_Count',
        'Most_Common_Words',
        'Average_Word_Length',
        'Average_Sentence_Length',
    )
}

In [12]:
# Process each URL: fetch the article text and compute its metrics.

# Empty the accumulators first so that re-running this cell rebuilds the
# results instead of appending a second copy of every row.
for column_values in output_structure.values():
    column_values.clear()

for url in df['URL']:
    # Identifier: the URL with every non-word character removed
    # (only letters, digits and underscores are kept).
    url_id = re.sub(r'\W+', '', url)

    article_text = extract_article_text(url)

    # Compute every metric before appending any of them, so a failure part
    # way through a row cannot leave the columns with different lengths.
    row_metrics = {
        'URL_ID': url_id,
        'Word_Count': compute_word_count(article_text),
        'Unique_Word_Count': compute_unique_word_count(article_text),
        'Stopword_Count': compute_stopword_count(article_text),
        'Most_Common_Words': compute_most_common_words(article_text),
        'Average_Word_Length': compute_average_word_length(article_text),
        'Average_Sentence_Length': compute_average_sentence_length(article_text),
    }

    # Append the computed values to the output data structure
    for column, value in row_metrics.items():
        output_structure[column].append(value)

In [13]:
# Build a DataFrame from the accumulated per-URL columns and write it to
# output.xlsx; index=False keeps the DataFrame's row index out of the sheet.
output_df = pd.DataFrame(output_structure)
output_df.to_excel('output.xlsx', index=False)