# Data Extraction

In [35]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Download every article listed in Input.xlsx and save its title and body text
# to Output/<URL_ID>.txt. The workbook must provide the URL_ID and URL columns
# read in the loop below.
input_file = pd.read_excel('Input.xlsx')

# create the output folder if it doesn't exist; exist_ok replaces the separate
# os.path.exists check
output_folder_Txt = 'Output'
os.makedirs(output_folder_Txt, exist_ok=True)

# Seconds to wait for a server before giving up. requests applies no timeout
# unless one is passed, so a single unresponsive host would otherwise stall
# the whole run.
request_timeout = 30

# loop through each URL in the input file
for index, row in input_file.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Start from empty values so that a file is still written when the download
    # fails or the expected elements are missing; the analysis cells open one
    # file per URL_ID.
    article_title = ''
    article_text = ''

    try:
        # raise_for_status turns HTTP error responses (4xx/5xx) into exceptions
        # so that error pages are not saved as if they were articles
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as error:
        # report the failure and carry on with the remaining URLs
        print(f'URL_ID {url_id}: could not download {url} ({error})')
    else:
        # parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # the title is the text of the first <h1> on the page
        title_element = soup.find('h1')
        if title_element is not None:
            article_title = title_element.text.strip()

        # the body is every <p> inside the first <div class="entry-content">,
        # one paragraph per line
        article_contents = soup.find('div', class_='entry-content')
        if article_contents is not None:
            article_text = ''.join(
                paragraph.text.strip() + '\n'
                for paragraph in article_contents.find_all('p')
            )
        else:
            # make a missing body visible instead of silently saving the title only
            print(f'URL_ID {url_id}: no entry-content block found; saving the title only')

    # save the title (first line) followed by the article text
    filename = os.path.join(output_folder_Txt, f'{url_id}.txt')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(article_title + '\n')
        f.write(article_text)


# Data Analysis

In [38]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Exploratory pass: print basic token statistics for every saved article.
# Relies on `input_file` from the extraction cell above.

# download stopwords and wordnet corpus
# (one-time setup: uncomment on a machine where these corpora are not installed yet)
#nltk.download('stopwords')
#nltk.download('wordnet')

# The stopword set and the lemmatizer do not depend on the article, so they are
# built once instead of once per URL.
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

# loop through each URL in the input file
for index, row in input_file.iterrows():
    url_id = row['URL_ID']

    # read the article text from the file
    filename = f'Output/{url_id}.txt'
    with open(filename, 'r', encoding='utf-8') as f:
        article_text = f.read()

    # tokenize once and reuse the result for every metric below
    tokens = word_tokenize(article_text)

    # skip articles with no tokens; this also avoids dividing by zero below
    if not tokens:
        continue

    # compute word count (word_tokenize also returns punctuation marks as tokens)
    word_count = len(tokens)

    # compute sentence count
    sentence_count = len(sent_tokenize(article_text))

    # compute average sentence length
    words_per_sentence = word_count / sentence_count

    # remove stopwords and lemmatize the article text
    filtered_tokens = [
        wordnet_lemmatizer.lemmatize(token.lower())
        for token in tokens
        if token.lower() not in stop_words
    ]

    # compute unique word count (distinct lemmas after stopword removal)
    unique_word_count = len(set(filtered_tokens))

    # compute lexical diversity: distinct non-stopword lemmas relative to all tokens
    lexical_diversity = unique_word_count / word_count

    # print the computed variables
    print(f'URL_ID: {url_id}')
    print(f'Word Count: {word_count}')
    print(f'Sentence Count: {sentence_count}')
    print(f'Words per Sentence: {words_per_sentence:.2f}')
    print(f'Unique Word Count: {unique_word_count}')
    print(f'Lexical Diversity: {lexical_diversity:.2f}')


URL_ID: 37
Word Count: 7
Sentence Count: 1
Words per Sentence: 7.00
Unique Word Count: 5
Lexical Diversity: 0.71
URL_ID: 38
Word Count: 10
Sentence Count: 1
Words per Sentence: 10.00
Unique Word Count: 4
Lexical Diversity: 0.40
URL_ID: 39
Word Count: 11
Sentence Count: 1
Words per Sentence: 11.00
Unique Word Count: 6
Lexical Diversity: 0.55
URL_ID: 40
Word Count: 11
Sentence Count: 1
Words per Sentence: 11.00
Unique Word Count: 6
Lexical Diversity: 0.55
URL_ID: 41
Word Count: 9
Sentence Count: 1
Words per Sentence: 9.00
Unique Word Count: 5
Lexical Diversity: 0.56
URL_ID: 42
Word Count: 11
Sentence Count: 1
Words per Sentence: 11.00
Unique Word Count: 6
Lexical Diversity: 0.55
URL_ID: 43
Word Count: 10
Sentence Count: 1
Words per Sentence: 10.00
Unique Word Count: 6
Lexical Diversity: 0.60
URL_ID: 45
Word Count: 8
Sentence Count: 1
Words per Sentence: 8.00
Unique Word Count: 5
Lexical Diversity: 0.62
URL_ID: 46
Word Count: 8
Sentence Count: 1
Words per Sentence: 8.00
Unique Word Count:

URL_ID: 122
Word Count: 7
Sentence Count: 1
Words per Sentence: 7.00
Unique Word Count: 4
Lexical Diversity: 0.57
URL_ID: 123
Word Count: 5
Sentence Count: 1
Words per Sentence: 5.00
Unique Word Count: 4
Lexical Diversity: 0.80
URL_ID: 124
Word Count: 10
Sentence Count: 1
Words per Sentence: 10.00
Unique Word Count: 7
Lexical Diversity: 0.70
URL_ID: 125
Word Count: 8
Sentence Count: 1
Words per Sentence: 8.00
Unique Word Count: 5
Lexical Diversity: 0.62
URL_ID: 126
Word Count: 11
Sentence Count: 1
Words per Sentence: 11.00
Unique Word Count: 7
Lexical Diversity: 0.64
URL_ID: 127
Word Count: 9
Sentence Count: 1
Words per Sentence: 9.00
Unique Word Count: 5
Lexical Diversity: 0.56
URL_ID: 128
Word Count: 5
Sentence Count: 1
Words per Sentence: 5.00
Unique Word Count: 3
Lexical Diversity: 0.60
URL_ID: 129
Word Count: 7
Sentence Count: 1
Words per Sentence: 7.00
Unique Word Count: 5
Lexical Diversity: 0.71
URL_ID: 130
Word Count: 5
Sentence Count: 1
Words per Sentence: 5.00
Unique Word Cou

In [None]:
# Compute each of the variables defined in the “Text Analysis.docx” file

In [40]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import syllables

# Compute the sentiment and readability variables for every saved article and
# collect them in `output_dict`, keyed by URL_ID.

# read the input Excel file
input_file = pd.read_excel('Input.xlsx')

# create a dictionary to store the output
output_dict = {}

# Loop-invariant resources: built once instead of once per article.
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()
personal_pronouns = {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# loop through each URL in the input file
for index, row in input_file.iterrows():
    url_id = row['URL_ID']

    # read the article text from the file
    filename = f'Output/{url_id}.txt'
    with open(filename, 'r', encoding='utf-8') as f:
        article_text = f.read()

    # Tokenize once. word_tokenize also returns punctuation marks as tokens, so
    # keep only tokens containing at least one letter or digit; otherwise
    # punctuation inflates the word count and skews every per-word average.
    tokens = word_tokenize(article_text)
    words = [token for token in tokens if any(ch.isalnum() for ch in token)]

    # compute word count
    word_count = len(words)

    # compute sentence count
    sentence_count = len(sent_tokenize(article_text))

    # compute average sentence length
    words_per_sentence = _safe_ratio(word_count, sentence_count)

    # remove stopwords and lemmatize the article text
    filtered_tokens = [
        wordnet_lemmatizer.lemmatize(word.lower())
        for word in words
        if word.lower() not in stop_words
    ]

    # unique word count and lexical diversity are computed for inspection only;
    # they are not part of the exported columns below
    unique_word_count = len(set(filtered_tokens))
    lexical_diversity = _safe_ratio(unique_word_count, word_count)

    # Estimate syllables once per word and reuse the counts below.
    syllable_counts = [syllables.estimate(word.lower()) for word in words]

    # Complex words are words with more than two syllables. Selecting them by
    # sentiment polarity, as before, measures opinion words rather than
    # complexity.
    complex_words = [
        word for word, syllable_count in zip(words, syllable_counts)
        if syllable_count > 2
    ]
    # kept as a fraction (0..1); it is scaled to a percentage inside the FOG formula
    percentage_complex_words = _safe_ratio(len(complex_words), word_count)

    # compute FOG index
    if sentence_count > 0:
        fog_index = 0.4 * (words_per_sentence + percentage_complex_words * 100)
    else:
        fog_index = 0

    # compute syllables per word: numerator and denominator now cover the same words
    syllables_per_word = _safe_ratio(sum(syllable_counts), word_count)

    # Count personal pronouns on the raw words. Counting after stopword removal
    # always gave (almost) zero because most of these pronouns are themselves
    # English stopwords. The all-caps token 'US' is skipped on the assumption
    # that it refers to the country rather than the pronoun.
    personal_pronoun_count = sum(
        1 for word in words
        if word != 'US' and word.lower() in personal_pronouns
    )

    # compute average word length over the same words that make up word_count
    avg_word_length = _safe_ratio(sum(len(word) for word in words), word_count)

    # compute positive score, negative score, polarity score, and subjectivity score using TextBlob
    blob = TextBlob(article_text)
    sentiment = blob.sentiment
    # the positive and negative scores are the two halves of the signed polarity
    positive_score = sentiment.polarity if sentiment.polarity > 0 else 0
    negative_score = -sentiment.polarity if sentiment.polarity < 0 else 0
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    # add the computed variables to the output dictionary
    output_dict[url_id] = {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': words_per_sentence,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': words_per_sentence,
        'COMPLEX WORD COUNT': len(complex_words),
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronoun_count,
        'AVG WORD LENGTH': avg_word_length
        }

In [42]:
# Output data structure

In [41]:
# Turn the per-URL metrics into a table: each key of output_dict becomes a row
# label and each metric name becomes a column.
output_df = pd.DataFrame.from_dict(data=output_dict, orient='index')

# Write the table to Output.xlsx, labelling the index column URL_ID.
output_df.to_excel(excel_writer='Output.xlsx', index_label='URL_ID')