In [1]:
import pandas as pd
import os
import re
from bs4 import BeautifulSoup
import requests
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

In [11]:
# Read the input workbook; the loop further down uses its URL_ID and URL columns.
data = pd.read_excel(io='Input.xlsx')
# Preview the first five rows as the cell's displayed output.
data.head(n=5)

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


In [3]:
# Directory that receives one '<URL_ID>.txt' file per successfully scraped article.
output_directry = 'extracted_items'
# Create it up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(name=output_directry, exist_ok=True)

In [4]:
# Workbook describing the expected output layout.
# NOTE(review): `output_dt` is reassigned to a fresh DataFrame in the results
# cell below, so this frame is presumably only loaded for reference — confirm.
output_strcture_file = 'Output Data Structure.xlsx'
output_dt = pd.read_excel(io=output_strcture_file)

In [5]:
# Load the previously extracted text files into `extracted_txts`.
#
# Files are read in sorted filename order so the list is reproducible across
# runs (os.listdir returns entries in arbitrary order), and entries that are
# not regular files (e.g. a subdirectory) are skipped instead of crashing open().
#
# NOTE(review): this reads from 'extracted_text', whereas the scraping cell
# writes its files to `output_directry` ('extracted_items') — confirm which
# directory is intended.
extracted_text_directory = 'extracted_text'
extracted_txts = []
for filename in sorted(os.listdir(extracted_text_directory)):
    file_path = os.path.join(extracted_text_directory, filename)
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        extracted_txts.append(file.read())

In [6]:
# One accumulator list per output metric. The scraping loop appends exactly one
# value per input row to each list, so they stay aligned with `data`.
# The generator yields a fresh, independent list for every name.
(
    positive_scores,
    negative_scores,
    polarity_scores,
    subjectivity_scores,
    avg_sentence_lengths,
    percentage_complex_words,
    fog_indexes,
    avg_words_per_sentence,
    complex_word_counts,
    word_counts,
    syllables_per_word,
    personal_pronouns,
    avg_word_lengths,
) = ([] for _ in range(13))

In [8]:
# Scrape every article listed in `data`, save its text under `output_directry`,
# and append exactly one value per article to each metric list.
#
# Robustness notes:
#   * The metric lists are cleared first, so re-running this cell does not
#     stack a second batch of values on top of the first (which would make the
#     lists longer than `data` and break the DataFrame construction below).
#   * Network errors and timeouts are treated like a non-200 response: the row
#     gets zero-filled metrics instead of aborting the whole run.
#   * All metrics for an article are computed before anything is appended, so
#     the lists can never end up with different lengths.

# requests applies no timeout by default; without one a stalled server hangs the loop.
REQUEST_TIMEOUT_SECONDS = 30
# Tags whose text is concatenated to form the article body.
TEXT_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
# First-person pronouns counted for the PERSONAL PRONOUNS metric.
# NOTE(review): tokens are lower-cased before matching, so the token "US"
# is counted as the pronoun "us" — confirm whether that is intended.
PERSONAL_PRONOUNS = {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}
# Loop-invariant, so built once instead of once per article.
stopwords_set = set(stopwords.words('english'))

# Metric name -> accumulator list (the lists themselves are created in an earlier cell).
metric_lists = {
    'positive_score': positive_scores,
    'negative_score': negative_scores,
    'polarity_score': polarity_scores,
    'subjectivity_score': subjectivity_scores,
    'avg_sentence_length': avg_sentence_lengths,
    'percentage_complex_words': percentage_complex_words,
    'fog_index': fog_indexes,
    'avg_words_per_sentence': avg_words_per_sentence,
    'complex_word_count': complex_word_counts,
    'word_count': word_counts,
    'syllables_per_word': syllables_per_word,
    'personal_pronouns': personal_pronouns,
    'avg_word_length': avg_word_lengths,
}
for metric_values in metric_lists.values():
    metric_values.clear()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


def _fetch_article(url):
    """Download `url` and return (title, body_text), or None if it could not be retrieved.

    None is returned for request errors (connection failure, timeout, invalid
    URL) and for any HTTP status other than 200. The title is '' when the page
    has no <title> tag.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f'Request error for {url}: {exc}')
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag is not None else ''
    body_text = ''.join(tag.get_text() + '\n' for tag in soup.find_all(TEXT_TAGS))
    return title, body_text


def _analyze_text(article_text):
    """Compute all metrics for one article and return them keyed like `metric_lists`.

    Every ratio falls back to 0 when the text yields no words or no sentences,
    so an empty page produces zeros instead of a ZeroDivisionError.
    """
    sentiment = TextBlob(article_text).sentiment
    polarity = sentiment.polarity

    sentences = sent_tokenize(article_text)
    words = word_tokenize(article_text)
    word_count = len(words)
    words_per_sentence = _safe_ratio(word_count, len(sentences))

    # NOTE(review): "complex" here means "token is not an NLTK English stopword";
    # it is not a syllable-based definition — confirm this is the intended metric.
    complex_word_count = sum(1 for word in words if word.lower() not in stopwords_set)
    percentage_complex = _safe_ratio(complex_word_count, word_count) * 100

    return {
        # Polarity is split into a non-negative positive part and negative part.
        'positive_score': max(polarity, 0),
        'negative_score': max(-polarity, 0),
        'polarity_score': polarity,
        'subjectivity_score': sentiment.subjectivity,
        'avg_sentence_length': words_per_sentence,
        'percentage_complex_words': percentage_complex,
        'fog_index': 0.4 * (words_per_sentence + percentage_complex),
        'avg_words_per_sentence': words_per_sentence,
        'complex_word_count': complex_word_count,
        'word_count': word_count,
        # Approximation: one syllable per three characters.
        'syllables_per_word': _safe_ratio(sum(len(word) / 3 for word in words), word_count),
        'personal_pronouns': sum(1 for word in words if word.lower() in PERSONAL_PRONOUNS),
        'avg_word_length': _safe_ratio(sum(len(word) for word in words), word_count),
    }


for _, row in data.iterrows():
    url_id = row['URL_ID']
    article_url = row['URL']

    article = _fetch_article(article_url)
    if article is None:
        print(f'Failed to access {article_url}')
        # Keep the lists aligned with `data` by recording zeros for this row.
        metrics = dict.fromkeys(metric_lists, 0)
    else:
        article_title, article_text = article

        # Save the extracted text (title on the first line) to a file
        filename = os.path.join(output_directry, f'{url_id}.txt')
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(article_title + '\n')
            file.write(article_text)

        # The analysis covers the body text only, not the title.
        metrics = _analyze_text(article_text)
        print(f'Extracted and analyzed text for {url_id}')

    for metric_name, metric_values in metric_lists.items():
        metric_values.append(metrics[metric_name])

Extracted and analyzed text for 123.0
Extracted and analyzed text for 321.0
Extracted and analyzed text for 2345.0
Extracted and analyzed text for 4321.0
Extracted and analyzed text for 432.0
Extracted and analyzed text for 2893.8
Extracted and analyzed text for 3355.6
Extracted and analyzed text for 3817.4
Extracted and analyzed text for 4279.2
Extracted and analyzed text for 4741.0
Extracted and analyzed text for 5202.8
Extracted and analyzed text for 5664.6
Extracted and analyzed text for 6126.4
Extracted and analyzed text for 6588.2
Extracted and analyzed text for 7050.0
Extracted and analyzed text for 7511.8
Extracted and analyzed text for 7973.6
Extracted and analyzed text for 8435.4
Extracted and analyzed text for 8897.2
Extracted and analyzed text for 9359.0
Extracted and analyzed text for 9820.8
Extracted and analyzed text for 10282.6
Extracted and analyzed text for 10744.4
Extracted and analyzed text for 11206.2
Failed to access https://insights.blackcoffer.com/how-neural-net

In [9]:
# Assemble the results table in one step: the identifying columns come from
# `data`, followed by one column per computed metric (same order as before).
# Each metric list must hold exactly one value per row of `data`; pandas raises
# a ValueError otherwise.
output_dt = pd.DataFrame({
    'URL_ID': data['URL_ID'],
    'URL': data['URL'],
    'POSITIVE SCORE': positive_scores,
    'NEGATIVE SCORE': negative_scores,
    'POLARITY SCORE': polarity_scores,
    'SUBJECTIVITY SCORE': subjectivity_scores,
    'AVG SENTENCE LENGTH': avg_sentence_lengths,
    'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
    'FOG INDEX': fog_indexes,
    'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
    'COMPLEX WORD COUNT': complex_word_counts,
    'WORD COUNT': word_counts,
    'SYLLABLE PER WORD': syllables_per_word,
    'PERSONAL PRONOUNS': personal_pronouns,
    'AVG WORD LENGTH': avg_word_lengths,
})

In [10]:
# Write the results table to an Excel workbook, omitting the row index.
output_result_file = 'output_results.xlsx'
output_dt.to_excel(excel_writer=output_result_file, index=False)
