In [1]:
# !pip install requests openpyxl html5lib bs4 pandas nltk textblob 

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import time
import logging
import re
import nltk
from textblob import TextBlob
from nltk.corpus import cmudict
from nltk.tokenize import sent_tokenize, word_tokenize



In [3]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('cmudict')
syllable_dict = cmudict.dict()

# Load stop words from the uploaded files
stop_words = set()
positive_words = set()
negative_words = set()

stop_lst = ['StopWords/StopWords_Auditor.txt', 'StopWords/StopWords_Names.txt',
            'StopWords/StopWords_DatesandNumbers.txt','StopWords/StopWords_Currencies.txt',
            'StopWords/StopWords_Generic.txt','StopWords/StopWords_GenericLong.txt',
            'StopWords/StopWords_Geographic.txt']

for filename in stop_lst:
    with open(filename, 'r') as file:
        stop_words.update(file.read().lower().splitlines())


with open('MasterDictionary/positive-words.txt', 'r') as file:
    positive_words.update(file.read().splitlines())

with open('MasterDictionary/negative-words.txt', 'r') as file:
    negative_words.update(file.read().splitlines())

# Define text analysis functions
def clean_text(text):
    cleaned_words=[]
    words = word_tokenize(text)
    for word in words:
        if word.isalpha() and (word.lower() not in stop_words):
            cleaned_words.append(word.lower())
    return cleaned_words

def syllable_count(word):
    word = word.lower()
    if word in syllable_dict:
        syllable_counts = [len([x for x in phoneme if x[-1].isdigit()]) for phoneme in syllable_dict[word]]
        return max(syllable_counts) if syllable_counts else 0 
    else:
        return len(re.findall(r'[aeiouy]+', word))  
    
def complex_word_count(words_list):
    return sum(1 for word in words_list if syllable_count(word) >= 3)

def personal_pronoun_count(text):
    return len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE))

def analyze_text(text):
    clean_words = clean_text(text)
    sentences = sent_tokenize(text)
    

    # Extracting Derived variables
    p_score = sum(1 for word in clean_words if word in positive_words)
    n_score = sum(1 for word in clean_words if word in negative_words)
    polarity_score = (p_score - n_score) / ((p_score + n_score) + 0.000001)
    subjectivity_score = (p_score + n_score) / (len(clean_words) + 0.000001)
    
    # Analysis of Readability
    avg_sentence_length = len(clean_words) / len(sentences)
    complex_words_count = complex_word_count(clean_words)
    percentage_complex_words = (complex_words_count / len(clean_words)) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    avg_words_per_sentence = len(clean_words) / len(sentences)
    
    word_count = len(clean_words)
    syllables_per_word = sum(syllable_count(word) for word in clean_words) / len(clean_words)
    avg_word_length = sum(len(word) for word in clean_words) / len(clean_words)
    personal_pronouns = personal_pronoun_count(text)
    
    return {
        "Positive Score": p_score,
        "Negative Score": n_score,
        "Polarity Score": polarity_score,
        "Subjectivity Score": subjectivity_score,
        "Avg Sentence Length": avg_sentence_length,
        "Percentage of Complex Words": percentage_complex_words,
        "Fog Index": fog_index,
        "Avg Words per Sentence": avg_words_per_sentence,
        "Complex Word Count": complex_words_count,
        "Word Count": word_count,
        "Syllable per Word": syllables_per_word,
        "Personal Pronouns": personal_pronouns,
        "Avg Word Length": avg_word_length
    }




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yashy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yashy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\yashy\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [4]:

extracted_data = []

# Set up logging
logging.basicConfig(filename='crawler.log', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')


df = pd.read_excel('Input.xlsx')

os.makedirs("extracted_txt_folder", exist_ok=True)

# Iterate through each row in the DataFrame
for _, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    for attempt in range(5):  # Retry for 5 times
        try:
            
            response = requests.get(url, timeout=30)
            response.raise_for_status()  
            
            
            soup = BeautifulSoup(response.text, 'html5lib')

           
            title = soup.find('h1')
            title_text = title.get_text(strip=True) if title else "No Title Found"

            
            article_content = soup.find('div', class_='td-post-content')
            if article_content:

                article_text_lines=[]

                paragraphs = article_content.find_all('p')

                for p in paragraphs:
                    if p.get_text(strip=True).lower().startswith("summarized"):
                            break
                    
                    article_text_lines.append(p.get_text(strip=True))

                article_text = "\n".join(article_text_lines)

            else:
                article_text = 'No Article Text Found'

            file_txt=f"extracted_txt_folder/{url_id}.txt"
            with open(file_txt, "w", encoding="utf-8") as file:
                file.write(f"Title: {title_text}\n\n")
                file.write(article_text)

            analysed_data = analyze_text(article_text)
            extracted_data.append({
                'URL_ID': url_id,
                'Title': title_text,
                 **analysed_data })
            

            logging.info(f"Article with URL_ID {url_id} saved successfully.")
            time.sleep(2)  #Fixed wait_time of 2 seconds after a successful request for handling errors(response time reset etc)
            break  

        except Exception as e:
            logging.error(f"Attempt {attempt + 1} failed for URL_ID {url_id}. Error: {e}")

            if attempt == 4:  # If it's the last attempt
                logging.error(f"Failed to retrieve article with URL_ID {url_id} after multiple attempts. Skipping this URL.")


In [5]:
# Create a new DataFrame from the extracted data
output_df = pd.DataFrame(extracted_data)

# Save the extracted data to a new Excel file
output_df.to_excel("Output Data Structure.xlsx", index=False)



In [6]:
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   URL_ID                       147 non-null    object 
 1   Title                        147 non-null    object 
 2   Positive Score               147 non-null    int64  
 3   Negative Score               147 non-null    int64  
 4   Polarity Score               147 non-null    float64
 5   Subjectivity Score           147 non-null    float64
 6   Avg Sentence Length          147 non-null    float64
 7   Percentage of Complex Words  147 non-null    float64
 8   Fog Index                    147 non-null    float64
 9   Avg Words per Sentence       147 non-null    float64
 10  Complex Word Count           147 non-null    int64  
 11  Word Count                   147 non-null    int64  
 12  Syllable per Word            147 non-null    float64
 13  Personal Pronouns   

In [9]:
output_df.tail()

Unnamed: 0,URL_ID,Title,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Avg Sentence Length,Percentage of Complex Words,Fog Index,Avg Words per Sentence,Complex Word Count,Word Count,Syllable per Word,Personal Pronouns,Avg Word Length
142,Netclan20241159,Population and Community Survey of America,12,9,0.142857,0.045752,13.5,34.858388,19.343355,13.5,160,459,2.313725,3,6.825708
143,Netclan20241160,Google LSA API Data Automation and Dashboarding,19,17,0.055556,0.049451,11.375,35.302198,18.670879,11.375,257,728,2.262363,7,6.671703
144,Netclan20241161,Healthcare Data Analysis,5,6,-0.090909,0.125,8.0,25.0,13.2,8.0,22,88,2.090909,11,6.579545
145,Netclan20241162,"Budget, Sales KPI Dashboard using Power BI",0,0,0.0,0.0,8.0,75.0,33.2,8.0,6,8,4.625,0,13.375
146,Netclan20241163,"Amazon Buy Bot, an Automation AI tool to Auto-...",2,0,1.0,0.037736,7.571429,35.849057,17.368194,7.571429,19,53,2.226415,1,6.603774
