In [1]:
pip install selenium


Note: you may need to restart the kernel to use updated packages.


In [5]:
import selenium
print(selenium.__version__)


4.28.1


In [2]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict

# Install NLTK datasets
nltk.download("punkt")
nltk.download("cmudict")
pron_dict = cmudict.dict()

# Load input file
df = pd.read_excel("Input.xlsx")

# Function to extract text using BeautifulSoup
def extract_text_bs4(url):
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return None, None
        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.find("h1").text.strip() if soup.find("h1") else "No Title"
        paragraphs = soup.find_all("p")
        article_text = " ".join([p.text.strip() for p in paragraphs])
        return title, article_text
    except:
        return None, None

# Function to analyze text
def analyze_text(text):
    blob = TextBlob(text)
    
    # Sentiment Analysis
    positive_score = sum(1 for word in blob.words if TextBlob(word).sentiment.polarity > 0)
    negative_score = sum(1 for word in blob.words if TextBlob(word).sentiment.polarity < 0)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    # Readability & Word Analysis
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    complex_words = sum(1 for word in words if len(pron_dict.get(word.lower(), [[word]])[0]) > 2)
    syllables_per_word = sum(len(pron_dict.get(word.lower(), [[word]])[0]) for word in words) / len(words)
    avg_sentence_length = len(words) / len(sentences)
    percentage_complex_words = (complex_words / len(words)) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_word_length = sum(len(word) for word in words) / len(words)
    
    # Count Personal Pronouns
    personal_pronouns = sum(1 for word in words if word.lower() in ["i", "we", "my", "ours", "us"])

    return {
        "Positive Score": positive_score,
        "Negative Score": negative_score,
        "Polarity Score": polarity_score,
        "Subjectivity Score": subjectivity_score,
        "Avg Sentence Length": avg_sentence_length,
        "Percentage of Complex Words": percentage_complex_words,
        "Fog Index": fog_index,
        "Avg Number of Words Per Sentence": avg_sentence_length,
        "Complex Word Count": complex_words,
        "Word Count": len(words),
        "Syllable Per Word": syllables_per_word,
        "Personal Pronouns": personal_pronouns,
        "Avg Word Length": avg_word_length
    }

# Extract data and analyze
results = []

for index, row in df.iterrows():
    url_id = row["URL_ID"]
    url = row["URL"]
    print(f"Processing {url_id}: {url}")

    # Extract text using BeautifulSoup
    title, article = extract_text_bs4(url)

    if title and article:
        # Save text to file
        with open(f"{url_id}.txt", "w", encoding="utf-8") as file:
            file.write(title + "\n" + article)

        # Perform text analysis
        analysis_result = analyze_text(article)
        analysis_result["URL_ID"] = url_id
        analysis_result["URL"] = url

        results.append(analysis_result)

# Convert results to DataFrame
df_results = pd.DataFrame(results)

# Save to Output.xlsx
df_results.to_excel("Output.xlsx", index=False)

print("Processing complete! Output saved to Output.xlsx")


[nltk_data] Downloading package punkt to C:\Users\sonu
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to C:\Users\sonu
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


Processing Netclan20241017: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
Processing Netclan20241018: https://insights.blackcoffer.com/enhancing-front-end-features-and-functionality-for-improved-user-experience-and-dashboard-accuracy-in-partner-hospital-application/
Processing Netclan20241019: https://insights.blackcoffer.com/roas-dashboard-for-campaign-wise-google-ads-budget-tracking-using-google-ads-ap/
Processing Netclan20241020: https://insights.blackcoffer.com/efficient-processing-and-analysis-of-financial-data-from-pdf-files-addressing-formatting-inconsistencies-and-ensuring-data-integrity-for-a-toyota-dealership-management-firm/
Processing Netclan20241021: https://insights.blackcoffer.com/development-of-ea-robot-for-automated-trading/
Processing Netclan20241022: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscr

Processing Netclan20241073: https://insights.blackcoffer.com/golden-record-a-knowledge-graph-database-approach-to-unfold-discovery-using-neo4j/
Processing Netclan20241074: https://insights.blackcoffer.com/advanced-ai-for-trading-automation/
Processing Netclan20241075: https://insights.blackcoffer.com/create-a-knowledge-graph-to-provide-real-time-analytics-recommendations-and-a-single-source-of-truth/
Processing Netclan20241076: https://insights.blackcoffer.com/advanced-ai-for-thermal-person-detection/
Processing Netclan20241077: https://insights.blackcoffer.com/advanced-ai-for-road-cam-threat-detection/
Processing Netclan20241078: https://insights.blackcoffer.com/advanced-ai-for-pedestrian-crossing-safety/
Processing Netclan20241079: https://insights.blackcoffer.com/handgun-detection-using-yolo/
Processing Netclan20241080: https://insights.blackcoffer.com/using-graph-technology-to-create-single-customer-view/
Processing Netclan20241081: https://insights.blackcoffer.com/car-detection-in

Processing Netclan20241141: https://insights.blackcoffer.com/immigration-datawarehouse-ai-based-recommendations/
Processing Netclan20241142: https://insights.blackcoffer.com/lipsync-automation-for-celebrities-and-influencers/
Processing Netclan20241143: https://insights.blackcoffer.com/key-audit-matters-predictive-modeling/
Processing Netclan20241144: https://insights.blackcoffer.com/splitting-of-songs-into-its-vocals-and-instrumental/
Processing Netclan20241145: https://insights.blackcoffer.com/ai-and-ml-technologies-to-evaluate-learning-assessments/
Processing Netclan20241146: https://insights.blackcoffer.com/datawarehouse-and-recommendations-engine-for-airbnb/
Processing Netclan20241147: https://insights.blackcoffer.com/real-estate-data-warehouse/
Processing Netclan20241148: https://insights.blackcoffer.com/traction-dashboards-of-marketing-campaigns-and-posts/
Processing Netclan20241149: https://insights.blackcoffer.com/google-local-service-ads-lsa-data-warehouse/
Processing Netclan

In [6]:
import pandas as pd

# Load the output file
df_output = pd.read_excel("Output.xlsx")

# Display first 5 rows
print(df_output.head())


Empty DataFrame
Columns: []
Index: []


In [7]:
# Print column names
print(df_output.columns)


Index([], dtype='object')


In [4]:
import pandas as pd

df = pd.read_excel("Output_Corrected.xlsx")

# Check if any missing values
print(df.isnull().sum())

# Check if any Word Count is 0 (indicating failed extraction)
print("Articles with 0 words:", df[df["Word Count"] == 0])

 

URL_ID                              0
URL                                 0
Positive Score                      0
Negative Score                      0
Polarity Score                      0
Subjectivity Score                  0
Avg Sentence Length                 0
Percentage of Complex Words         0
Fog Index                           0
Avg Number of Words Per Sentence    0
Complex Word Count                  0
Word Count                          0
Syllable Per Word                   0
Personal Pronouns                   0
Avg Word Length                     0
dtype: int64
Articles with 0 words: Empty DataFrame
Columns: [URL_ID, URL, Positive Score, Negative Score, Polarity Score, Subjectivity Score, Avg Sentence Length, Percentage of Complex Words, Fog Index, Avg Number of Words Per Sentence, Complex Word Count, Word Count, Syllable Per Word, Personal Pronouns, Avg Word Length]
Index: []
