<a href="https://colab.research.google.com/github/pushpendracse/DataAnalyst/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install pyphen



In [19]:
pip install textblob



In [24]:
# Dependencies for the scraping + text-analysis pipeline defined below.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from textblob import download_corpora
import textblob
# NOTE(review): the next line only references the module object; nothing is
# called, so no corpora are downloaded here. If TextBlob corpora are actually
# needed, an explicit call (e.g. download_corpora.download_lite()) was
# presumably intended -- confirm before changing.
textblob.download_corpora
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import pyphen
# Download the 'punkt_tab' NLTK resource at import time (network side effect).
# Presumably required by the NLTK tokenizers imported above -- verify.
nltk.download('punkt_tab')


# Lexicon of words counted towards POSITIVE_SCORE (matched in lower case).
positive_words = {
    "good",
    "happy",
    "excellent",
    "positive",
    "joy",
    "love",
    "fortunate",
    "pleasant",
    "great",
    "wonderful",
}
# Lexicon of words counted towards NEGATIVE_SCORE (matched in lower case).
negative_words = {
    "bad",
    "sad",
    "terrible",
    "negative",
    "anger",
    "hate",
    "unfortunate",
    "horrible",
    "poor",
    "awful",
}


# Shared English hyphenation dictionary; the metrics code counts the hyphens
# it inserts into a word to approximate that word's syllable count.
syllable_counter = pyphen.Pyphen(lang='en')


def extract_article(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Defaults to 30.

    Returns:
        A ``(title, article_text)`` tuple, where ``article_text`` is the text
        of every ``<p>`` element joined by single spaces. ``(None, None)`` is
        returned (after printing a message) when the request fails, the
        server answers with an HTTP error status, or the page has no
        ``<title>`` element.
    """
    try:
        # Without a timeout a single stalled server would hang the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures rather than scraping the error
        # page and storing it as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            # Report the real cause instead of an opaque AttributeError.
            print(f"Error fetching URL {url}: page has no <title> element")
            return None, None
        title = title_tag.get_text()
        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return title, article_text
    except Exception as e:
        # Deliberately broad: this is a best-effort batch step, so any
        # failure for one URL is reported and the caller moves on.
        print(f"Error fetching URL {url}: {e}")
        return None, None


def compute_text_metrics(text):
    """Compute sentiment and readability metrics for a piece of text.

    Args:
        text: The text to analyse.

    Returns:
        A dict with these keys:

        * ``POSITIVE_SCORE`` / ``NEGATIVE_SCORE``: number of TextBlob words
          found (case-insensitively) in ``positive_words`` /
          ``negative_words``.
        * ``POLARITY_SCORE`` / ``SUBJECTIVITY_SCORE``: TextBlob sentiment.
        * ``AVG_SENTENCE_LENGTH``: words per sentence.
        * ``PERCENTAGE_OF_COMPLEX_WORDS``: share (0-100) of words with more
          than two syllables.
        * ``FOG_INDEX``: ``0.4 * (AVG_SENTENCE_LENGTH +
          PERCENTAGE_OF_COMPLEX_WORDS)``.
        * ``SYLLABLE_PER_WORD``: mean syllable count per word.
        * ``WORD_COUNT``: number of words.
        * ``AVG_WORD_LENGTH``: mean number of characters per word.
        * ``PERSONAL_PRONOUNS``: occurrences of I / we / my / ours / us.

        Every ratio is 0 when the text contains no words or no sentences.
    """
    blob = TextBlob(text)
    # Lower-case once and reuse for both lexicon lookups.
    lowered_blob_words = [word.lower() for word in blob.words]
    positive_score = sum(1 for word in lowered_blob_words if word in positive_words)
    negative_score = sum(1 for word in lowered_blob_words if word in negative_words)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity

    sentences = sent_tokenize(text)
    # word_tokenize also emits punctuation tokens such as "," and "."; keep
    # only tokens with at least one letter or digit so punctuation does not
    # inflate the word count or skew the per-word averages.
    words = [
        token for token in word_tokenize(text)
        if any(char.isalnum() for char in token)
    ]
    word_count = len(words)

    # Hyphenate each word once; the hyphen count + 1 approximates syllables.
    syllable_counts = [
        syllable_counter.inserted(word).count('-') + 1 for word in words
    ]

    avg_sentence_length = word_count / len(sentences) if sentences else 0
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    percentage_complex_words = (complex_word_count / word_count * 100) if words else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    syllables_per_word = sum(syllable_counts) / word_count if words else 0
    avg_word_length = sum(len(word) for word in words) / word_count if words else 0

    pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE)
    # The case-insensitive match also hits upper-case "US", which in running
    # text is almost always the country abbreviation rather than the pronoun,
    # so those matches are not counted.
    personal_pronouns = sum(1 for match in pronoun_matches if match != "US")

    return {
        "POSITIVE_SCORE": positive_score,
        "NEGATIVE_SCORE": negative_score,
        "POLARITY_SCORE": polarity,
        "SUBJECTIVITY_SCORE": subjectivity,
        "AVG_SENTENCE_LENGTH": avg_sentence_length,
        "PERCENTAGE_OF_COMPLEX_WORDS": percentage_complex_words,
        "FOG_INDEX": fog_index,
        "SYLLABLE_PER_WORD": syllables_per_word,
        "WORD_COUNT": word_count,
        "AVG_WORD_LENGTH": avg_word_length,
        "PERSONAL_PRONOUNS": personal_pronouns
    }


# Load the list of pages to analyse; the sheet supplies 'URL' and 'URL_ID'.
input_file = pd.read_excel('/content/drive/MyDrive/Input.xlsx')
urls, url_ids = input_file['URL'], input_file['URL_ID']


# Stage 1: scrape every page and store "<title>\n\n<article>" in <URL_ID>.txt.
for url, url_id in zip(urls, url_ids):
    title, article = extract_article(url)
    if not (title and article):
        # Nothing usable came back for this URL; do not write a file.
        continue
    page_text = f"{title}\n\n{article}"
    with open(f"{url_id}.txt", "w", encoding="utf-8") as out_file:
        out_file.write(page_text)


# Stage 2: read each saved article back and compute one metrics row per ID.
output_data = []
for url_id in url_ids:
    try:
        with open(f"{url_id}.txt", "r", encoding="utf-8") as in_file:
            text = in_file.read()
        row = {"URL_ID": url_id}
        row.update(compute_text_metrics(text))
    except Exception as exc:
        # A missing or unreadable file is reported and that ID is skipped.
        print(f"Error processing file for URL_ID {url_id}: {exc}")
    else:
        output_data.append(row)


# Stage 3: write all rows to a spreadsheet without the DataFrame index.
output_df = pd.DataFrame(output_data)
output_df.to_excel('Output.xlsx', index=False)

print("Processing complete. Output saved as 'Output.xlsx'.")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Processing complete. Output saved as 'Output.xlsx'.
