<a href="https://colab.research.google.com/github/shivpandey2406/NLP/blob/main/Extract_text_from_the_URL_and_calculate_specified_variables_from_the_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas openpyxl requests beautifulsoup4




In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url):
    """Download ``url`` and return the text of its ``<p>`` elements.

    The request uses a 10 second timeout and non-2xx responses are treated as
    failures. Paragraph texts are joined with single spaces and the result is
    stripped of surrounding whitespace.

    Returns:
        The extracted text, or the string ``"Error: <message>"`` if anything
        goes wrong while fetching or parsing (no exception is propagated).
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        parsed = BeautifulSoup(page.text, 'html.parser')

        # Generic extraction: every <p> tag is taken as article text.
        # Site-specific selectors can be substituted here.
        pieces = (tag.get_text() for tag in parsed.find_all('p'))
        return ' '.join(pieces).strip()
    except Exception as err:
        return f"Error: {err}"

# Load Excel file and URLs from column 'B'
input_file = '/content/sample_data/Input.xlsx'  # Change to your file name
df = pd.read_excel(input_file)

# Column 'B' is taken by position: the second column (index 1)
url_column = df.columns[1]
df['Extracted_Text'] = df[url_column].apply(extract_text_from_url)

# Save results to a new Excel file.
# The analysis cells further down read
# '/content/sample_data/output_with_articles.xlsx', so the workbook is written
# to that exact location; writing to the working directory left those cells
# without an input file on a fresh Restart & Run All.
output_file = '/content/sample_data/output_with_articles.xlsx'
df.to_excel(output_file, index=False)
print(f"Extraction complete. Results saved to {output_file}")


Extraction complete. Results saved to output_with_articles.xlsx


In [5]:
!pip install pandas textblob nltk openpyxl




In [6]:
import nltk
# Tokenizer and tagger data used by the analysis cells below.
# 'punkt_tab' is fetched as well: the corpora download later in this notebook
# pulls it in for TextBlob, and without it the metrics cell that follows this
# one cannot tokenize on a fresh kernel.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
import re
from textblob import TextBlob

def extract_text_from_url(url):
    """Fetch a page and return the concatenated text of its ``<p>`` tags.

    Uses a 10 second request timeout and raises internally on HTTP error
    statuses. The paragraph texts are joined by single spaces and stripped.

    Returns:
        The page text, or ``"Error: <message>"`` when fetching or parsing
        raises any exception.
    """
    try:
        http_response = requests.get(url, timeout=10)
        http_response.raise_for_status()
        document = BeautifulSoup(http_response.text, 'html.parser')

        # Generic approach: treat all <p> elements as the article body.
        paragraph_texts = []
        for node in document.find_all('p'):
            paragraph_texts.append(node.get_text())
        joined = ' '.join(paragraph_texts)
        return joined.strip()
    except Exception as err:
        return f"Error: {err}"

def load_word_set(filepath):
    """Read a one-word-per-line list into a set.

    Blank lines and lines that start with ``;`` are skipped; each kept line is
    stripped of surrounding whitespace. The file is decoded as UTF-8 first and,
    if that raises ``UnicodeDecodeError``, re-read as latin-1 (a message is
    printed when the fallback succeeds).

    Raises:
        Whatever the latin-1 attempt raises, after printing a diagnostic;
        non-decoding errors from the UTF-8 attempt propagate unchanged.
    """
    def _read(encoding):
        # Collect the stripped, non-comment, non-blank lines of the file.
        entries = set()
        with open(filepath, 'r', encoding=encoding) as handle:
            for raw_line in handle:
                if raw_line.strip() and not raw_line.startswith(';'):
                    entries.add(raw_line.strip())
        return entries

    try:
        return _read('utf-8')
    except UnicodeDecodeError:
        pass

    # UTF-8 decoding failed: retry with latin-1.
    try:
        entries = _read('latin-1')
    except Exception as e:
        print(f"Error loading {filepath} with latin-1 encoding: {e}")
        raise e
    print(f"Successfully loaded {filepath} with latin-1 encoding.")
    return entries


# Sentiment dictionaries used by text_metrics (positive list first, then negative)
positive_words, negative_words = [
    load_word_set(word_list_path)
    for word_list_path in (
        '/content/sample_data/positive-words.txt',
        '/content/sample_data/negative-words.txt',
    )
]

def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Each run of consecutive vowels (``a e i o u y``) counts as one syllable.
    A final "e" is treated as silent and subtracted only when it stands alone
    after a consonant ("like" -> 1); a final "e" that is part of a vowel group
    ("agree", "coffee") was already merged into that group and is not
    subtracted again.

    Returns:
        An int of at least 1 (also for the empty string).
    """
    word = word.lower()
    vowels = "aeiouy"
    count = 0
    prev_char_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not prev_char_was_vowel:
            count += 1
        prev_char_was_vowel = is_vowel

    # Silent trailing "e": only when the letter before it is a consonant,
    # i.e. the "e" formed its own vowel group in the count above.
    if len(word) > 1 and word.endswith("e") and word[-2] not in vowels:
        count -= 1

    return max(count, 1)

def is_complex(word):
    """Return True when ``count_syllables`` reports more than two syllables for ``word``."""
    syllable_total = count_syllables(word)
    return syllable_total > 2

def count_personal_pronouns(text):
    """Count the personal pronouns I, we, my, ours and us in ``text``.

    Whole words are matched case-insensitively. The all-caps token "US" is
    treated as the country abbreviation rather than the pronoun and is not
    counted; "us" and "Us" still are.
    """
    candidates = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    # Filter after matching: with re.I the pattern cannot tell "US" from "us".
    return sum(1 for token in candidates if token != 'US')

def text_metrics(text):
    """Compute 13 sentiment and readability metrics for one document.

    Sentences and words come from TextBlob. Positive/negative scores count
    words (lower-cased) found in ``positive_words`` / ``negative_words``;
    polarity and subjectivity are TextBlob's own sentiment values. The fog
    index is ``0.4 * (average sentence length + percentage of complex words)``.

    Args:
        text: The document text. Non-string input yields all-zero metrics.

    Returns:
        A ``pd.Series`` of 13 values indexed by metric name. Both the string
        and non-string paths use the same index, so rows combine into a
        consistently labelled DataFrame.
    """
    metric_names = [
        "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
        "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
        "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT", "WORD COUNT",
        "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
    ]

    # Handle non-string input gracefully: report zeros for every metric
    if not isinstance(text, str):
        return pd.Series([0] * len(metric_names), index=metric_names)

    blob = TextBlob(text)
    words = blob.words
    word_count = len(words)
    # Text with no detected sentence is treated as one sentence to avoid dividing by zero
    sentence_count = len(blob.sentences) or 1

    # Positive/Negative Score
    pos_score = sum(1 for w in words if w.lower() in positive_words)
    neg_score = sum(1 for w in words if w.lower() in negative_words)

    # Polarity/Subjectivity
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity

    # Average sentence length (also reported as average words per sentence)
    avg_sent_len = word_count / sentence_count
    avg_words_per_sentence = avg_sent_len

    # Complex words
    complex_word_count = sum(1 for w in words if is_complex(w))
    percent_complex = (complex_word_count / word_count) * 100 if word_count else 0

    # Fog Index
    fog_index = 0.4 * (avg_sent_len + percent_complex)

    # Per-word averages fall back to 0 for empty text
    syllable_per_word = sum(count_syllables(w) for w in words) / word_count if word_count else 0
    avg_word_len = sum(len(w) for w in words) / word_count if word_count else 0

    # Personal pronouns are counted on the raw text
    personal_pronouns = count_personal_pronouns(text)

    return pd.Series([
        pos_score, neg_score, polarity, subjectivity, avg_sent_len, percent_complex,
        fog_index, avg_words_per_sentence, complex_word_count, word_count, syllable_per_word,
        personal_pronouns, avg_word_len
    ], index=metric_names)

# Read Excel
input_file = '/content/sample_data/output_with_articles.xlsx'  # Change to your file
df = pd.read_excel(input_file)

# Apply metrics to column C (index 2).
# Empty cells are scored as empty text: a plain astype(str) would turn them into
# the literal word "nan" and score that. The frame itself is left untouched so
# the saved workbook keeps the original cell values.
article_text = df.iloc[:, 2].fillna('').astype(str)
metrics = article_text.apply(text_metrics)
metrics.columns = [
    "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT", "WORD COUNT",
    "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
]

# Concatenate and save
result = pd.concat([df, metrics], axis=1)
result.to_excel('output_with_metrics.xlsx', index=False)
print("Analysis complete. Results saved to output_with_metrics.xlsx")

Successfully loaded /content/sample_data/negative-words.txt with latin-1 encoding.
Analysis complete. Results saved to output_with_metrics.xlsx


In [4]:
# Download the NLTK corpora used by TextBlob via its bundled helper module
# (the output below lists brown, punkt_tab, wordnet, conll2000, movie_reviews, ...).
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
import re
from textblob import TextBlob # Still used for sentence splitting and tokenization
from nltk.corpus import stopwords # Import stopwords

# Fetch the NLTK resources quietly, in the same order as before
for _nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource, quiet=True)

# English stop words, held in a set for the membership checks in clean_text
stop_words = set(stopwords.words('english'))

def extract_text_from_url(url):
    """Return the space-joined text of every ``<p>`` element at ``url``.

    The page is requested with a 10 second timeout; HTTP error statuses count
    as failures. Any exception is converted into the returned string
    ``"Error: <message>"`` instead of being raised.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        tree = BeautifulSoup(page.text, 'html.parser')
        texts = [element.get_text() for element in tree.find_all('p')]
        combined = ' '.join(texts)
        return combined.strip()
    except Exception as err:
        return f"Error: {err}"

def load_word_set(filepath):
    """Load a word list (one entry per line) into a set.

    Lines that are blank or begin with ``;`` are ignored and kept lines are
    stripped. Decoding is attempted as UTF-8 and, on ``UnicodeDecodeError``,
    repeated as latin-1 with a printed success message.

    Raises:
        Any exception from the latin-1 attempt, after printing a diagnostic;
        other errors from the UTF-8 attempt propagate as-is.
    """
    def _collect(encoding):
        # Gather stripped entries, skipping blanks and ';' comment lines.
        found = set()
        with open(filepath, 'r', encoding=encoding) as source:
            for entry in source:
                if entry.strip() and not entry.startswith(';'):
                    found.add(entry.strip())
        return found

    try:
        return _collect('utf-8')
    except UnicodeDecodeError:
        pass

    try:
        found = _collect('latin-1')
    except Exception as e:
        print(f"Error loading {filepath} with latin-1 encoding: {e}")
        raise e
    print(f"Successfully loaded {filepath} with latin-1 encoding.")
    return found

# Sentiment dictionaries (positive list first, then negative)
positive_words, negative_words = [
    load_word_set(word_list_path)
    for word_list_path in (
        '/content/sample_data/positive-words.txt',
        '/content/sample_data/negative-words.txt',
    )
]

def count_syllables(word):
    """Estimate the number of syllables in ``word`` with vowel-group heuristics.

    The word is lower-cased and stripped of everything except ``a-z``. Each run
    of consecutive vowels (``a e i o u y``) counts as one syllable, then:

    * "-es" is silent ("likes") unless it follows s, x, z, c, g, "ch" or "sh"
      ("passes", "foxes", "faces", "changes", "watches").
    * "-ed" is silent ("liked") unless it follows t or d ("waited", "ended").
    * A lone final "e" after a consonant is silent ("like"), except in a
      consonant + "le" ending ("table").

    A suffix "e" that is part of a longer vowel group ("agreed", "studies",
    "coffee") is never subtracted, because it was not counted separately.

    Returns:
        0 if the word contains no letters, otherwise an int of at least 1.
    """
    word = re.sub(r'[^a-z]', '', word.lower())
    if not word:
        return 0

    vowels = "aeiouy"
    count = 0
    prev_char_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not prev_char_was_vowel:
            count += 1
        prev_char_was_vowel = is_vowel

    if len(word) > 2 and word.endswith("es"):
        before_suffix = word[-3]
        keeps_syllable = before_suffix in "sxzcg" or word[-4:-2] in ("ch", "sh")
        # Subtract only when the "e" was counted on its own (consonant before it)
        if before_suffix not in vowels and not keeps_syllable:
            count -= 1
    elif len(word) > 2 and word.endswith("ed"):
        before_suffix = word[-3]
        if before_suffix not in vowels and before_suffix not in "td":
            count -= 1
    elif len(word) > 1 and word.endswith("e") and word[-2] not in vowels:
        # Silent final "e"; consonant + "le" keeps its syllable
        consonant_le = len(word) > 2 and word.endswith("le") and word[-3] not in vowels
        if not consonant_le:
            count -= 1

    # Ensure a word has at least one syllable
    return max(count, 1)


def is_complex(word):
    """Return True when ``count_syllables`` reports more than two syllables for ``word``."""
    syllable_total = count_syllables(word)
    return syllable_total > 2

def count_personal_pronouns(text):
    """Count the personal pronouns I, we, my, ours and us in ``text``.

    Whole words are matched case-insensitively. The all-caps token "US" is
    treated as the country abbreviation rather than the pronoun and is not
    counted; "us" and "Us" still are.
    """
    candidates = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    # Filter after matching: a lookbehind for "US" under re.I would also reject
    # the lowercase pronoun "us".
    return sum(1 for token in candidates if token != 'US')

def clean_text(text):
    """Return the lower-cased, punctuation-free tokens of ``text`` that are not stop words.

    Every character that is neither a word character nor whitespace is removed,
    the result is split on whitespace, and tokens found in ``stop_words`` are
    dropped.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return [token for token in without_punctuation.split() if token not in stop_words]

def text_metrics(text):
    """Compute 13 sentiment and readability metrics for one document.

    Sentiment scores use the cleaned tokens from ``clean_text`` (lower-cased,
    punctuation and stop words removed) matched against ``positive_words`` /
    ``negative_words``:

    * polarity = (pos - neg) / (pos + neg + 0.000001)
    * subjectivity = (pos + neg) / (cleaned word count + 0.000001)

    Readability figures (sentence length, complex words, syllables per word,
    word length) use TextBlob's sentences and words for the original text. The
    fog index is ``0.4 * (average sentence length + percentage of complex
    words)``. "WORD COUNT" is the cleaned word count.

    Args:
        text: The document text. Non-string input yields all-zero metrics.

    Returns:
        A ``pd.Series`` of 13 values indexed by metric name. Both the string
        and non-string paths use the same index, so rows combine into a
        consistently labelled DataFrame.
    """
    metric_names = [
        "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
        "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
        "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT", "WORD COUNT",
        "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
    ]

    # Handle non-string input gracefully: report zeros for every metric
    if not isinstance(text, str):
        return pd.Series([0] * len(metric_names), index=metric_names)

    blob = TextBlob(text)
    # Text with no detected sentence is treated as one sentence to avoid dividing by zero
    sentence_count = len(blob.sentences) or 1

    # Cleaned tokens drive the sentiment scores and the reported word count
    cleaned_words = clean_text(text)
    total_cleaned_word_count = len(cleaned_words)

    # All words (stop words included) drive the readability metrics
    all_words = blob.words
    total_word_count = len(all_words)

    pos_score = sum(1 for w in cleaned_words if w in positive_words)
    neg_score = sum(1 for w in cleaned_words if w in negative_words)

    # The small constant keeps both denominators non-zero
    polarity = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subjectivity = (pos_score + neg_score) / (total_cleaned_word_count + 0.000001)

    # Average sentence length (also reported as average words per sentence)
    avg_sent_len = total_word_count / sentence_count
    avg_words_per_sentence = avg_sent_len

    complex_word_count = sum(1 for w in all_words if is_complex(w))

    # Per-word ratios fall back to 0 for empty text
    if total_word_count:
        percent_complex = (complex_word_count / total_word_count) * 100
        syllable_per_word = sum(count_syllables(w) for w in all_words) / total_word_count
        avg_word_len = sum(len(w) for w in all_words) / total_word_count
    else:
        percent_complex = 0
        syllable_per_word = 0
        avg_word_len = 0

    # Fog Index
    fog_index = 0.4 * (avg_sent_len + percent_complex)

    # Personal pronouns are counted on the original text
    personal_pronouns = count_personal_pronouns(text)

    return pd.Series([
        pos_score, neg_score, polarity, subjectivity, avg_sent_len, percent_complex,
        fog_index, avg_words_per_sentence, complex_word_count, total_cleaned_word_count,
        syllable_per_word, personal_pronouns, avg_word_len
    ], index=metric_names)

# --- Remainder of the code for loading, applying, and saving ---

# Read Excel
input_file = '/content/sample_data/output_with_articles.xlsx'  # Change to your file
df = pd.read_excel(input_file)

# Apply metrics to the column containing the extracted text (assumed to be the 3rd column, index 2).
# Empty cells are scored as empty text: a plain astype(str) would turn them into
# the literal word "nan" and score that. The frame itself is left untouched so
# the saved workbook keeps the original cell values.
text_column_index = 2
article_text = df.iloc[:, text_column_index].fillna('').astype(str)
metrics = article_text.apply(text_metrics)
metrics.columns = [
    "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT", "WORD COUNT", # This is now the cleaned word count
    "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
]

# Concatenate and save
result = pd.concat([df, metrics], axis=1)
output_file = 'output_with_metrics.xlsx'
result.to_excel(output_file, index=False)
print(f"Analysis complete. Results saved to {output_file}")

Successfully loaded /content/sample_data/negative-words.txt with latin-1 encoding.
Analysis complete. Results saved to output_with_metrics.xlsx


In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
import re
from textblob import TextBlob # Still used for sentence splitting and tokenization
from nltk.corpus import stopwords # Import stopwords

# Fetch the NLTK resources quietly, in the same order as before
for _nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource, quiet=True)

# English stop words, held in a set for the membership checks in clean_text
stop_words = set(stopwords.words('english'))

def extract_text_from_url(url):
    """Return the space-joined text of every ``<p>`` element at ``url``.

    The page is requested with a 10 second timeout; HTTP error statuses count
    as failures. Any exception is converted into the returned string
    ``"Error: <message>"`` instead of being raised.
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        markup = BeautifulSoup(reply.text, 'html.parser')
        body_text = ' '.join(block.get_text() for block in markup.find_all('p'))
        return body_text.strip()
    except Exception as err:
        return f"Error: {err}"

def load_word_set(filepath):
    """Load a word list (one entry per line) into a set.

    Lines that are blank or begin with ``;`` are ignored and kept lines are
    stripped. Decoding is attempted as UTF-8 and, on ``UnicodeDecodeError``,
    repeated as latin-1 with a printed success message.

    Raises:
        Any exception from the latin-1 attempt, after printing a diagnostic;
        other errors from the UTF-8 attempt propagate as-is.
    """
    def _parse(stream):
        # Keep stripped entries, skipping blanks and ';' comment lines.
        kept = set()
        for text_line in stream:
            if text_line.strip() and not text_line.startswith(';'):
                kept.add(text_line.strip())
        return kept

    try:
        with open(filepath, 'r', encoding='utf-8') as stream:
            return _parse(stream)
    except UnicodeDecodeError:
        pass

    try:
        with open(filepath, 'r', encoding='latin-1') as stream:
            kept = _parse(stream)
    except Exception as e:
        print(f"Error loading {filepath} with latin-1 encoding: {e}")
        raise e
    print(f"Successfully loaded {filepath} with latin-1 encoding.")
    return kept

# Sentiment dictionaries (positive list first, then negative)
positive_words, negative_words = [
    load_word_set(word_list_path)
    for word_list_path in (
        '/content/sample_data/positive-words.txt',
        '/content/sample_data/negative-words.txt',
    )
]

def count_syllables(word):
    """Estimate the number of syllables in ``word`` with vowel-group heuristics.

    The word is lower-cased and stripped of everything except ``a-z``. Each run
    of consecutive vowels (``a e i o u y``) counts as one syllable, then:

    * "-es" is silent ("likes") unless it follows s, x, z, c, g, "ch" or "sh"
      ("passes", "foxes", "faces", "changes", "watches").
    * "-ed" is silent ("liked") unless it follows t or d ("waited", "ended").
    * A lone final "e" after a consonant is silent ("like"), except in a
      consonant + "le" ending ("table").

    A suffix "e" that is part of a longer vowel group ("agreed", "studies",
    "coffee") is never subtracted, because it was not counted separately.

    Returns:
        0 if the word contains no letters, otherwise an int of at least 1.
    """
    word = re.sub(r'[^a-z]', '', word.lower())
    if not word:
        return 0

    vowels = "aeiouy"
    count = 0
    prev_char_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not prev_char_was_vowel:
            count += 1
        prev_char_was_vowel = is_vowel

    if len(word) > 2 and word.endswith("es"):
        before_suffix = word[-3]
        keeps_syllable = before_suffix in "sxzcg" or word[-4:-2] in ("ch", "sh")
        # Subtract only when the "e" was counted on its own (consonant before it)
        if before_suffix not in vowels and not keeps_syllable:
            count -= 1
    elif len(word) > 2 and word.endswith("ed"):
        before_suffix = word[-3]
        if before_suffix not in vowels and before_suffix not in "td":
            count -= 1
    elif len(word) > 1 and word.endswith("e") and word[-2] not in vowels:
        # Silent final "e"; consonant + "le" keeps its syllable
        consonant_le = len(word) > 2 and word.endswith("le") and word[-3] not in vowels
        if not consonant_le:
            count -= 1

    # Ensure a word has at least one syllable
    return max(count, 1)


def is_complex(word):
    """Return True when ``count_syllables`` reports more than two syllables for ``word``."""
    syllable_total = count_syllables(word)
    return syllable_total > 2

def count_personal_pronouns(text):
    """Count the personal pronouns I, we, my, ours and us in ``text``.

    Whole words are matched case-insensitively. The all-caps token "US" is
    treated as the country abbreviation rather than the pronoun and is not
    counted; "us" and "Us" still are.
    """
    candidates = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    # Filter after matching: a lookbehind for "US" under re.I would also reject
    # the lowercase pronoun "us".
    return sum(1 for token in candidates if token != 'US')

def clean_text(text):
    """Return the lower-cased, punctuation-free tokens of ``text`` that are not stop words.

    Every character that is neither a word character nor whitespace is removed,
    the result is split on whitespace, and tokens found in ``stop_words`` are
    dropped.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^\w\s]', '', lowered)
    kept_tokens = []
    for token in stripped.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

def text_metrics(text):
    """Compute 13 sentiment and readability metrics for one document.

    Sentiment scores use the cleaned tokens from ``clean_text`` (lower-cased,
    punctuation and stop words removed) matched against ``positive_words`` /
    ``negative_words``:

    * polarity = (pos - neg) / (pos + neg + 0.000001)
    * subjectivity = (pos + neg) / (cleaned word count + 0.000001)

    Readability figures (sentence length, complex words, syllables per word,
    word length) use TextBlob's sentences and words for the original text. The
    fog index is ``0.4 * (average sentence length + percentage of complex
    words)``. "WORD COUNT" is the cleaned word count.

    Args:
        text: The document text. Non-string input yields all-zero metrics.

    Returns:
        A ``pd.Series`` of 13 values indexed by metric name. Both the string
        and non-string paths use the same index, so rows combine into a
        consistently labelled DataFrame.
    """
    metric_names = [
        "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
        "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
        "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT", "WORD COUNT",
        "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
    ]

    # Handle non-string input gracefully: report zeros for every metric
    if not isinstance(text, str):
        return pd.Series([0] * len(metric_names), index=metric_names)

    blob = TextBlob(text)
    # Text with no detected sentence is treated as one sentence to avoid dividing by zero
    sentence_count = len(blob.sentences) or 1

    # Cleaned tokens drive the sentiment scores and the reported word count
    cleaned_words = clean_text(text)
    total_cleaned_word_count = len(cleaned_words)

    # All words (stop words included) drive the readability metrics
    all_words = blob.words
    total_word_count = len(all_words)

    pos_score = sum(1 for w in cleaned_words if w in positive_words)
    neg_score = sum(1 for w in cleaned_words if w in negative_words)

    # The small constant keeps both denominators non-zero
    polarity = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subjectivity = (pos_score + neg_score) / (total_cleaned_word_count + 0.000001)

    # Average sentence length (also reported as average words per sentence)
    avg_sent_len = total_word_count / sentence_count
    avg_words_per_sentence = avg_sent_len

    complex_word_count = sum(1 for w in all_words if is_complex(w))

    # Per-word ratios fall back to 0 for empty text
    if total_word_count:
        percent_complex = (complex_word_count / total_word_count) * 100
        syllable_per_word = sum(count_syllables(w) for w in all_words) / total_word_count
        avg_word_len = sum(len(w) for w in all_words) / total_word_count
    else:
        percent_complex = 0
        syllable_per_word = 0
        avg_word_len = 0

    # Fog Index
    fog_index = 0.4 * (avg_sent_len + percent_complex)

    # Personal pronouns are counted on the original text
    personal_pronouns = count_personal_pronouns(text)

    return pd.Series([
        pos_score, neg_score, polarity, subjectivity, avg_sent_len, percent_complex,
        fog_index, avg_words_per_sentence, complex_word_count, total_cleaned_word_count,
        syllable_per_word, personal_pronouns, avg_word_len
    ], index=metric_names)

# --- Remainder of the code for loading, applying, and saving ---

# Read Excel
input_file = '/content/sample_data/output_with_articles.xlsx'  # Change to your file
df = pd.read_excel(input_file)

# Apply metrics to the column containing the extracted text (assumed to be the 3rd column, index 2).
# Empty cells are scored as empty text: a plain astype(str) would turn them into
# the literal word "nan" and score that. The frame itself is left untouched so
# the saved workbook keeps the original cell values.
text_column_index = 2
article_text = df.iloc[:, text_column_index].fillna('').astype(str)
metrics = article_text.apply(text_metrics)
metrics.columns = [
    "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT", "WORD COUNT", # This is now the cleaned word count
    "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
]

# Concatenate and save
result = pd.concat([df, metrics], axis=1)
output_file = 'output_with_metrics.xlsx'
result.to_excel(output_file, index=False)
print(f"Analysis complete. Results saved to {output_file}")

Successfully loaded /content/sample_data/negative-words.txt with latin-1 encoding.
Analysis complete. Results saved to output_with_metrics.xlsx
