In [3]:
import nltk

# nltk.data.find() expects a resource *path* (category/name), not the bare
# package id that nltk.download() takes.  Looking up just 'punkt' never
# succeeds, which forced a re-download on every run.
NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'stopwords': 'corpora/stopwords',
}

for pkg, resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Resource is not installed locally yet: fetch it once.
        nltk.download(pkg)

print('NLTK packages ensured')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell8\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell8\AppData\Roaming\nltk_data...


NLTK packages ensured


[nltk_data]   Unzipping corpora\stopwords.zip.


In [10]:
import os, re, math
from pathlib import Path
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from tqdm import tqdm
import textstat
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize



In [12]:
# Folder layout.  Every resource folder is expressed relative to BASE_DIR,
# which is the directory the notebook is run from.
BASE_DIR = Path('.')
STOPWORDS_DIR = BASE_DIR / 'StopWords'
MASTER_DICT_DIR = BASE_DIR / 'MasterDictionary'
ARTICLES_DIR = BASE_DIR / 'extracted_articles'

# Scraped article text is written here, so the folder must exist up front.
ARTICLES_DIR.mkdir(exist_ok=True)

In [13]:
def load_stopwords(stopwords_dir=STOPWORDS_DIR):
    """Build the lower-cased stop-word set used when cleaning article text.

    Words are read from every ``*.txt`` file directly inside
    ``stopwords_dir`` (one entry per line) and merged with NLTK's English
    stop-word list.

    Parameters
    ----------
    stopwords_dir : pathlib.Path
        Folder holding the custom stop-word lists.  If it does not exist a
        notice is printed and only the NLTK list is used.

    Returns
    -------
    set of str
        Lower-cased stop words.  May be empty if neither source is available.
    """
    sw = set()
    if stopwords_dir.exists():
        for f in stopwords_dir.glob('*.txt'):
            with open(f, encoding='utf-8', errors='ignore') as fh:
                for line in fh:
                    # Entries written as "WORD | note" (if any) can never
                    # match a cleaned token as-is; keep only the word part.
                    w = line.split('|', 1)[0].strip()
                    if w:
                        sw.add(w.lower())
    else:
        # A missing folder no longer skips the NLTK list below.
        print('StopWords folder not found:', stopwords_dir)
    # also add nltk stopwords
    try:
        sw.update(w.lower() for w in stopwords.words('english'))
    except Exception as exc:
        # Still best-effort, but report the problem instead of hiding it.
        print('NLTK stopwords unavailable, continuing without them:', exc)
    return sw

In [14]:
def load_sentiment_dicts(master_dir=MASTER_DICT_DIR):
    """Load the positive and negative word lists from ``master_dir``.

    Every ``*.txt`` file whose name contains ``positive`` feeds the positive
    set; every file whose name contains ``negative`` (or the misspelling
    ``negetive``) feeds the negative set.  Matching is case-insensitive and a
    file matching both is added to both sets.

    Parameters
    ----------
    master_dir : pathlib.Path
        Folder holding the sentiment word lists.

    Returns
    -------
    tuple of (set of str, set of str)
        ``(positive_words, negative_words)``, all lower-cased.  Both sets are
        empty when the folder is missing or holds no matching file.
    """
    pos = set()
    neg = set()
    if not master_dir.exists():
        # Mirror load_stopwords(): tell the user instead of silently
        # producing empty dictionaries (which would zero every score).
        print('MasterDictionary folder not found:', master_dir)
        return pos, neg

    def _read_words(path):
        """Return the non-blank lines of ``path``, stripped and lower-cased."""
        with open(path, encoding='utf-8', errors='ignore') as fh:
            return {ln.strip().lower() for ln in fh if ln.strip()}

    # tolerant naming
    for f in master_dir.glob('*.txt'):
        name = f.name.lower()
        is_positive = 'positive' in name
        is_negative = 'negative' in name or 'negetive' in name
        if not (is_positive or is_negative):
            continue
        words = _read_words(f)
        if is_positive:
            pos.update(words)
        if is_negative:
            neg.update(words)
    return pos, neg

In [15]:
# Load the lexicons once; the cleaning and metric helpers defined later bind
# these objects as their default arguments.
STOPWORDS = load_stopwords()
POS_DICT, NEG_DICT = load_sentiment_dicts()

# Quick sanity check on how much was loaded.
print(
    'stopwords loaded:', len(STOPWORDS),
    'pos words:', len(POS_DICT),
    'neg words:', len(NEG_DICT),
)


stopwords loaded: 12797 pos words: 2006 neg words: 4783


In [35]:
def fetch_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : int or float
        Seconds passed to ``requests.get`` as its timeout.

    Returns
    -------
    str
        Decoded response body.

    Raises
    ------
    requests.HTTPError
        For 4xx/5xx responses (via ``raise_for_status``).
    requests.RequestException
        For connection problems, timeouts or malformed URLs.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # When the server sends a text/* Content-Type without a charset,
    # requests assumes ISO-8859-1, which garbles UTF-8 pages.  In that case
    # use the encoding detected from the body instead.
    if 'charset' not in r.headers.get('Content-Type', '').lower():
        r.encoding = r.apparent_encoding
    return r.text

In [36]:
def extract_title_and_body(html):
    """Extract the page title and main article text from an HTML document.

    The body is looked for in three stages, stopping at the first one that
    yields text:

    1. ``<p>`` elements inside ``<article>`` tags (paragraphs > 30 chars);
    2. ``<p>`` elements inside the first element whose ``id`` or ``class``
       matches one of a list of common container names (paragraphs > 30
       chars);
    3. every ``<p>`` in the document longer than 40 characters.

    Parameters
    ----------
    html : str
        Raw HTML markup.

    Returns
    -------
    tuple of (str, str)
        ``(title, body_text)``; either may be an empty string.  Paragraphs in
        ``body_text`` are separated by newlines.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def _paragraph_texts(containers, min_len):
        """Collect paragraph text from ``containers``, each <p> at most once."""
        seen = set()
        texts = []
        for container in containers:
            for p in container.find_all('p'):
                # Nested or repeated containers expose the same <p> element
                # more than once; de-duplicate by element identity so the
                # text (and every metric derived from it) is not inflated.
                if id(p) in seen:
                    continue
                seen.add(id(p))
                txt = p.get_text(' ', strip=True)
                if txt and len(txt) > min_len:
                    texts.append(txt)
        return texts

    # Title
    title = ''
    if soup.title:
        title = soup.title.get_text(strip=True)

    # Try <article> tags
    body_text = '\n'.join(_paragraph_texts(soup.find_all('article'), 30))

    # Fallbacks: look for common container ids/classes
    if not body_text:
        candidates = []
        for key in ['article', 'main', 'content', 'post', 'story', 'post-body', 'entry-content']:
            el = soup.find(attrs={'id': re.compile(key, re.I)}) or soup.find(attrs={'class': re.compile(key, re.I)})
            if el:
                candidates.append(el)
        body_text = '\n'.join(_paragraph_texts(candidates, 30))

    # Final fallback: all <p> longer than 40 chars
    if not body_text:
        paras = soup.find_all('p')
        body_text = '\n'.join(
            p.get_text(' ', strip=True) for p in paras if len(p.get_text(strip=True)) > 40
        )

    # Collapse runs of blank lines.
    body_text = re.sub(r'\n{2,}', '\n\n', body_text).strip()
    return title.strip(), body_text.strip()

In [37]:
def save_article_text(url_id, title, body, out_dir=ARTICLES_DIR):
    """Write one article to ``<out_dir>/<url_id>.txt`` as UTF-8 text.

    The file holds the title followed by a blank line (only when a title is
    present) and then the body.

    Parameters
    ----------
    url_id : str
        Identifier used as the file stem.  Characters that are not valid in
        file names (``\\ / : * ? " < > |``) are replaced with ``_``.
    title : str
        Article title; may be empty.
    body : str
        Article text; ``None`` is written as an empty body.
    out_dir : pathlib.Path
        Destination folder, created (with parents) if missing.

    Returns
    -------
    pathlib.Path
        Path of the file that was written.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    # The id comes straight from the input sheet; keep it from breaking the
    # path (separators) or being rejected by the OS (reserved characters).
    safe_id = re.sub(r'[\\/:*?"<>|]', '_', str(url_id))
    fname = out_dir / f"{safe_id}.txt"
    with open(fname, 'w', encoding='utf-8') as fh:
        if title:
            fh.write(title + '\n\n')
        fh.write(body or '')
    return fname

print('Scraping helper ready')

Scraping helper ready


In [38]:
# Whole-word, case-sensitive pattern for the personal pronouns that are
# counted: only the exact spellings listed in the alternation match.
# Note that the upper-case form "US" is one of the alternatives.
PERSONAL_PRONOUNS_RE = re.compile(r"\b(I|we|We|WE|us|Us|US|my|My|our|Our|ours|Ours|me|Me)\b")

def clean_text_for_counting(text, stopwords_set=STOPWORDS):
    """Tokenise ``text`` into lower-cased words with stop words removed.

    URLs, e-mail addresses and every character other than ASCII letters,
    digits and whitespace are blanked out before ``word_tokenize`` runs.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stopwords_set : collection of str
        Lower-cased words to drop from the result.

    Returns
    -------
    list of str
        Remaining tokens, lower-cased, in their original order.
    """
    # Applied in order: links, e-mail addresses, then any leftover punctuation.
    for pattern in (r'https?://\S+|www\.\S+', r'\S+@\S+', r"[^A-Za-z0-9\s]"):
        text = re.sub(pattern, ' ', text)

    kept = []
    for raw_token in word_tokenize(text):
        # Ignore tokens that carry no letter or digit at all.
        if not re.search('[A-Za-z0-9]', raw_token):
            continue
        lowered = raw_token.lower()
        if lowered not in stopwords_set:
            kept.append(lowered)
    return kept

In [39]:
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Only the letters a-z (after lower-casing) are considered.  A word with no
    such letters scores 0 and a word of up to three letters scores 1.
    Otherwise each run of consecutive vowels (a, e, i, o, u) counts once,
    one is subtracted for an ``es``/``ed`` ending and one for a trailing
    ``e`` that is not part of ``le``; neither subtraction takes the count
    below 1.

    Parameters
    ----------
    word : str
        Word to analyse.

    Returns
    -------
    int
        Estimated syllable count (0 only when no letters are present).
    """
    letters = re.sub(r'[^a-z]','', word.lower())
    if not letters:
        return 0
    if len(letters) <= 3:
        return 1

    vowels = 'aeiou'
    # A vowel starts a new group whenever the character before it is not a
    # vowel; the leading blank stands in for "before the first letter".
    groups = sum(
        1
        for before, current in zip(' ' + letters, letters)
        if current in vowels and before not in vowels
    )

    if letters.endswith(('es', 'ed')) and groups > 1:
        groups -= 1

    silent_final_e = letters.endswith('e') and not letters.endswith('le')
    if silent_final_e and groups > 1:
        groups -= 1

    return max(1, groups)


In [40]:
def compute_metrics(text, pos_dict=POS_DICT, neg_dict=NEG_DICT, stopwords_set=STOPWORDS):
    """Compute the sentiment and readability figures for one article.

    ``text`` is split into sentences with ``sent_tokenize`` and into cleaned,
    stop-word-free tokens with ``clean_text_for_counting``.  All word-based
    figures are computed on the cleaned tokens; personal pronouns are counted
    on the raw text.

    Parameters
    ----------
    text : str
        Article text (title and body).
    pos_dict, neg_dict : collection of str
        Lower-cased positive / negative words.
    stopwords_set : collection of str
        Stop words removed before counting.

    Returns
    -------
    dict
        Maps each output column name to its value; float values are rounded
        to six decimals.
    """
    sentence_list = sent_tokenize(text)
    words = clean_text_for_counting(text, stopwords_set)
    n_words = len(words)
    # Treat text without any detected sentence as a single sentence so the
    # per-sentence average never divides by zero.
    n_sentences = len(sentence_list) or 1

    # --- sentiment ---
    n_pos = len([w for w in words if w in pos_dict])
    n_neg = len([w for w in words if w in neg_dict])
    n_hits = n_pos + n_neg
    # The small epsilon keeps the denominators strictly positive.
    polarity = (n_pos - n_neg) / (n_hits + 1e-6) if n_hits != 0 else 0.0
    subjectivity = n_hits / (n_words + 1e-6) if n_words != 0 else 0.0

    # --- readability ---
    words_per_sentence = n_words / n_sentences
    syllables = list(map(count_syllables, words))
    syllables_per_word = sum(syllables) / n_words if n_words else 0.0
    # "Complex" means more than two syllables.
    n_complex = len([s for s in syllables if s > 2])
    pct_complex = (n_complex / n_words)*100 if n_words else 0.0
    fog_index = 0.4 * (words_per_sentence + pct_complex)

    # --- personal pronouns ---
    # The shared pattern also matches upper-case "US"; discount those hits
    # and never report a negative count.
    pronoun_hits = len(PERSONAL_PRONOUNS_RE.findall(text))
    us_hits = len(re.findall(r'\bUS\b', text))
    n_pronouns = max(0, pronoun_hits - us_hits)

    # --- word length ---
    stripped = (re.sub(r'[^A-Za-z0-9]','', w) for w in words)
    lengths = [len(s) for s in stripped if s]
    avg_word_length = sum(lengths)/len(lengths) if lengths else 0.0

    return {
        'POSITIVE SCORE': n_pos,
        'NEGATIVE SCORE': n_neg,
        'POLARITY SCORE': round(polarity, 6),
        'SUBJECTIVITY SCORE': round(subjectivity, 6),
        'AVG SENTENCE LENGTH': round(words_per_sentence, 6),
        'PERCENTAGE OF COMPLEX WORDS': round(pct_complex, 6),
        'FOG INDEX': round(fog_index, 6),
        'AVG NUMBER OF WORDS PER SENTENCE': round(words_per_sentence, 6),
        'COMPLEX WORD COUNT': n_complex,
        'WORD COUNT': n_words,
        'SYLLABLE PER WORD': round(syllables_per_word, 6),
        'PERSONAL PRONOUNS': n_pronouns,
        'AVG WORD LENGTH': round(avg_word_length, 6),
    }

print('Cleaning and metric functions ready')

Cleaning and metric functions ready


In [41]:
# Files read and written by the processing run.
INPUT_FILE = 'Input.xlsx'
OUTPUT_FILE = 'output_result.xlsx'
ERR_LOG = 'errors.log'

# Folder that receives one text file per scraped article.
ARTICLES_DIR = Path('extracted_articles')
ARTICLES_DIR.mkdir(exist_ok=True)

In [42]:
# Metric columns, in the exact order they must appear in the output sheet.
# Defined once so the success path, the failure path and the final column
# ordering cannot drift apart.
METRIC_COLUMNS = [
    'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
    'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH',
]

if not Path(INPUT_FILE).exists():
    print('Input.xlsx not found in current folder. Place it here and re-run this cell.')
else:
    df = pd.read_excel(INPUT_FILE)
    required_cols = ['URL_ID','URL']
    if not all(c in df.columns for c in required_cols):
        print('Input.xlsx must contain columns URL_ID and URL')
    else:
        rows = []
        errors = []
        for _, r in tqdm(df.iterrows(), total=len(df)):
            url = str(r['URL']).strip()
            url_id = str(r['URL_ID']).strip()
            out_row = r.to_dict()
            try:
                html = fetch_html(url)
                title, body = extract_title_and_body(html)
                if not body or len(body) < 50:
                    # Logged for review only; the row is still scored below.
                    errors.append((url_id, url, 'ExtractionTooShort'))
                save_article_text(url_id, title, body)
                full_text = (title + '\n\n' + body).strip()
                metrics = compute_metrics(full_text)
                for k in METRIC_COLUMNS:
                    out_row[k] = metrics.get(k, '')
            except Exception as e:
                # Best-effort run: record the failure and emit a blank metric
                # row so the output row count matches the input.
                errors.append((url_id, url, str(e)))
                for k in METRIC_COLUMNS:
                    out_row[k] = ''
            rows.append(out_row)

        # Original input columns first, then the metrics in order.  Passing
        # `columns=` also yields a correctly shaped (empty) sheet when the
        # input has no rows, where selecting columns afterwards would fail.
        final_cols = list(df.columns) + METRIC_COLUMNS
        out_df = pd.DataFrame(rows, columns=final_cols)
        out_df.to_excel(OUTPUT_FILE, index=False)
        with open(ERR_LOG, 'w', encoding='utf-8') as fh:
            for e in errors:
                fh.write('\t'.join(map(str,e)) + '\n')
        print('Processing complete. Output:', OUTPUT_FILE)
        print('Articles saved to:', ARTICLES_DIR)
        print('Errors logged to:', ERR_LOG)


100%|████████████████████████████████████████████████████████████████████████████████| 147/147 [15:55<00:00,  6.50s/it]

Processing complete. Output: output_result.xlsx
Articles saved to: extracted_articles
Errors logged to: errors.log



