In [1]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt_tab')
import os
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shaad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:


# Set ChromeDriver path.
# The CHROMEDRIVER_PATH environment variable overrides the default below so the
# notebook is not tied to one machine's directory layout.
chrome_driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\shaad\Desktop\Work\DS_Assignment\chromedriver-win64\chromedriver-win64\chromedriver.exe",
)

# Read URLs from input.xlsx (the loop below reads the URL_ID and URL columns)
input_file = "input.xlsx"
df = pd.read_excel(input_file)

# Create extracted folder if not exists
output_folder = "extracted_script"
os.makedirs(output_folder, exist_ok=True)

# Configure Selenium options.
# The `options.headless = True` setter was deprecated and later removed in
# Selenium 4; on current releases the assignment just creates an unused
# attribute and Chrome opens a visible window. The command-line switch works
# regardless of the Selenium version.
options = Options()
options.add_argument("--headless=new")
service = Service(chrome_driver_path)

# Start the WebDriver
driver = webdriver.Chrome(service=service, options=options)

# The try/finally guarantees the browser process is shut down even when
# something outside the per-URL handler fails (e.g. a missing column).
try:
    # Iterate through each URL in the Excel file
    for index, row in df.iterrows():
        url_id = str(row["URL_ID"])
        url = row["URL"]

        try:
            # Open the URL
            driver.get(url)
            time.sleep(5)  # Wait for the page to load completely

            # Extract page source and parse it
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract title (look the <h1> up once instead of twice)
            h1 = soup.find("h1")
            title = h1.get_text(strip=True) if h1 else "No Title Found"

            # Extract article content
            article_body = soup.find("div", class_="td-post-content")
            article_text = article_body.get_text(separator="\n", strip=True) if article_body else "No Article Found"

            # Save to text file
            file_path = os.path.join(output_folder, f"{url_id}.txt")
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(f"Title: {title}\n\n{article_text}")

            print(f"Successfully extracted: {url_id}")
        except Exception as e:
            # Best effort: report the failing URL and carry on with the rest.
            print(f"Error processing {url_id}: {e}")
finally:
    # Close WebDriver
    driver.quit()
print("Scraping complete!")


In [1]:


# Define paths
# Folder of extracted article text files that the analysis loop in this cell reads.
EXTRACTED_TEXT_FOLDER = "extracted_script"
# Folder whose entries are all read as stop-word lists by load_stop_words().
STOPWORDS_FOLDER = "StopWords/StopWords"
# Folder that load_master_dict() expects to hold positive-words.txt and negative-words.txt.
MASTER_DICT_FOLDER = "MasterDictionary/MasterDictionary"
# Excel workbook read into input_df.
INPUT_FILE = "Input.xlsx"
# Excel workbook the analysis results are written to.
OUTPUT_FILE = "Output_nltk_script.xlsx"

# Load stop words

def load_stop_words(folder=None):
    """Collect the stop words from every regular file in a folder.

    Args:
        folder: Directory to scan. Defaults to the module-level
            ``STOPWORDS_FOLDER`` when omitted, so existing zero-argument
            calls keep working.

    Returns:
        set[str]: Every whitespace-separated token found in the files,
        lowercased. ``clean_text`` lowercases the text before the membership
        test, so entries stored in upper or mixed case would otherwise never
        match.
    """
    if folder is None:
        folder = STOPWORDS_FOLDER

    stop_words = set()
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
        # calling open() on one raises, so only regular files are read.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            stop_words.update(f.read().lower().split())
    return stop_words

# Load positive and negative words
def load_master_dict(folder=None):
    """Load the positive and negative sentiment word lists.

    Args:
        folder: Directory containing ``positive-words.txt`` and
            ``negative-words.txt``. Defaults to the module-level
            ``MASTER_DICT_FOLDER`` when omitted, so existing zero-argument
            calls keep working.

    Returns:
        tuple[set[str], set[str]]: ``(positive_words, negative_words)``,
        lowercased so they can match the lowercased tokens produced by
        ``clean_text``.

    Raises:
        OSError: If either word list cannot be opened.
    """
    if folder is None:
        folder = MASTER_DICT_FOLDER

    def _read_words(name):
        # One reader for both lists keeps the encoding handling in one place.
        with open(os.path.join(folder, name), 'r', encoding='utf-8', errors='ignore') as f:
            return set(f.read().lower().split())

    positive_words = _read_words("positive-words.txt")
    negative_words = _read_words("negative-words.txt")
    return positive_words, negative_words


# Build the lookup sets once at cell level; clean_text() reads stop_words and
# sentiment_analysis() reads positive_words / negative_words as globals.
stop_words = load_stop_words()
positive_words, negative_words = load_master_dict()

# Function to clean text
def clean_text(text):
    """Tokenize ``text`` in lower case and drop noise tokens.

    A token is kept only when it is purely alphabetic and is not in the
    module-level ``stop_words`` set.

    Returns:
        list[str]: The surviving tokens in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Function for sentiment analysis
def sentiment_analysis(words):
    """Score a list of cleaned tokens against the sentiment dictionaries.

    Returns:
        tuple: ``(pos_score, neg_score, polarity_score, subjectivity_score)``
        where the first two are hit counts against the module-level
        ``positive_words`` / ``negative_words`` sets.
    """
    pos_score = 0
    neg_score = 0
    for token in words:
        if token in positive_words:
            pos_score += 1
        if token in negative_words:
            neg_score += 1

    hits = pos_score + neg_score
    # The small constant keeps both ratios defined when a denominator is zero.
    polarity_score = (pos_score - neg_score) / (hits + 0.000001)
    subjectivity_score = hits / (len(words) + 0.000001)
    return pos_score, neg_score, polarity_score, subjectivity_score

# Function for readability metrics
def readability_metrics(text):
    """Compute readability statistics for a raw document.

    Sentences come from ``sent_tokenize``; words are the tokens returned by
    ``clean_text`` (lowercased, alphabetic, stop words removed), so the word
    count excludes stop words. A word is "complex" when ``syllable_count``
    reports more than two syllables.

    Returns:
        tuple: ``(avg_sentence_length, perc_complex_words, fog_index,
        complex_word_count, word_count)``. Both ratios are ``0.0`` when their
        denominator is zero (empty text, or text made only of stop words or
        punctuation) instead of raising ``ZeroDivisionError``.
    """
    sentences = sent_tokenize(text)
    words = clean_text(text)
    num_sentences = len(sentences)
    num_words = len(words)
    complex_words = [word for word in words if syllable_count(word) > 2]
    # Guard both divisions: an empty or stop-word-only file must not abort
    # the whole batch.
    avg_sentence_length = num_words / num_sentences if num_sentences else 0.0
    perc_complex_words = len(complex_words) / num_words if num_words else 0.0
    fog_index = 0.4 * (avg_sentence_length + perc_complex_words)
    return avg_sentence_length, perc_complex_words, fog_index, len(complex_words), num_words

# Count syllables in a word
def syllable_count(word):
    """Estimate the number of syllables in ``word``.

    Heuristic: each run of consecutive vowels (a, e, i, o, u) counts as one
    syllable, and a trailing "es" or "ed" is ignored because it is usually
    silent. Counting every vowel letter separately, as before, reported
    5 syllables for "beautiful" and inflated the complex-word count.

    Returns:
        int: ``0`` for an empty string, otherwise at least ``1``.
    """
    word = word.lower()
    if not word:
        return 0

    # Drop a silent "es"/"ed" ending, but leave two-letter words intact.
    if len(word) > 2 and word.endswith(("es", "ed")):
        word = word[:-2]

    count = 0
    previous_was_vowel = False
    for letter in word:
        is_vowel = letter in "aeiou"
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    # Every non-empty word is pronounced with at least one syllable.
    return max(count, 1)

# Count personal pronouns
def count_personal_pronouns(text):
    """Count occurrences of the pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive and whole-word. The all-caps token "US" is
    excluded because it is almost always the country abbreviation, not the
    pronoun.

    Returns:
        int: Number of pronoun occurrences.
    """
    pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    return sum(1 for pronoun in pronouns if pronoun != "US")

# Compute average word length
def avg_word_length(words):
    """Return the mean number of characters per word.

    Args:
        words: Sequence of word strings.

    Returns:
        float: Total characters divided by the number of words, or ``0.0``
        for an empty sequence (previously a ``ZeroDivisionError``).
    """
    if len(words) == 0:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Load input file
input_df = pd.read_excel(INPUT_FILE)

# Process each extracted text file
results = []
# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(EXTRACTED_TEXT_FOLDER)):
    file_path = os.path.join(EXTRACTED_TEXT_FOLDER, filename)
    # os.listdir also returns sub-directories and stray non-text files
    # (e.g. .ipynb_checkpoints); only the .txt articles are analysed.
    if not (filename.lower().endswith(".txt") and os.path.isfile(file_path)):
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    words = clean_text(text)
    if not words:
        # The per-word averages below divide by len(words); report the file
        # and move on rather than aborting the whole batch.
        print(f"Skipping {filename}: no analysable words")
        continue

    pos_score, neg_score, polarity, subjectivity = sentiment_analysis(words)
    avg_sent_len, perc_complex, fog_idx, complex_count, word_count = readability_metrics(text)
    personal_pronouns = count_personal_pronouns(text)
    avg_word_len = avg_word_length(words)

    # avg_sent_len is written twice on purpose: it feeds both the
    # "Avg Sentence Length" and "Avg Words Per Sentence" columns.
    results.append([
        filename, pos_score, neg_score, polarity, subjectivity, avg_sent_len,
        perc_complex, fog_idx, avg_sent_len, complex_count, word_count,
        avg_word_len, personal_pronouns
    ])

# Save output to Excel
output_df = pd.DataFrame(results, columns=[
    "Filename", "Positive Score", "Negative Score", "Polarity Score", "Subjectivity Score",
    "Avg Sentence Length", "Percentage of Complex Words", "Fog Index", "Avg Words Per Sentence",
    "Complex Word Count", "Word Count", "Avg Word Length", "Personal Pronouns"
])
output_df.to_excel(OUTPUT_FILE, index=False)

print("Analysis completed. Output saved to", OUTPUT_FILE)


Analysis completed. Output saved to Output_nltk.xlsx


In [11]:
# Define paths
# Folder of extracted article text files that the analysis loop in this cell reads.
# NOTE(review): the scraping cell writes to "extracted_script", not
# "extracted_content" - confirm this folder is populated, because the loop
# below skips URL_IDs whose file is missing.
EXTRACTED_TEXT_FOLDER = "extracted_content"
# Folder whose entries are all read as stop-word lists by load_stop_words().
STOPWORDS_FOLDER = "StopWords"
# Folder that load_master_dict() expects to hold positive-words.txt and negative-words.txt.
MASTER_DICT_FOLDER = "MasterDictionary"
# Excel workbook read into input_df (URL_ID and URL columns are used).
INPUT_FILE = "Input.xlsx"
# Excel workbook the analysis results are written to.
OUTPUT_FILE = "Output_Shaad_Fazal.xlsx"

# Load stop words
def load_stop_words(folder=None):
    """Collect the stop words from every regular file in a folder.

    Args:
        folder: Directory to scan. Defaults to the module-level
            ``STOPWORDS_FOLDER`` when omitted, so existing zero-argument
            calls keep working.

    Returns:
        set[str]: Every whitespace-separated token found in the files,
        lowercased. ``clean_text`` lowercases the text before the membership
        test, so entries stored in upper or mixed case would otherwise never
        match.
    """
    if folder is None:
        folder = STOPWORDS_FOLDER

    stop_words = set()
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
        # calling open() on one raises, so only regular files are read.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            stop_words.update(f.read().lower().split())
    return stop_words

# Load positive and negative words
def load_master_dict(folder=None):
    """Load the positive and negative sentiment word lists.

    Args:
        folder: Directory containing ``positive-words.txt`` and
            ``negative-words.txt``. Defaults to the module-level
            ``MASTER_DICT_FOLDER`` when omitted, so existing zero-argument
            calls keep working.

    Returns:
        tuple[set[str], set[str]]: ``(positive_words, negative_words)``,
        lowercased so they can match the lowercased tokens produced by
        ``clean_text``.

    Raises:
        OSError: If either word list cannot be opened.
    """
    if folder is None:
        folder = MASTER_DICT_FOLDER

    def _read_words(name):
        # One reader for both lists keeps the encoding handling in one place.
        with open(os.path.join(folder, name), 'r', encoding='utf-8', errors='ignore') as f:
            return set(f.read().lower().split())

    positive_words = _read_words("positive-words.txt")
    negative_words = _read_words("negative-words.txt")
    return positive_words, negative_words


# Build the lookup sets once at cell level; clean_text() reads stop_words and
# sentiment_analysis() reads positive_words / negative_words as globals.
stop_words = load_stop_words()
positive_words, negative_words = load_master_dict()

# Function to clean text
def clean_text(text):
    """Return the analysable tokens of ``text``.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are discarded.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return [tok for tok in tokens if tok.isalpha() and tok not in stop_words]

# Function for sentiment analysis
def sentiment_analysis(words):
    """Derive sentiment scores from a list of cleaned tokens.

    Returns:
        tuple: ``(pos_score, neg_score, polarity_score, subjectivity_score)``.
        The two scores count tokens found in the module-level
        ``positive_words`` and ``negative_words`` sets respectively.
    """
    # Summing booleans counts the membership hits.
    pos_score = sum(word in positive_words for word in words)
    neg_score = sum(word in negative_words for word in words)

    matched = pos_score + neg_score
    # The small constant keeps both ratios defined when a denominator is zero.
    polarity_score = (pos_score - neg_score) / (matched + 0.000001)
    subjectivity_score = matched / (len(words) + 0.000001)
    return pos_score, neg_score, polarity_score, subjectivity_score

# Function for readability metrics
def readability_metrics(text):
    """Compute readability statistics for a raw document.

    Sentences come from ``sent_tokenize``; words are the tokens returned by
    ``clean_text`` (lowercased, alphabetic, stop words removed), so the word
    count excludes stop words. A word is "complex" when ``syllable_count``
    reports more than two syllables.

    Returns:
        tuple: ``(avg_sentence_length, perc_complex_words, fog_index,
        complex_word_count, word_count)``. Both ratios are ``0.0`` when their
        denominator is zero (empty text, or text made only of stop words or
        punctuation) instead of raising ``ZeroDivisionError``.
    """
    sentences = sent_tokenize(text)
    words = clean_text(text)
    num_sentences = len(sentences)
    num_words = len(words)
    complex_words = [word for word in words if syllable_count(word) > 2]
    # Guard both divisions: an empty or stop-word-only file must not abort
    # the whole batch.
    avg_sentence_length = num_words / num_sentences if num_sentences else 0.0
    perc_complex_words = len(complex_words) / num_words if num_words else 0.0
    fog_index = 0.4 * (avg_sentence_length + perc_complex_words)
    return avg_sentence_length, perc_complex_words, fog_index, len(complex_words), num_words

# Count syllables in a word
def syllable_count(word):
    """Estimate the number of syllables in ``word``.

    Heuristic: each run of consecutive vowels (a, e, i, o, u) counts as one
    syllable, and a trailing "es" or "ed" is ignored because it is usually
    silent. Counting every vowel letter separately, as before, reported
    5 syllables for "beautiful" and inflated the complex-word count.

    Returns:
        int: ``0`` for an empty string, otherwise at least ``1``.
    """
    word = word.lower()
    if not word:
        return 0

    # Drop a silent "es"/"ed" ending, but leave two-letter words intact.
    if len(word) > 2 and word.endswith(("es", "ed")):
        word = word[:-2]

    count = 0
    previous_was_vowel = False
    for letter in word:
        is_vowel = letter in "aeiou"
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    # Every non-empty word is pronounced with at least one syllable.
    return max(count, 1)

# Count personal pronouns
def count_personal_pronouns(text):
    """Count occurrences of the pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive and whole-word. The all-caps token "US" is
    excluded because it is almost always the country abbreviation, not the
    pronoun.

    Returns:
        int: Number of pronoun occurrences.
    """
    pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    return sum(1 for pronoun in pronouns if pronoun != "US")

# Compute average word length
def avg_word_length(words):
    """Return the mean number of characters per word.

    Args:
        words: Sequence of word strings.

    Returns:
        float: Total characters divided by the number of words, or ``0.0``
        for an empty sequence (previously a ``ZeroDivisionError``).
    """
    if len(words) == 0:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Load input file
input_df = pd.read_excel(INPUT_FILE)

# Process each extracted text file
results = []
for index, row in input_df.iterrows():
    filename = f"{row['URL_ID']}.txt"  # Assuming the URL_ID corresponds to the filename
    url = row['URL']  # Get the URL for the current row
    file_path = os.path.join(EXTRACTED_TEXT_FOLDER, filename)

    # A missing file used to be dropped without any message, leaving
    # unexplained gaps in the output workbook; report it instead.
    if not os.path.isfile(file_path):
        print(f"Skipping {row['URL_ID']}: {file_path} not found")
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    words = clean_text(text)
    if not words:
        # The per-word averages below divide by len(words); report the file
        # and move on rather than aborting the whole batch.
        print(f"Skipping {row['URL_ID']}: no analysable words in {file_path}")
        continue

    pos_score, neg_score, polarity, subjectivity = sentiment_analysis(words)
    avg_sent_len, perc_complex, fog_idx, complex_count, word_count = readability_metrics(text)
    syllable_per_word = sum(syllable_count(word) for word in words) / len(words)  # Calculating syllables per word
    personal_pronouns = count_personal_pronouns(text)
    avg_word_len = avg_word_length(words)

    # avg_sent_len is written twice on purpose: it feeds both the
    # "Avg Sentence Length" and "Avg Number Words Per Sentence" columns.
    results.append([
        row["URL_ID"], url, pos_score, neg_score, polarity, subjectivity, avg_sent_len,
        perc_complex, fog_idx, avg_sent_len, complex_count, word_count,
        syllable_per_word, personal_pronouns, avg_word_len
    ])

# Save output to Excel
output_df = pd.DataFrame(results, columns=[
    "URL_ID", "URL", "Positive Score", "Negative Score", "Polarity Score", "Subjectivity Score",
    "Avg Sentence Length", "Percentage of Complex Words", "Fog Index", "Avg Number Words Per Sentence",
    "Complex Word Count", "Word Count", "Syllable Per Word", "Personal Pronouns", "Avg Word Length"
])
output_df.to_excel(OUTPUT_FILE, index=False)

print("Analysis completed. Output saved to", OUTPUT_FILE)

Analysis completed. Output saved to Output_script.xlsx
