In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK resources (if not already downloaded).
# 'punkt' serves older NLTK releases; 'punkt_tab' is the tokenizer data that
# word_tokenize/sent_tokenize load on NLTK >= 3.9, where 'punkt' alone leads
# to a LookupError at tokenisation time.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Function to read stop words from files
def read_stop_words(files):
    """Build one lower-cased stop-word set from several word-list files.

    Every file is opened as UTF-8 text and read line by line. Each line is
    stripped of surrounding whitespace and lower-cased before it is added.
    Blank lines are skipped so the empty string never ends up in the set.

    Args:
        files: Iterable of paths to text files holding one entry per line.

    Returns:
        A set with the normalised entries of all files combined.

    Raises:
        OSError: If a file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    stop_words = set()
    for path in files:
        with open(path, 'r', encoding='utf-8') as handle:
            # Iterate the handle directly instead of materialising readlines().
            for line in handle:
                entry = line.strip().lower()
                if entry:
                    stop_words.add(entry)
    return stop_words

# Function to clean text
def clean_text(text):
    """Tokenise *text* and return its purely alphabetic tokens in lower case.

    Tokens for which ``str.isalpha()`` is false (numbers, punctuation, mixed
    tokens) are discarded; the remaining tokens keep their original order.
    """
    return [token.lower() for token in word_tokenize(text) if token.isalpha()]

# Function for NLP analysis
def nlp_analysis(text):
    """Compute sentiment and readability metrics for *text*.

    Stop words and the positive/negative dictionaries are loaded from text
    files in the current working directory on every call. Words are the
    lower-cased alphabetic tokens returned by ``clean_text``; sentences come
    from ``sent_tokenize`` applied to the raw text.

    Metrics:
        * Positive/Negative Score: number of words found in the respective
          dictionary that are not stop words.
        * Polarity Score: (pos - neg) / (pos + neg + 1e-6).
        * Subjectivity Score: (pos + neg) / (word count + 1e-6).
        * Complex words: words for which ``syllable_count`` returns more
          than 2.
        * Fog Index: 0.4 * (average sentence length + percentage of complex
          words).

    Text without any sentence or without any alphabetic word yields 0.0 for
    the affected averages/percentages instead of raising ZeroDivisionError.

    Args:
        text: Raw text to analyse.

    Returns:
        dict mapping the metric names (e.g. "Positive Score", "Fog Index",
        "Word Count") to their numeric values.

    Raises:
        OSError: If one of the word-list files cannot be opened.
    """
    stop_words_files = [
        "StopWords_Auditor.txt",
        "StopWords_currencies.txt",
        "StopWords_DatesandNumbers.txt",
        "StopWords_Generic.txt",
        "StopWords_GenericLong.txt",
        "StopWords_Names.txt",
        "StopWords_Geographic.txt"
    ]
    stop_words = read_stop_words(stop_words_files)

    positive_words_file = "positive-words.txt"
    negative_words_file = "negative-words.txt"

    with open(positive_words_file, 'r', encoding='utf-8') as f:
        positive_words = {line.strip().lower() for line in f}

    with open(negative_words_file, 'r', encoding='utf-8') as f:
        negative_words = {line.strip().lower() for line in f}

    cleaned_text = clean_text(text)

    positive_score = sum(1 for word in cleaned_text if word in positive_words and word not in stop_words)
    negative_score = sum(1 for word in cleaned_text if word in negative_words and word not in stop_words)

    # The small epsilon keeps both ratios defined when the counts are zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_text) + 0.000001)

    total_words = len(cleaned_text)
    total_sentences = len(sent_tokenize(text))

    # Empty or whitespace-only pages produce no sentences; report 0.0 rather
    # than crashing so one bad page cannot abort a whole batch run.
    average_sentence_length = total_words / total_sentences if total_sentences else 0.0
    average_words_per_sentence = average_sentence_length

    # Count syllables once per word and reuse the result for every metric.
    syllable_counts = [syllable_count(word) for word in cleaned_text]
    complex_word_count = sum(1 for syllables in syllable_counts if syllables > 2)

    if total_words:
        percentage_complex_words = complex_word_count / total_words * 100
        syllable_count_per_word = sum(syllable_counts) / total_words
        average_word_length = sum(len(word) for word in cleaned_text) / total_words
    else:
        # No alphabetic tokens at all: every per-word ratio is defined as 0.0.
        percentage_complex_words = 0.0
        syllable_count_per_word = 0.0
        average_word_length = 0.0

    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    # NOTE(review): tokens are already lower-cased by clean_text, so an
    # upper-case "US" in the page is counted as the pronoun "us" as well.
    pronouns = {'i', 'we', 'my', 'ours', 'us'}
    personal_pronouns = sum(1 for word in cleaned_text if word in pronouns)

    analysis_result = {
        "Positive Score": positive_score,
        "Negative Score": negative_score,
        "Polarity Score": polarity_score,
        "Subjectivity Score": subjectivity_score,
        "Average Sentence Length": average_sentence_length,
        "Percentage of Complex Words": percentage_complex_words,
        "Fog Index": fog_index,
        "Average Number of Words Per Sentence": average_words_per_sentence,
        "Complex Word Count": complex_word_count,
        "Word Count": total_words,
        "Syllable Count Per Word": syllable_count_per_word,
        "Personal Pronouns": personal_pronouns,
        "Average Word Length": average_word_length
    }

    return analysis_result

def syllable_count(word):
    """Estimate the number of syllables in *word* with a vowel-group heuristic.

    Rules, applied to the lower-cased word:
        1. Every maximal run of vowels (a, e, i, o, u, y) counts once.
        2. A trailing "e" is treated as silent and subtracts one.
        3. A trailing "le" preceded by a consonant (as in "table") adds one
           back.
        4. A non-empty word never yields fewer than one syllable, so words
           such as "the" or "be" are not reduced to zero by rule 2.

    Args:
        word: The word to inspect; may be empty.

    Returns:
        The estimated syllable count: 0 for the empty string, otherwise at
        least 1.
    """
    vowels = "aeiouy"
    word = word.lower()
    if not word:
        # Guard against IndexError on word[0] below.
        return 0

    count = 1 if word[0] in vowels else 0
    # A vowel that follows a non-vowel starts a new vowel group.
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if word.endswith("le") and len(word) > 2 and word[-3] not in vowels:
        count += 1
    return max(count, 1)

def process_url_entry(url_id, url):
    """Load *url* in headless Chrome, analyse its visible text and store it.

    The page is opened with Selenium, the function waits up to 20 seconds for
    ``document.readyState`` to become ``'complete'``, extracts the page text
    with BeautifulSoup, runs ``nlp_analysis`` on it, prints every metric and
    appends the result via ``save_to_csv``.

    Timeouts and other WebDriver errors (for example an unreachable or
    malformed URL) are reported on stdout and do not propagate, so a batch
    run can continue with the next entry. The browser is always closed.

    Args:
        url_id: Identifier of the entry; printed and written to the CSV.
        url: Address of the page to load.
    """
    # Imported here so this function does not depend on a file-level import
    # that the original module does not have.
    from selenium.common.exceptions import WebDriverException

    print(f"Processing URL ID: {url_id}")
    options = Options()
    # The `Options.headless` property was deprecated in Selenium 4.8 and
    # removed in later 4.x releases, where assigning it is silently ignored
    # and a visible browser window opens. The command-line switch works on
    # every Selenium version.
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            lambda drv: drv.execute_script('return document.readyState') == 'complete'
        )

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        extracted_text = soup.get_text(separator='\n', strip=True)
        analysis_result = nlp_analysis(extracted_text)
        print("NLP Analysis Result:")
        for key, value in analysis_result.items():
            print(f"{key}: {value}")

        save_to_csv(url_id, url, analysis_result)

    except TimeoutException:
        print("Timed out waiting for page to load")
    except WebDriverException as exc:
        # TimeoutException is a subclass and is handled above; this branch
        # covers navigation/browser failures for a single URL.
        print(f"Browser error while processing URL ID {url_id}: {exc}")
    finally:
        driver.quit()

def save_to_csv(url_id, url, analysis_result):
    """Append one analysis result as a row to the output CSV file 'x.csv'.

    The metric names used by the analysis dictionary are translated into the
    upper-case column headers of the CSV. After writing, the tail of the
    updated table and a confirmation message are printed.

    Args:
        url_id: Value for the 'URL_ID' column.
        url: Value for the 'URL' column.
        analysis_result: Mapping containing every metric key listed below.

    Raises:
        KeyError: If *analysis_result* lacks one of the expected keys.
    """
    # (CSV column header, key in analysis_result) in output column order.
    column_sources = (
        ('POSITIVE SCORE', 'Positive Score'),
        ('NEGATIVE SCORE', 'Negative Score'),
        ('POLARITY SCORE', 'Polarity Score'),
        ('SUBJECTIVITY SCORE', 'Subjectivity Score'),
        ('AVG SENTENCE LENGTH', 'Average Sentence Length'),
        ('PERCENTAGE OF COMPLEX WORDS', 'Percentage of Complex Words'),
        ('FOG INDEX', 'Fog Index'),
        ('AVG NUMBER OF WORDS PER SENTENCE', 'Average Number of Words Per Sentence'),
        ('COMPLEX WORD COUNT', 'Complex Word Count'),
        ('WORD COUNT', 'Word Count'),
        ('SYLLABLE PER WORD', 'Syllable Count Per Word'),
        ('PERSONAL PRONOUNS', 'Personal Pronouns'),
        ('AVG WORD LENGTH', 'Average Word Length'),
    )

    record = {'URL_ID': [url_id], 'URL': [url]}
    for column, metric in column_sources:
        record[column] = [analysis_result[metric]]

    csv_file = 'x.csv'
    updated_csv_df = create_or_update_csv_file(csv_file, pd.DataFrame(record))
    print("Last few rows of the updated CSV file:")
    print(updated_csv_df.tail())
    print(f"Data has been successfully added to '{csv_file}'")

def create_or_update_csv_file(file_path, new_data):
    """Append *new_data* to the CSV at *file_path*, creating it if needed.

    The existing file is read completely, the new rows are concatenated below
    the existing ones with a fresh index, and the whole table is written back
    without the index column. A missing file, or an existing but completely
    empty file, is treated as "no previous rows".

    Args:
        file_path: Path of the CSV file to create or extend.
        new_data: DataFrame holding the rows to append.

    Returns:
        The DataFrame that was written to *file_path*.
    """
    try:
        existing_df = pd.read_csv(file_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # EmptyDataError: the file exists but has no header/content, e.g.
        # after an interrupted earlier run; start over with the new rows.
        updated_df = new_data
    else:
        updated_df = pd.concat([existing_df, new_data], ignore_index=True)

    updated_df.to_csv(file_path, index=False)
    return updated_df

# Load the work list; each row supplies the 'URL_ID' and 'URL' of one page.
input_df = pd.read_excel('Input.xlsx')

# Process the pages one after another in spreadsheet order.
for _, row in input_df.iterrows():
    process_url_entry(row['URL_ID'], row['URL'])
