In [13]:
import requests
import os
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

In [6]:
def extract_article_text_selenium(url, driver):
    try:
        #Open the URL
        driver.get(url)
        driver.implicitly_wait(10)
        #Get the page source
        page_source = driver.page_source

        #parse the HTML content
        soup = BeautifulSoup(page_source, 'html.parser')

        paragraphs = soup.find_all('p')

        article_text = '\n'.join([paragraph.get_text() for paragraph in paragraphs])

        return article_text
    except Exception as e:
        print(f"Error extracting article text: {str(e)}")
        return None

def scrape_and_save_selenium(input_file_path, output_folder_path, chrome_driver_path):
    #Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    #Set up the Chrome service
    service = ChromeService(chrome_driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)  # Set a timeout of 30 seconds (adjust as needed)

    df = pd.read_excel(input_file_path)

    for index, row in df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']

        try:
            article_text = extract_article_text_selenium(url, driver)

            if article_text:
                # Create a folder with the URL_ID if it doesn't exist
                folder_path = os.path.join(output_folder_path, f"{url_id}")
                os.makedirs(folder_path, exist_ok=True)

                with open(os.path.join(folder_path, f"{url_id}.txt"), "w", encoding="utf-8") as file:
                    file.write(article_text)

                print(f"Scraping completed for URL_ID: {url_id}")
            else:
                print(f"Skipping URL_ID {url_id} due to extraction error.")
        except TimeoutException:
            print(f"Timeout while loading the page: {url}")
        except Exception as e:
            print(f"Error scraping URL_ID {url_id}: {str(e)}")

    driver.quit()

In [7]:
output_folder_path=("D:\internship\output")
chrome_driver_path=("D:\internship\chromedriver.exe")
input_file_path = ("Input.xlsx")


In [8]:
scrape_and_save_selenium(input_file_path, output_folder_path, chrome_driver_path)


Scraping completed for URL_ID: blackassign0001
Scraping completed for URL_ID: blackassign0002
Scraping completed for URL_ID: blackassign0003
Scraping completed for URL_ID: blackassign0004
Scraping completed for URL_ID: blackassign0005
Scraping completed for URL_ID: blackassign0006
Scraping completed for URL_ID: blackassign0007
Scraping completed for URL_ID: blackassign0008
Scraping completed for URL_ID: blackassign0009
Scraping completed for URL_ID: blackassign0010
Scraping completed for URL_ID: blackassign0011
Scraping completed for URL_ID: blackassign0012
Scraping completed for URL_ID: blackassign0013
Scraping completed for URL_ID: blackassign0014
Scraping completed for URL_ID: blackassign0015
Scraping completed for URL_ID: blackassign0016
Scraping completed for URL_ID: blackassign0017
Scraping completed for URL_ID: blackassign0018
Scraping completed for URL_ID: blackassign0019
Scraping completed for URL_ID: blackassign0020
Scraping completed for URL_ID: blackassign0021
Scraping comp

In [68]:
stopwords_file_path = r"D:\internship\StopWords\StopWords_Auditor.txt"

def read_stopwords_from_file(stopwords_file_path):
    with open(stopwords_file_path, 'r', encoding='latin-1') as file:
        stopwords_lines = file.readlines()
    stop_words = [line.strip() for line in stopwords_lines]
    return set(stop_words)

stop_words = read_stopwords_from_file(stopwords_file_path)

print("Stop Words:")
a=(stop_words)
print(a)


Stop Words:
{'YOUNG', 'TOUCHE', 'PRICEWATERHOUSECOOPERS', 'KPMG', 'COOPERS', 'PRICEWATERHOUSE', 'ERNST', 'DELOITTE'}


In [69]:
stopwords_file_path = r"D:\internship\StopWords\StopWords_Generic.txt"

def read_stopwords_from_file(stopwords_file_path):
    with open(stopwords_file_path, 'r', encoding='latin-1') as file:
        stopwords_lines = file.readlines()
    stop_words = [line.strip() for line in stopwords_lines]
    return set(stop_words)

stop_words = read_stopwords_from_file(stopwords_file_path)

print("Stop Words:")
b=(stop_words)
print(b)

Stop Words:
{'ON', 'WHICH', 'HAVING', 'WE', 'AT', 'HAVE', 'ONLY', 'THERE', 'HAD', 'AGAIN', 'HIS', 'THE', 'IS', 'WHERE', 'MOST', 'HERSELF', 'THEY', 'ABOUT', 'SAME', 'DOING', 'VERY', 'NO', 'HER', 'ARE', 'WITH', 'HAS', 'AS', 'INTO', 'NOR', 'ITS', 'WHAT', 'OVER', 'WAS', 'THEIRS', 'WHILE', 'FROM', 'TOO', 'ONCE', 'YOURSELF', 'UP', 'UNTIL', 'ITSELF', 'OUT', 'SHOULD', 'IN', 'THEM', 'MY', 'THEMSELVES', 'BUT', 'OWN', 'ANY', 'AFTER', 'OF', 'BELOW', 'YOURS', 'CAN', 'FEW', 'OURS', 'EACH', 'SHE', 'OTHER', 'THEN', 'WHOM', 'ME', 'AN', 'SO', 'IF', 'SOME', 'THESE', 'THOSE', 'OR', 'WHY', 'ALL', 'DID', 'BOTH', 'IT', 'NOW', 'THAT', 'YOU', 'FURTHER', 'DOES', 'BY', 'MYSELF', 'MORE', 'HE', 'BECAUSE', 'ABOVE', 'THAN', 'OFF', 'DOWN', 'HIMSELF', 'OURSELVES', 'DO', 'DURING', 'SUCH', 'YOUR', 'BE', 'HIM', 'UNDER', 'HERS', 'BEING', 'HERE', 'AM', 'BEFORE', 'THROUGH', 'HOW', 'NOT', 'AMONG', 'WHEN', 'WHO', 'THIS', 'THEIR', 'TO', 'AND', 'YOURSELVES', 'JUST', 'WERE', 'FOR', 'OUR', 'BEEN', 'BETWEEN'}


In [70]:
stopwords_file_path = r"D:\internship\StopWords\StopWords_GenericLong.txt"

def read_stopwords_from_file(stopwords_file_path):
    with open(stopwords_file_path, 'r', encoding='latin-1') as file:
        stopwords_lines = file.readlines()
    stop_words = [line.strip() for line in stopwords_lines]
    return set(stop_words)

stop_words = read_stopwords_from_file(stopwords_file_path)

print("Stop Words:")
c=(stop_words)
print(c)

Stop Words:
{'keep', 'ones', 'wonder', 'oh', 'accordingly', 'becomes', 'downwards', 'happens', 'through', 'ltd', 'perhaps', 'was', 'namely', 'u', 'only', 'believe', 'how', 'saying', 'un', 'are', 'further', 'mean', 'both', 'whereupon', 'w', 'when', 'clearly', 'relatively', "they'd", 'what', "they've", 'x', 'getting', 'new', 'etc', 'taken', 'upon', 'had', 'hers', 'particularly', 'his', "isn't", 'appreciate', 'really', 'there', 'towards', 'concerning', "i'd", 'ex', 'definitely', 'sometime', 'said', 'thanx', "we're", 'whence', 'g', "doesn't", 'want', "c's", 'contains', 'eg', 'yours', 'look', 'on', "a's", 'o', 'else', 'beyond', 'having', 'where', 'exactly', 'although', 'uucp', 'indicated', 'cannot', 'sent', 'according', 'former', 'specifying', 'onto', 'go', 'que', 'follows', 'entirely', 'they', 'p', 'zero', 'apart', 'therein', 'nevertheless', 'seemed', 'consider', 'necessary', 'another', 'third', 'tends', 'the', 'while', 'goes', "they're", 'into', 'often', 'somewhat', 'went', 'least', 'besi

**for stop words which were in WORD | Category**

In [71]:
stopwords_file_path = r"D:\internship\StopWords\StopWords_Currencies.txt"

def read_stopwords_from_file(stopwords_file_path):
    with open(stopwords_file_path, 'r', encoding='latin-1') as file:
        stopwords_lines = file.readlines()
    stop_words = [line.split('|')[0].strip() for line in stopwords_lines]
    return set(stop_words)

stop_words = read_stopwords_from_file(stopwords_file_path)

print("Stop Words:")
d=(stop_words)
print(d)

Stop Words:
{'FLORIN', 'LIRA', 'LOTI', 'KINA', 'KUNA', 'NGULTRUM', 'PATACA', 'DIRHAM', 'DINAR', 'GULDEN', 'DENAR', 'POUND', 'BOLIVAR', 'KIP', 'KROON', 'CÓRDOBA', 'MANAT', 'BALBOA', 'KRONA', 'TAKA', 'TALA', 'RIEL', 'METICAL', 'NAIRA', 'BOLIVIANO', 'SOMONI', 'SHILLING', 'BAHT', 'BIRR', 'RENMINBI', 'DONG', 'NUEVO SOL', 'RAND', 'ARIARY', 'GOURDE', 'LEONE', 'DOBRA', 'FORINT', 'NAKFA', 'KRONE', 'PESO', 'LEK', 'LATS', 'OUGUIYA', 'KORUNA', 'RUPIAH', 'PULA', 'RIYAL', 'EURO', 'DALASI', 'KWANZA', 'REAL', 'TUGRIK', 'LEMPIRA', 'RINGGIT', 'SOM', 'RUPEE', 'YEN', 'KYAT', 'HRYVNIA', 'RUFIYAA', 'LITAS', 'LEU', 'LARI', 'QUETZAL', 'WON', 'KWACHA', 'GUARANI', 'SPECIAL DRAWING RIGHTS', 'TENGE', 'COLON', 'KONVERTIBILNA MARKA', 'NEW LIRA', 'CEDI', 'ESCUDO', 'AFGHANI', 'VATU', 'DRAM', 'LILANGENI', 'RIAL', 'RUBLE', 'NEW SHEQEL', 'LEV', 'ZLOTY'}


**for stop words which were in WORD | Category form as well as normal words**

In [72]:
stopwords_file_path = r"D:\internship\StopWords\StopWords_DatesandNumbers.txt"
def read_stopwords_from_file(stopwords_file_path):
    with open(stopwords_file_path, 'r', encoding='latin-1') as file:
        stopwords_lines = file.readlines()
    stop_words = set()
    for line in stopwords_lines:
        parts = line.split('|')
        
        if len(parts) > 1:
            word = parts[0].strip()
            stop_words.add(word)
        else:
            word = line.strip()
            stop_words.add(word)

    return stop_words

stop_words = read_stopwords_from_file(stopwords_file_path)

print("Stop Words:")
e=(stop_words)
print(e)

Stop Words:
{'II', 'NOV', 'JUN', 'NINTH', 'FOUR', 'HUNDRED', 'WEEK', 'TRILLION', 'APRIL', 'SIXTEEN', 'NOVEMBER', 'DAILY', 'QUARTERLY', 'XI', 'SEVENTH', 'X', 'YEARLY', 'OCTOBER', 'SUNDAY', 'I', 'FIFTY', 'IV', 'JULY', 'MARCH', 'TWO', 'MAR', 'WEEKLY', 'XVI', 'AUG', 'JAN', 'XX', 'SEPTEMBER', 'MONDAY', 'MONTH', 'FORTY', 'ANNUM', 'EIGHTH', 'THREE', 'JUL', 'DECEMBER', 'XVIII', 'FIFTH', 'XIX', 'SEVENTY', 'FIRST', 'III', 'OCT', 'SEVEN', 'THOUSAND', 'TWENTY', 'NINE', 'ONE', 'EIGHTY', 'TWELVE', 'FRIDAY', 'FEBRUARY', 'YEAR', 'SIXTY', 'TEN', 'FIFTEEN', 'NINETY', 'TENTH', 'THIRTY', 'IX', 'MAY', 'FOURTH', 'SATURDAY', 'JUNE', 'XII', 'ANNUAL', 'SIX', 'XIII', 'SEVENTEEN', 'VII', 'SEPT', 'QUARTER', 'WEDNESDAY', 'AUGUST', 'FIVE', 'V', 'XV', 'VI', 'NINETEEN', 'EIGHTEEN', 'SIXTH', 'THIRTEEN', 'JANUARY', 'SECOND', 'DATE', 'SEP', 'ANNUALLY', 'TUESDAY', 'XIV', 'THIRD', 'VIII', 'FOURTEEN', 'XVII', 'DEC', 'THURSDAY', 'MILLION', 'APR', 'BILLION', 'MONTHLY', 'FEB', 'ELEVEN', 'QTR', 'EIGHT', 'DAY'}


In [73]:
stopwords_file_path = r"D:\internship\StopWords\StopWords_Geographic.txt"
def read_stopwords_from_file(stopwords_file_path):
    with open(stopwords_file_path, 'r', encoding='latin-1') as file:
        stopwords_lines = file.readlines()
    stop_words = set()
    for line in stopwords_lines:
        parts = line.split('|')
        if len(parts) > 1:
            word = parts[0].strip()
            stop_words.add(word)
        else:
            word = line.strip()
            stop_words.add(word)

    return stop_words

stop_words = read_stopwords_from_file(stopwords_file_path)

print("Stop Words:")
f=(stop_words)
print(f)

Stop Words:
{'WEST', 'NORTHEAST', 'ANGELES', 'SOUTHEAST', 'KARACHI', 'BANGLADESH', 'MEXICO', 'ISLAND', 'VIRGINIA', 'IDAHO', 'DETROIT', 'MEXICAN', 'COLUMBUS', 'DENMARK', 'ILLINOIS', 'NOVA', 'STATE', 'IOWA', 'FLORIDA', 'OREGON', 'SOUTHWEST', 'GERMANY', 'KOREA', 'SEA', 'SAO', 'SWITZERLAND', 'INDIANA', 'DALLAS', 'ALASKA', 'MUMBAI', 'SUISSE', 'BELGIUM', 'CZECK', 'TAIWAN', 'LUXEMBOURG', 'CHINA', 'DAKOTA', 'MINNEAPOLIS', 'VIETNAM', 'NIGERIA', 'COLUMBIA', 'TURKEY', 'MONTANA', 'DELHI', 'MISSOURI', 'MALAYSIA', 'ATLANTA', 'NETHERLANDS', 'BLVD', 'OHIO', 'LOUIS', 'LAKE', 'CHICAGO', 'HOLLYWOOD', 'SINGAPORE', 'ARIZONA', 'CALIFORNIA', 'FRANSICO', 'MANHATTAN', 'RICA', 'PHILIPPINES', 'ASIA', 'DIEGO', 'MEMPHIS', 'RIVER', 'PITTSBURGH', 'FRANCE', 'EUROPEAN', 'HOUSTON', 'BEIJING', 'HAMPSHIRE', 'WISCONSIN', 'OKLAHOMA', 'NEBRASKA', 'ITALIAN', 'PORTUGAL', 'JERSEY', 'YORK', 'STREET', 'VENEZUELA', 'RUSSIA', 'VERMONT', 'PARKWAY', 'MIAMI', 'REPUBLIC', 'CHILEAN', 'JAPANESE', 'BERMUDA', 'TEXAS', 'MEDITERRANEAN', 'SC

In [74]:
stopwords_file_path = r"D:\internship\StopWords\StopWords_Names.txt"
def read_stopwords_from_file(stopwords_file_path):
    with open(stopwords_file_path, 'r', encoding='latin-1') as file:
        stopwords_lines = file.readlines()
    stop_words = set()

    for line in stopwords_lines:
        parts = line.split('|')
        
        if len(parts) > 1:
            word = parts[0].strip()
            stop_words.add(word)
        else:
            word = line.strip()
            stop_words.add(word)
    return stop_words

stop_words = read_stopwords_from_file(stopwords_file_path)

print("Stop Words:")
g=(stop_words)
print(g)

Stop Words:
{'KALA', 'OVERTON', 'HENDRIX', 'KHOURY', 'LATOSHA', 'LE', 'BALDERAS', 'RULE', 'JUSTINE', 'MATTHEWS', 'CYR', 'MUNGUIA', 'SHAROLYN', 'BRUNA', 'JOESPH', 'BOLDEN', 'MACKENZIE', 'MCMANUS', 'IVANA', 'PARENT', 'SAMUAL', 'TURNER', 'GERMANY', 'SULLIVAN', 'CORNWELL', 'EARNEST', 'KARINA', 'SNYDER', 'WILMOT', 'YOLONDA', 'HOSTETTER', 'JACOB', 'JABLONSKI', 'MARIANN', 'HERZOG', 'SHUMWAY', 'VICKERY', 'MACHELLE', 'MCLAIN', 'DANYELL', 'JENNIFER', 'HAMILTON', 'AUDRIE', 'KARLEEN', 'BEESON', 'RAYMON', 'DEVILLE', 'LEANORA', 'MARTINE', 'PALMER', 'GILBERTE', 'AMADA', 'LEZLIE', 'KINDER', 'JOLINE', 'LEXIE', 'DOLAN', 'GISELE', 'JACQUIE', 'FELDER', 'BRADBURY', 'CUSHING', 'FORTENBERRY', 'KAILA', 'LAROCHE', 'COWARD', 'MICKELSON', 'RIVA', 'NEGRETE', 'ORELLANA', 'FERNANDA', 'MERISSA', 'WILLOUGHBY', 'ADAMS', 'BELL', 'SNOW', 'TIESHA', 'CARLYN', 'SHEMEKA', 'SETZER', 'BEERY', 'LERNER', 'MINTER', 'ALVES', 'BAUGHER', 'PIERSON', 'MCGILL', 'OTILIA', 'SCANLON', 'BECNEL', 'GODWIN', 'DESANTIS', 'DARIO', 'TASHA', 'CH

In [75]:
combine=a.union(b,c,d,e,f,g)
combine

{'KALA',
 'OVERTON',
 'HENDRIX',
 'KHOURY',
 'LATOSHA',
 'LE',
 'BALDERAS',
 'RULE',
 'JUSTINE',
 'MATTHEWS',
 'CYR',
 'MUNGUIA',
 'SHAROLYN',
 'BRUNA',
 'JOESPH',
 'ltd',
 'BOLDEN',
 'MACKENZIE',
 'MCMANUS',
 'IVANA',
 'PARENT',
 'SAMUAL',
 'TURNER',
 'GERMANY',
 'SULLIVAN',
 'CORNWELL',
 'EARNEST',
 'KARINA',
 'SNYDER',
 'WILMOT',
 'YOLONDA',
 'HOSTETTER',
 'JACOB',
 'JABLONSKI',
 'MARIANN',
 'HERZOG',
 'SHUMWAY',
 'VICKERY',
 'MACHELLE',
 'MCLAIN',
 'DANYELL',
 'JENNIFER',
 'HAMILTON',
 'AUDRIE',
 'KARLEEN',
 'BEESON',
 'RAYMON',
 'DEVILLE',
 'LEANORA',
 'MARTINE',
 'PALMER',
 'GILBERTE',
 'AMADA',
 'LEZLIE',
 'KINDER',
 'JOLINE',
 'LEXIE',
 'DOLAN',
 'GISELE',
 'JACQUIE',
 'FELDER',
 'BRADBURY',
 'CUSHING',
 'FORTENBERRY',
 'KAILA',
 'LAROCHE',
 'COWARD',
 'MICKELSON',
 'OKLAHOMA',
 'RIVA',
 'NEGRETE',
 'ORELLANA',
 'FERNANDA',
 'MERISSA',
 'WILLOUGHBY',
 'ADAMS',
 'BELL',
 'SNOW',
 'TIESHA',
 'CARLYN',
 'SHEMEKA',
 'SETZER',
 'BEERY',
 'LERNER',
 'MINTER',
 'ALVES',
 'BAUGHER',
 '

**cleaning the articles by removing the stop words**

In [76]:
def clean_text(article_text, combine):
    words = article_text.split()
    cleaned_words = [word for word in words if word.lower() not in combine]
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

def clean_articles(input_root_folder, stopwords_set):
    for folder_name in os.listdir(input_root_folder):
        folder_path = os.path.join(input_root_folder, folder_name)

        if os.path.isdir(folder_path):
            for file_name in os.listdir(folder_path):
                if file_name.endswith('.txt'):
                    file_path = os.path.join(folder_path, file_name)

                    with open(file_path, 'r', encoding='utf-8') as file:
                        article_text = file.read()

                    cleaned_text = clean_text(article_text, stopwords_set)

                    # Save the cleaned text back to the file
                    with open(file_path, 'w', encoding='utf-8') as file:
                        file.write(cleaned_text)

input_root_folder = "D:\internship\output"
clean_articles(input_root_folder, combine)


*code to check if the articles are cleaned and display the uncleaned articles*

In [77]:
def clean_text(article_text, combine):
    words = article_text.split() 
    cleaned_words = [word for word in words if word.lower() not in combine]
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

def check_cleaned_status(folder_path, stopwords_set):
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                original_text = file.read()

            cleaned_text = clean_text(original_text, stopwords_set)
            if original_text != cleaned_text:
                print(f"Article in file {file_path} is not cleaned.")

root_folder = "D:\internship\output"

for folder_name in os.listdir(root_folder):
    folder_path = os.path.join(root_folder, folder_name)
    
    if os.path.isdir(folder_path):
        check_cleaned_status(folder_path, combine)



**creating document for positive words**

In [78]:
from docx import Document

def read_positive_words_from_docx(docx_path):
    positive_words = set()

    doc = Document(docx_path)
    for paragraph in doc.paragraphs:
        words = paragraph.text.split()
        positive_words.update(words)

    return positive_words

docx_path = 'D:\internship\positive-words.docx'
positive_words_dict = {'positive': read_positive_words_from_docx(docx_path)}

print("Positive Words:")
pos=(positive_words_dict['positive'])
pos

Positive Words:


{'adoring',
 'unbiased',
 'reformed',
 'gaining',
 'well-behaved',
 'impeccable',
 'idolize',
 'exultingly',
 'cheer',
 'effusive',
 'prettily',
 'thoughtfully',
 'succeeding',
 'fortunate',
 'steadiness',
 'unencumbered',
 'eased',
 'appealing',
 'graciousness',
 'jaw-droping',
 'striving',
 'ideal',
 'nifty',
 'solicitously',
 'gratified',
 'astounded',
 'intriguing',
 'statuesque',
 'poise',
 'novelty',
 'extraordinarily',
 'gladly',
 'freshest',
 'steadfastly',
 'cheapest',
 'aspire',
 'attentive',
 'convincing',
 'cozy',
 'amicable',
 'elite',
 'gratefully',
 'generously',
 'leads',
 'examplary',
 'rockstars',
 'revolutionary',
 'refund',
 'snappy',
 'recovery',
 'achievible',
 'celebrated',
 'durable',
 'overtake',
 'commodious',
 'dauntless',
 'readily',
 'revives',
 'salutary',
 'idol',
 'eloquently',
 'brilliances',
 'openness',
 'fabulous',
 'intrigue',
 'gusto',
 'accurately',
 'courageousness',
 'straighten',
 'acclamation',
 'favorited',
 'effusively',
 'considerate',
 'su

creating document for negeative words

In [79]:


def read_words_from_docx(docx_path):
    words = set()

    doc = Document(docx_path)
    for paragraph in doc.paragraphs:
        words.update(paragraph.text.split())

    return words

negative_docx_path = 'D:\\internship\\negative-words.docx'

negative_words_dict = {'negative': read_words_from_docx(negative_docx_path)}

print("Negative Words:")
neg=(negative_words_dict['negative'])
neg

Negative Words:


{'mispronounces',
 'muscle-flexing',
 'ironies',
 'absence',
 'nervously',
 'distress',
 'conspiracy',
 'less-developed',
 'taint',
 'delaying',
 'unsatisfactory',
 'confusions',
 'starve',
 'hedge',
 'misguide',
 'impertinent',
 'trauma',
 'paralize',
 'hesitant',
 'incomplete',
 'itch',
 'overbearing',
 'unlawfulness',
 'reprove',
 'sorrowful',
 'asinine',
 'attacks',
 'annoyed',
 'measly',
 'sever',
 'debase',
 'calumniously',
 'self-coup',
 'disavowal',
 'inescapable',
 'pompous',
 'worrier',
 'distrust',
 'pig',
 'slower',
 'startling',
 'stodgy',
 'letch',
 'fiend',
 'onerously',
 'avalanche',
 'depravedly',
 'importunate',
 'mortified',
 'rhetoric',
 'shocked',
 'audacious',
 'barbarously',
 'upsettingly',
 'mistrust',
 'discourteous',
 'screw-up',
 'anomalous',
 'insinuate',
 'audaciously',
 'bragger',
 'wripped',
 'regressive',
 'accuse',
 'impractical',
 'recourses',
 'dilly-dally',
 'grating',
 'discontinued',
 'discord',
 'crashing',
 'frazzled',
 'improper',
 'ire',
 'mist

**removing words from positive and negative words which is present in the stop words**

In [80]:
pos -= combine
neg -= combine

print("Updated Positive Words:")
print(pos)

print("\nUpdated Negative Words:")
print(neg)


Updated Positive Words:
{'adoring', 'unbiased', 'reformed', 'gaining', 'well-behaved', 'impeccable', 'idolize', 'exultingly', 'cheer', 'effusive', 'prettily', 'thoughtfully', 'succeeding', 'fortunate', 'steadiness', 'unencumbered', 'eased', 'appealing', 'graciousness', 'jaw-droping', 'striving', 'ideal', 'nifty', 'solicitously', 'gratified', 'astounded', 'intriguing', 'statuesque', 'poise', 'novelty', 'extraordinarily', 'gladly', 'freshest', 'steadfastly', 'cheapest', 'aspire', 'attentive', 'convincing', 'cozy', 'amicable', 'elite', 'gratefully', 'generously', 'leads', 'examplary', 'rockstars', 'revolutionary', 'refund', 'snappy', 'recovery', 'achievible', 'celebrated', 'durable', 'overtake', 'commodious', 'dauntless', 'readily', 'revives', 'salutary', 'idol', 'eloquently', 'brilliances', 'openness', 'fabulous', 'intrigue', 'gusto', 'accurately', 'courageousness', 'straighten', 'acclamation', 'favorited', 'effusively', 'considerate', 'sumptuous', 'trusting', 'invincibility', 'fancier',

**defining separate functions to calculate positive score, negative score, polarity score,subjective score, average sentence length, percentage of complex words, Fog Index, average number of words per sentence, syllables per word, personal pronouns count, average word length,  total words**

In [83]:
import re
import os
import pandas as pd
import math

#positive score
def calculate_positive_score(text, pos):
    tokens = text.lower().split()
    positive_score = sum(1 for token in tokens if token in pos)
    return positive_score

#negative score
def calculate_negative_score(text, neg):
    tokens = text.lower().split()
    negative_score = sum(1 for token in tokens if token in neg)
    return negative_score
#polarity score
def calculate_polarity_score(positive_score, negative_score):
    return (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

#subjective score
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    return (positive_score + negative_score) / (total_words + 0.000001)

def calculate_readability_scores(text):
    sentences = re.split(r'\.', text)

    #average sentence length
    total_words = len(re.findall(r'\b\w+\b', text))
    average_sentence_length = total_words / len(sentences)
    
    #percentage of complex words
    complex_words = [word for word in re.findall(r'\b\w+\b', text) if len(word) > 6] 
    percentage_complex_words = len(complex_words) / total_words
    
    #Fog Index
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
    
    return average_sentence_length, percentage_complex_words, fog_index

def calculate_additional_scores(text):
    #average number of words per sentence
    total_sentences = len(re.split(r'\.', text))
    total_words = len(re.findall(r'\b\w+\b', text))
    average_words_per_sentence = (total_words / total_sentences) if total_sentences > 0 else 0
    
    #syllables per word
    syllables_per_word = sum(len(re.findall(r'[aeiouy]+', word)) for word in re.findall(r'\b\w+\b', text))
    
    #personal pronouns count
    personal_pronouns = re.findall(r'\b(?:i|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    personal_pronouns_count = len(personal_pronouns)
    
    #average word length
    total_characters = sum(len(word) for word in re.findall(r'\b\w+\b', text))
    average_word_length = total_characters / total_words if total_words > 0 else 0  # Avoid division by zero

    complex_words_count = len([word for word in re.findall(r'\b\w+\b', text) if len(re.findall(r'[aeiouy]+', word)) > 2])
    

    return average_words_per_sentence, syllables_per_word, personal_pronouns_count, average_word_length,total_words,complex_words_count

#root folder
root_folder = "D:/internship/output"

# Read the document with URL ID, URL, Positive Score, and Negative Score columns into a pandas DataFrame
document_path = "D:/internship/Output Data Structure.xlsx"
df = pd.read_excel(document_path)

# Initializing lists to store values
url_ids = []
positive_scores = []
negative_scores = []
polarity_scores = []
subjectivity_scores = []
url_ids_readability = []
average_sentence_lengths = []
percentage_complex_words_list = []
fog_indices = []
url_ids_additional = []
average_words_per_sentence_list = []
syllables_per_word_list = []
personal_pronouns_count_list = []
average_word_length_list = []
complex_words_count_list = []
word_count_list = []

# Iterate through each subfolder and calculate scores for each article
for folder_name in os.listdir(root_folder):
    folder_path = os.path.join(root_folder, folder_name)
    
    #extract numeric part from the folder name
    url_id_match = re.search(r'\d+', folder_name)
    if url_id_match:
        url_id = folder_name
    else:
        continue
    
    for article_name in os.listdir(folder_path):
        article_path = os.path.join(folder_path, article_name)
        
        with open(article_path, 'r', encoding='utf-8') as file:
            article_text = file.read()
            positive_score = calculate_positive_score(article_text, pos)
            negative_score = calculate_negative_score(article_text, neg)
            polarity_score = calculate_polarity_score(positive_score, negative_score)
            total_words = len(re.findall(r'\b\w+\b', article_text))
            subjectivity_score = calculate_subjectivity_score(positive_score, negative_score, total_words)
            average_sentence_length, percentage_complex_words, fog_index = calculate_readability_scores(article_text)
            average_words_per_sentence, syllables_per_word, personal_pronouns_count, average_word_length,word_count,complex_words_count = calculate_additional_scores(article_text)
            
            
            # Check for missing scores
            if pd.notna(positive_score) and pd.notna(negative_score):
                # Appending values to the lists
                url_ids.append(url_id)
                positive_scores.append(positive_score)
                negative_scores.append(negative_score)
                polarity_scores.append(polarity_score)
                subjectivity_scores.append(subjectivity_score)
                url_ids_readability.append(url_id)
                average_sentence_lengths.append(average_sentence_length)
                percentage_complex_words_list.append(percentage_complex_words)
                fog_indices.append(fog_index)
                url_ids_additional.append(url_id)
                average_words_per_sentence_list.append(average_words_per_sentence)
                syllables_per_word_list.append(syllables_per_word)
                personal_pronouns_count_list.append(personal_pronouns_count)
                average_word_length_list.append(average_word_length)
                complex_words_count_list.append(complex_words_count)
                word_count_list.append(word_count)


#new DataFrame for all scores from the lists
df_scores = pd.DataFrame({
    'URL_ID': url_ids,
    'POSITIVE SCORE': positive_scores,
    'NEGATIVE SCORE': negative_scores,
    'POLARITY SCORE': polarity_scores,
    'SUBJECTIVITY SCORE': subjectivity_scores,
    'URL_ID': url_ids_readability,
    'AVERAGE SENTENCE LENGTH': average_sentence_lengths,
    'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words_list,
    'FOG INDEX': fog_indices,
    'URL_ID': url_ids_additional,
    'AVERAGE WORDS PER SENTENCE': average_words_per_sentence_list,
    'SYLLABLES PER WORD': syllables_per_word_list,
    'PERSONAL PRONOUNS COUNT': personal_pronouns_count_list,
    'AVERAGE WORD LENGTH': average_word_length_list,
    'COMPLEX WORD COUNT': complex_words_count_list,
    'WORD COUNT': word_count_list
})

print(df_scores)


             URL_ID  POSITIVE SCORE  NEGATIVE SCORE  POLARITY SCORE  \
0   blackassign0001              10               4        0.428571   
1   blackassign0002              51              26        0.324675   
2   blackassign0003              38              23        0.245902   
3   blackassign0004              34              65       -0.313131   
4   blackassign0005              21              10        0.354839   
..              ...             ...             ...             ...   
95  blackassign0096              30              53       -0.277108   
96  blackassign0097              25              35       -0.166667   
97  blackassign0098               4               3        0.142857   
98  blackassign0099              16               5        0.523809   
99  blackassign0100              30              46       -0.210526   

    SUBJECTIVITY SCORE  AVERAGE SENTENCE LENGTH  PERCENTAGE OF COMPLEX WORDS  \
0             0.044728                10.433333                    

In [84]:
df= pd.read_excel("D:\internship\Output Data Structure.xlsx")
df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,,,,,,,,,,,,,
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,,,,,,,,,,,,,
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,,,,,,,,,,,,,
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,,,,,,,,,,,,,
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,,,,,,,,,,,,,
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,,,,,,,,,,,,,


In [85]:
df_scores.columns

Index(['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVERAGE SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVERAGE WORDS PER SENTENCE', 'SYLLABLES PER WORD',
       'PERSONAL PRONOUNS COUNT', 'AVERAGE WORD LENGTH', 'COMPLEX WORD COUNT',
       'WORD COUNT'],
      dtype='object')

In [86]:
df.columns

Index(['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')

In [87]:
col = {
    'AVERAGE WORDS PER SENTENCE': 'AVG NUMBER OF WORDS PER SENTENCE',
    'SYLLABLES PER WORD': 'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS COUNT': 'PERSONAL PRONOUNS',
    'AVERAGE WORD LENGTH': 'AVG WORD LENGTH',
    'AVERAGE SENTENCE LENGTH':'AVG SENTENCE LENGTH'
}

df_scores.rename(columns=col, inplace=True)

In [88]:
columns_to_drop = ['POSITIVE SCORE', 'NEGATIVE SCORE','POLARITY SCORE','SUBJECTIVITY SCORE','AVG SENTENCE LENGTH','PERCENTAGE OF COMPLEX WORDS','FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE',
                   'COMPLEX WORD COUNT','WORD COUNT','SYLLABLE PER WORD','PERSONAL PRONOUNS','AVG WORD LENGTH']

df = df.drop(columns=columns_to_drop)

In [89]:
df.columns

Index(['URL_ID', 'URL'], dtype='object')

In [90]:
df_scores.columns

Index(['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'SYLLABLE PER WORD',
       'PERSONAL PRONOUNS', 'AVG WORD LENGTH', 'COMPLEX WORD COUNT',
       'WORD COUNT'],
      dtype='object')

In [91]:
data = pd.merge(df, df_scores, on='URL_ID', how='inner')

In [92]:
data.columns

Index(['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'SYLLABLE PER WORD',
       'PERSONAL PRONOUNS', 'AVG WORD LENGTH', 'COMPLEX WORD COUNT',
       'WORD COUNT'],
      dtype='object')

***code to save the data frame into an excel file***

In [94]:
excel_file_path = 'D:\internship\Output Data Structure 2.xlsx'
data.to_excel(excel_file_path, index=False)
print(f"DataFrame has been saved to {excel_file_path}")

DataFrame has been saved to D:\internship\Output Data Structure 2.xlsx
