**Step 1: Importing Libraries**


In [None]:
import nltk
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import pandas as pd
import csv
import re

**Step 2: Read URLs from Input.xlsx (CSV export of Sheet1)**

In [None]:

# Load the article URLs from the CSV export of the input sheet.
# The first row is treated as a header and the URL is taken from the second column.
with open("/content/Input.xlsx - Sheet1.csv", newline="") as csv_file:
  reader = csv.reader(csv_file)
  next(reader, None)  # skip the header row; tolerate a completely empty file
  # Rows with fewer than two columns (e.g. blank lines) carry no URL and are skipped.
  urls = [row[1] for row in reader if len(row) > 1]

# Total number of URLs read.
n_urls = len(urls)
print("Number of URLS : ", n_urls)

# Print the URLs one per line (printing the list itself here would dump
# the whole list once for every URL).
for url in urls:
  print(url)


Number of URLS :  100
['https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/', 'https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/', 'https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/', 'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/', 'https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/', 'https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/', 'https://insights.blackcoffer.com/rise-of-cyber-crime-and-its-effects/', 'https://insights.blackcoffer.com/rise-of-internet-demand-and-its-impact-on-communications-and-alternatives-by-the-year-2035-2/', 'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-ef

**Step 3: Fetch and Parse Article Titles and Content from URLs**


In [None]:
def fetch_article(url, timeout=30):
    """Download one article page and extract its title and body text.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run;
            a timeout surfaces as a RequestException and is reported like
            any other request failure.

    Returns:
        A 2-tuple:
        * ``(title, text)`` on success, where ``text`` is the text of every
          ``<p>`` inside ``div.td-post-content`` joined with single spaces and
          ``title`` is the page ``<title>`` (or ``'No title found'``);
        * ``(None, None)`` when the page has no ``div.td-post-content``;
        * ``('Error fetching the URL', <error message>)`` when the request
          fails. Both elements are non-empty strings, so a plain truthiness
          check in the caller does not filter out failed downloads.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        parse = BeautifulSoup(response.content, 'html.parser')

        # Look the <title> tag up once and reuse the result.
        title_tag = parse.find('title')
        article_title = title_tag.get_text() if title_tag else 'No title found'

        article_content = parse.find('div', class_='td-post-content')
        if not article_content:
            print(f"No article content found for {url}")
            return None, None

        paragraphs = article_content.find_all('p')
        article_text = ' '.join(p.get_text() for p in paragraphs)
        return article_title, article_text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL {url}: {e}")
        return 'Error fetching the URL', str(e)



In [None]:
def clean_text(text):
    """Return ``text`` with every non-breaking space (U+00A0) turned into a plain space."""
    return text.replace('\xa0', ' ')

def file(urls, filename):
    """Fetch every URL and write the results to ``filename`` as numbered articles.

    Each stored article is written as ``"N. title"`` followed by a blank line,
    then ``"Content: text"`` followed by a blank line, with N counting from 1
    over the articles actually written.

    A result is kept when both values returned by ``fetch_article`` are
    truthy. The error tuple returned by ``fetch_article`` on a failed request
    consists of two non-empty strings, so failed downloads are written as
    well (with the error message as their content).
    """
    fetched = []
    for url in urls:
        article_title, article_content = fetch_article(url)
        if not (article_title and article_content):
            continue
        fetched.append((clean_text(article_title), clean_text(article_content)))

    # All downloads finish before the output file is opened (and truncated).
    with open(filename, 'w', encoding='utf-8') as out:
        for index, (title, content) in enumerate(fetched, start=1):
            out.write(f"{index}. {title}\n\n")
            out.write(f"Content: {content}\n\n")


# Scrape all URLs and store them in articles.txt (relative to the working directory).
# NOTE(review): later cells read "/content/articles.txt" - this presumably relies on
# the working directory being /content; confirm.
file(urls, 'articles.txt')




Error fetching the URL https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error fetching the URL https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


**Step 5: Sentiment Analysis**

**Removing Stop words**

In [None]:
import os
import nltk

# Directory holding the stop-word lists (one entry per line, *.txt files only).
stop_words = "/content/Stop words"
# Lower-cased, whitespace-stripped stop words collected from every list.
sw = set()


for filename in os.listdir(stop_words):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(stop_words, filename)
    # Read the whole file under one encoding before touching `sw`, so a file
    # that fails to decode as UTF-8 part-way through leaves no half-decoded
    # entries behind. The handle is deliberately not called `file`: that
    # name is the article-scraping function defined earlier in the notebook.
    try:
        with open(filepath, 'r', encoding='utf-8') as sw_file:
            sw_lines = sw_file.readlines()
    except UnicodeDecodeError:
        with open(filepath, 'r', encoding='latin-1') as sw_file:
            sw_lines = sw_file.readlines()
    sw.update(line.strip().lower() for line in sw_lines)






In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

articles_file = "/content/articles.txt"
output_file = "/content/filtered_articles.txt"

# Read every line of the scraped articles. The handles are named src/dst
# rather than `file` so the article-scraping function `file` defined earlier
# in the notebook is not overwritten when this cell runs.
with open(articles_file, 'r', encoding='utf-8') as src:
    lines = src.readlines()

# Tokenise each line, drop tokens whose lower-cased form is in `sw`, and write
# the remaining tokens (space-joined) as one output line per input line.
total_filtered_words = 0
with open(output_file, 'w', encoding='utf-8') as dst:
    for line in lines:
        filtered_words = [word for word in word_tokenize(line) if word.lower() not in sw]
        dst.write(' '.join(filtered_words) + '\n')
        total_filtered_words += len(filtered_words)

print("Total count of words after removing stop words and writing to the new file:", total_filtered_words)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total count of words after removing stop words and writing to the new file: 60724


**Step 6: Filtering out Positive and Negative Words**

In [None]:
# Locations of the positive / negative sentiment word lists.
positive_words_file = "/content/Master Dictionary/positive-words.txt"
negative_words_file = "/content/Master Dictionary/negative-words.txt"


# Encodings tried, in order, when reading a word list.
encodings = ['utf-8', 'latin-1']

def count_words(file_path):
    """Print how many whitespace-separated words ``file_path`` contains.

    The file is opened with each entry of ``encodings`` in turn; the first one
    that decodes without a UnicodeDecodeError is used. If none works, a
    failure message is printed instead. Nothing is returned.
    """
    for candidate in encodings:
        try:
            with open(file_path, 'r', encoding=candidate) as handle:
                total = len(handle.read().split())
        except UnicodeDecodeError:
            # This encoding cannot decode the file; try the next one.
            continue
        print("Number of words in", file_path, ":", total)
        return

    print("Unable to determine the number of words in", file_path, ". All encodings failed.")

# Report the size of each sentiment dictionary.
count_words(positive_words_file)
count_words(negative_words_file)



Number of words in /content/Master Dictionary/positive-words.txt : 2006
Number of words in /content/Master Dictionary/negative-words.txt : 4783


In [None]:
import os

# Folder with the sentiment dictionaries and the two word-list paths inside it.
master_dict= "/content/Master Dictionary"
pw_file = os.path.join(master_dict, "positive-words.txt")
nw_file = os.path.join(master_dict, "negative-words.txt")

# Placeholders; both names are reassigned from filter_words() further down in this cell.
positive_words = set()
negative_words = set()

def filter_words(file_path, stop_words):
    """Read a one-word-per-line list and return its words minus the stop words.

    Each line is stripped and lower-cased. Blank lines are ignored (they are
    not words), and any word contained in ``stop_words`` is dropped.

    The file is read as UTF-8; if that fails it is re-read as Latin-1. The
    whole file is decoded before any word is collected, so the result always
    comes from a single encoding rather than a mix of a partial UTF-8 pass
    and the Latin-1 retry.

    Args:
        file_path: path of the word list.
        stop_words: container supporting ``in`` with the words to exclude.

    Returns:
        set of lower-cased words.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            lines = handle.readlines()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin-1') as handle:
            lines = handle.readlines()

    candidates = (line.strip().lower() for line in lines)
    return {word for word in candidates if word and word not in stop_words}


def save_filtered_words(words, file_path, stop_words):
    """Write ``words`` to ``file_path`` in sorted order, one per line (UTF-8).

    Words contained in ``stop_words`` are left out. An existing file at
    ``file_path`` is overwritten.
    """
    kept = (entry + "\n" for entry in sorted(words) if entry not in stop_words)
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(kept)

# Build the stop-word-free, lower-cased sentiment dictionaries.
positive_words = filter_words(pw_file, sw)
negative_words = filter_words(nw_file, sw)

# NOTE: the filtered lists are written back to the SAME paths they were read
# from, so the original dictionary files are replaced by the lower-cased,
# sorted, UTF-8 versions. The sentiment-score cell below opens these two
# paths as UTF-8.
save_filtered_words(positive_words, pw_file, sw)
save_filtered_words(negative_words, nw_file, sw)




In [None]:
def filteredwords_count(words):
    """Return the number of entries in ``words``."""
    total = len(words)
    return total


# Sizes of the filtered dictionaries built in the previous cell.
total_positive_words = filteredwords_count(positive_words)
total_negative_words = filteredwords_count(negative_words)

print("Total number of words in filtered Positive Words:", total_positive_words)
print("Total number of words in filtered Negative Words:", total_negative_words)


Total number of words in filtered Positive Words: 1907
Total number of words in filtered Negative Words: 4693


**Sentiment Scores**

In [None]:
import openpyxl
from openpyxl import load_workbook
from textblob import TextBlob

def process_articles(articles_file, positive_words_file, negative_words_file, output_file):
    """Compute sentiment scores per article and store them in columns C-F.

    For every article the alphabetic words of its content are compared,
    case-insensitively, with the positive and negative word lists:

    * positive score    = number of words found in the positive list
    * negative score    = number of words found in the negative list
    * polarity score    = (pos - neg) / (pos + neg + 0.000001)
    * subjectivity score = (pos + neg) / (word count + 0.000001)

    Article parsing: a line starting with the next expected ``"N. "`` marker
    is a title; the following ``"Content: "`` line starts that article's
    text, and any further non-blank lines before the next title are appended
    to it (so text containing line breaks is not cut off after its first line).

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        positive_words_file: UTF-8 file, one positive word per line.
        negative_words_file: UTF-8 file, one negative word per line.
        output_file: .xlsx workbook; loaded if it exists, otherwise created
            with headers in C1:F1. Scores go to rows 2, 3, ... in article order.
    """
    # Sets give constant-time lookups; lower-casing both the lists and the
    # article words keeps capitalised words (e.g. at a sentence start) from
    # being missed.
    with open(positive_words_file, 'r', encoding='utf-8') as handle:
        positive_words = {line.strip().lower() for line in handle if line.strip()}

    with open(negative_words_file, 'r', encoding='utf-8') as handle:
        negative_words = {line.strip().lower() for line in handle if line.strip()}

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending sentiment scores to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Sentiment Scores'
        sheet['C1'] = 'Positive Score'
        sheet['D1'] = 'Negative Score'
        sheet['E1'] = 'Polarity Score'
        sheet['F1'] = 'Subjectivity Score'
        print(f'Creating new file: {output_file}')

    with open(articles_file, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()

    contents = []              # one text per article, in file order
    awaiting_content = False   # True between a title line and its "Content: " line
    for line in lines:
        if line.startswith(f"{len(contents) + 1}. "):
            awaiting_content = True
        elif awaiting_content and line.startswith("Content: "):
            contents.append(line[len("Content: "):].strip())
            awaiting_content = False
        elif contents and not awaiting_content and line.strip():
            # Continuation of the current article's text.
            contents[-1] += ' ' + line.strip()

    for row_idx, content in enumerate(contents, start=2):
        cleaned_words = [word.lower() for word in TextBlob(content).words if word.isalpha()]

        positive_score = sum(1 for word in cleaned_words if word in positive_words)
        negative_score = sum(1 for word in cleaned_words if word in negative_words)
        # The small constant keeps both ratios defined when a count is zero.
        polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
        subjectivity_score = (positive_score + negative_score) / (len(cleaned_words) + 0.000001)

        sheet[f'C{row_idx}'] = positive_score
        sheet[f'D{row_idx}'] = negative_score
        sheet[f'E{row_idx}'] = polarity_score
        sheet[f'F{row_idx}'] = subjectivity_score

    workbook.save(output_file)
    print(f'Sentiment scores have been saved to {output_file}')


# Inputs and output workbook for the sentiment scores.
# NOTE(review): this cell writes to "Output Data Structure (4).xlsx" while the
# readability cells below use differently named workbooks - confirm which file
# is meant to hold the combined results.
articles_file = "/content/articles.txt"
positive_words_file = "/content/Master Dictionary/positive-words.txt"
negative_words_file = "/content/Master Dictionary/negative-words.txt"
output_file = "/content/Output Data Structure (4).xlsx"


process_articles(articles_file, positive_words_file, negative_words_file, output_file)



Appending sentiment scores to existing file: /content/Output Data Structure (4).xlsx
Sentiment scores have been saved to /content/Output Data Structure (4).xlsx


**Step 7: Readability Analysis**


**Average Sentence Length**

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import openpyxl
from openpyxl import load_workbook
import nltk

nltk.download('punkt')

def calculate_average_sentence_length(text):
    """Return tokens per sentence for ``text``.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``;
    the result is token count divided by sentence count, or 0 when no
    sentence is found.
    """
    sentence_total = len(sent_tokenize(text))
    token_total = len(word_tokenize(text))

    if sentence_total <= 0:
        return 0
    return token_total / sentence_total

def process_articles_and_save_avg_length(articles_file, output_file):
    """Write each article's average sentence length into column G.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order.

    NOTE: a later cell of this notebook defines another function with this
    same name (writing column J); whichever definition ran last is the one
    the name refers to.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in G1.

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    # Report articles, not raw blank-line-separated segments.
    print(f"Number of articles found: {len(bodies)}")

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending average sentence length to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Readability Metrics'
        sheet['G1'] = 'Average Sentence Length'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        sheet.cell(row=row, column=7, value=calculate_average_sentence_length(text))

    try:
        workbook.save(output_file)
        print(f'Average sentence lengths have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the average sentence length of every article and store it in the workbook.
articles_file = "/content/articles.txt"
output_file_avg_length = "/content/Output Data Structure (4) (2).xlsx"

process_articles_and_save_avg_length(articles_file, output_file_avg_length)


Number of articles found: 201
Appending average sentence length to existing file: /content/Output Data Structure (4) (2).xlsx
Average sentence lengths have been saved to /content/Output Data Structure (4) (2).xlsx


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Complex Words**

In [None]:
from nltk.tokenize import word_tokenize
import openpyxl
from openpyxl import load_workbook
import nltk

nltk.download('punkt')

def calculate_percentage_complex_words(text):
    """Return the percentage of tokens in ``text`` that are complex words.

    A token is complex when the CMU Pronouncing Dictionary lists a
    pronunciation for it with more than two syllables. Syllables are counted
    as the phonemes whose last character is a digit (the stress-marked vowel
    sounds), the same rule ``count_complex_words`` in this notebook uses.
    Tokens missing from the dictionary are not counted as complex.

    Note that ``cmudict.dict()`` maps a word to a list of pronunciations, so
    the length of that list is the number of pronunciation variants, not the
    number of syllables.

    Returns 0 for text without tokens.
    """
    words = word_tokenize(text)
    num_words = len(words)
    if num_words == 0:
        return 0

    # Building the pronunciation dictionary is slow; build it once and keep it
    # on the function for later calls.
    pronunciations = getattr(calculate_percentage_complex_words, "_pronunciations", None)
    if pronunciations is None:
        try:
            pronunciations = nltk.corpus.cmudict.dict()
        except LookupError:
            # Corpus not installed yet (fresh runtime): fetch it and retry.
            nltk.download('cmudict')
            pronunciations = nltk.corpus.cmudict.dict()
        calculate_percentage_complex_words._pronunciations = pronunciations

    num_complex_words = 0
    for word in words:
        variants = pronunciations.get(word.lower())
        if variants and max(
            sum(1 for phoneme in variant if phoneme[-1].isdigit()) for variant in variants
        ) > 2:
            num_complex_words += 1

    return (num_complex_words / num_words) * 100

def process_articles_and_save_complex_words(articles_file, output_file):
    """Write each article's percentage of complex words into column H.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in H1.

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending complex word percentages to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Complex Word Percentages'
        sheet['H1'] = 'Percentage of Complex Words'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        sheet.cell(row=row, column=8, value=calculate_percentage_complex_words(text))

    try:
        workbook.save(output_file)
        print(f'Complex word percentages have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the percentage of complex words of every article and store it in the workbook.
articles_file = "/content/articles.txt"
output_file_complex_words = "/content/Output Data Structure (4) (2).xlsx"

process_articles_and_save_complex_words(articles_file, output_file_complex_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Appending complex word percentages to existing file: /content/Output Data Structure (4) (2).xlsx
Complex word percentages have been saved to /content/Output Data Structure (4) (2).xlsx


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import openpyxl
from openpyxl import load_workbook
import nltk

nltk.download('punkt')

def calculate_average_words_per_sentence(text):
    """Return the number of ``word_tokenize`` tokens per ``sent_tokenize`` sentence.

    Yields 0 when ``text`` contains no sentence.
    """
    sentence_count = len(sent_tokenize(text))
    word_count = len(word_tokenize(text))

    return word_count / sentence_count if sentence_count > 0 else 0

def process_articles_and_save_avg_length(articles_file, output_file):
    """Write each article's average number of words per sentence into column J.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order.

    NOTE: this function has the same name as the one defined in the
    "Average Sentence Length" cell (which writes column G) and replaces it
    once this cell has run.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in J1.

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending average words per sentence to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Readability Metrics'
        sheet['J1'] = 'Average Words Per Sentence'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        sheet.cell(row=row, column=10, value=calculate_average_words_per_sentence(text))

    try:
        workbook.save(output_file)
        print(f'Average words per sentence have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the average words per sentence of every article and store it in the workbook.
articles_file = "/content/articles.txt"
output_file_avg_length = "/content/Output Data Structure (4) (2).xlsx"

process_articles_and_save_avg_length(articles_file, output_file_avg_length)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Appending average words per sentence to existing file: /content/Output Data Structure (4) (2).xlsx
Average words per sentence have been saved to /content/Output Data Structure (4) (2).xlsx


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import openpyxl
from openpyxl import load_workbook
import nltk

nltk.download('punkt')

def calculate_metrics(text):
    """Return the Fog Index of ``text``.

    Fog Index = 0.4 * (average sentence length + percentage of complex words)

    * average sentence length = tokens / sentences (0 without sentences)
    * a token is complex when the CMU Pronouncing Dictionary lists a
      pronunciation for it with more than two syllables, counting the
      phonemes whose last character is a digit (the stress-marked vowel
      sounds) - the same rule ``count_complex_words`` in this notebook uses.
      ``cmudict.dict()`` maps a word to a list of pronunciations, so the
      length of that list is not a syllable count.

    Returns 0.0 for text without tokens instead of dividing by zero.
    """
    num_sentences = len(sent_tokenize(text))
    words = word_tokenize(text)
    num_words = len(words)

    if num_words == 0:
        return 0.0

    average_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # Building the pronunciation dictionary is slow; build it once and keep it
    # on the function for later calls.
    pronunciations = getattr(calculate_metrics, "_pronunciations", None)
    if pronunciations is None:
        try:
            pronunciations = nltk.corpus.cmudict.dict()
        except LookupError:
            # Corpus not installed yet (fresh runtime): fetch it and retry.
            nltk.download('cmudict')
            pronunciations = nltk.corpus.cmudict.dict()
        calculate_metrics._pronunciations = pronunciations

    num_complex_words = 0
    for word in words:
        variants = pronunciations.get(word.lower())
        if variants and max(
            sum(1 for phoneme in variant if phoneme[-1].isdigit()) for variant in variants
        ) > 2:
            num_complex_words += 1
    percentage_complex_words = (num_complex_words / num_words) * 100

    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    return fog_index

def process_articles_and_save_fog_index(articles_file, output_file):
    """Write each article's Fog Index into column I.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order. An article without any body text gets 0.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in I1.

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    # Report articles, not raw blank-line-separated segments.
    print(f"Number of articles found: {len(bodies)}")

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending Fog Index to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Readability Metrics'
        sheet['I1'] = 'Fog Index'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        # Skip the computation for empty text so an article without a body
        # cannot trigger a division by zero in the metric.
        fog_index = calculate_metrics(text) if text else 0
        sheet.cell(row=row, column=9, value=fog_index)

    try:
        workbook.save(output_file)
        print(f'Fog Index values have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the Fog Index of every article and store it in the workbook.
articles_file = "/content/articles.txt"
output_file_fog_index = "/content/Output Data Structure (4) (2).xlsx"

process_articles_and_save_fog_index(articles_file, output_file_fog_index)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Number of articles found: 201
Appending Fog Index to existing file: /content/Output Data Structure (4) (2).xlsx
Fog Index values have been saved to /content/Output Data Structure (4) (2).xlsx


In [None]:
from nltk.tokenize import word_tokenize
import openpyxl
from openpyxl import load_workbook
import nltk
from nltk.corpus import cmudict

# Make sure the tokenizer model and the CMU Pronouncing Dictionary are available,
# then load the dictionary once for use by count_complex_words().
nltk.download('punkt')
nltk.download('cmudict')
cmu_dict = cmudict.dict()

def count_complex_words(text):
    """Count the tokens of ``text`` that have more than two syllables.

    A token is looked up (lower-cased) in ``cmu_dict``; tokens that are not
    listed are ignored. For listed tokens the syllable count of a
    pronunciation is the number of phonemes whose last character is a digit,
    and the largest count over all listed pronunciations is compared with 2.
    """
    def syllables(pronunciation):
        return len([phoneme for phoneme in pronunciation if phoneme[-1].isdigit()])

    total = 0
    for token in word_tokenize(text):
        key = token.lower()
        if key not in cmu_dict:
            continue
        if max(syllables(variant) for variant in cmu_dict[key]) > 2:
            total += 1

    return total

def process_articles_and_save_complex_word_count(articles_file, output_file):
    """Write each article's number of complex words into column K.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in K1 (the column the values are written to).

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending Complex Word Count to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Readability Metrics'
        # Header belongs above the data column (column 11 = K), not in G1.
        sheet['K1'] = 'Complex Word Count'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        sheet.cell(row=row, column=11, value=count_complex_words(text))

    try:
        workbook.save(output_file)
        print(f'Complex Word Counts have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the complex-word count of every article.
# NOTE(review): unlike the other readability cells, this one reads a relative
# "articles.txt" and writes to its own workbook
# ("Output Data Structure - Complex Word Count.xlsx") instead of the shared
# "/content/Output Data Structure ..." file - confirm this is intended.
articles_file = "articles.txt"
output_file_complex_word_count = "Output Data Structure - Complex Word Count.xlsx"

process_articles_and_save_complex_word_count(articles_file, output_file_complex_word_count)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


{'Complex words count': 13506}


In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

import openpyxl
from openpyxl import load_workbook

import nltk
# Make sure the tokenizer model and NLTK's stop-word corpus are available.
nltk.download('punkt')
nltk.download('stopwords')

# NLTK's English stop words, used by clean_and_count_words().
# NOTE: this rebinds `stop_words`, which earlier in the notebook held the path
# of the custom stop-word directory; the custom list itself lives in `sw`.
stop_words = set(stopwords.words('english'))

def clean_and_count_words(text):
    """Count the tokens of ``text`` left after cleaning.

    Every ``word_tokenize`` token has leading and trailing punctuation
    (``string.punctuation``) stripped. Tokens that end up empty, or whose
    lower-cased form is in ``stop_words``, are not counted.
    """
    stripped_tokens = (token.strip(string.punctuation) for token in word_tokenize(text))
    return sum(
        1
        for token in stripped_tokens
        if token and token.lower() not in stop_words
    )

def process_articles_and_save_cleaned_word_count(articles_file, output_file):
    """Write each article's cleaned word count into column L.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in L1.

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending Cleaned Word Count to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Readability Metrics'
        sheet['L1'] = 'Cleaned Word Count'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        sheet.cell(row=row, column=12, value=clean_and_count_words(text))

    try:
        workbook.save(output_file)
        print(f'Cleaned Word Counts have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the cleaned word count of every article and store it in the workbook.
articles_file = "/content/articles.txt"
output_file_cleaned_word_count = "/content/Output Data Structure (4) (2).xlsx"

process_articles_and_save_cleaned_word_count(articles_file, output_file_cleaned_word_count)





Appending Cleaned Word Count to existing file: /content/Output Data Structure (4) (2).xlsx
Cleaned Word Counts have been saved to /content/Output Data Structure (4) (2).xlsx


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from nltk.tokenize import word_tokenize
import openpyxl
from openpyxl import load_workbook
import nltk

nltk.download('punkt')

def count_syllables(word):
    """Estimate the number of syllables in ``word`` (always at least 1).

    Heuristic, applied to the lower-cased word:
    * every run of consecutive vowels (a, e, i, o, u, y) counts as one syllable;
    * a trailing "es" or "ed" removes one;
    * a trailing "e" that is not part of "le" removes one;
    * the result is never less than 1.

    The previous-character check uses a boolean rather than an empty-string
    sentinel: ``'' in "aeiouy"`` is True in Python, which made the sentinel
    look like a vowel and silently dropped the syllable of any word starting
    with a vowel (and allowed negative results for words such as "ed").
    """
    word = word.lower()  # suffix rules must see the same case as the vowel scan
    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False

    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith(("es", "ed")):
        count -= 1

    if word.endswith("e") and not word.endswith("le"):
        count -= 1

    return max(count, 1)

def count_total_syllables(text):
    """Return the summed syllable estimate of all words in ``text``.

    Tokens without any letter (punctuation marks, bare numbers) are skipped:
    ``count_syllables`` reports a minimum of one syllable for whatever it is
    given, so every comma and full stop would otherwise add a syllable.
    """
    words = word_tokenize(text)
    total_syllables = sum(
        count_syllables(word)
        for word in words
        if any(char.isalpha() for char in word)
    )
    return total_syllables

def process_articles_and_save_syllable_count(articles_file, output_file):
    """Write each article's total syllable count into column M.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in M1.

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending Syllable Count to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Readability Metrics'
        sheet['M1'] = 'Syllable Count'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        sheet.cell(row=row, column=13, value=count_total_syllables(text))

    try:
        workbook.save(output_file)
        print(f'Syllable Counts have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the syllable count of every article and store it in the workbook.
articles_file = "/content/articles.txt"
output_file_syllable_count = "/content/Output Data Structure (4) (2).xlsx"

process_articles_and_save_syllable_count(articles_file, output_file_syllable_count)


Appending Syllable Count to existing file: /content/Output Data Structure (4) (2).xlsx
Syllable Counts have been saved to /content/Output Data Structure (4) (2).xlsx


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import re
import openpyxl
from openpyxl import load_workbook

def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is on whole words and ignores case, with one exception: the
    all-capitals form "US" is not counted, because in running text it is the
    country abbreviation rather than the pronoun.

    Returns:
        int: number of pronoun occurrences in ``text``.
    """
    pronouns = ["I", "we", "my", "ours", "us"]

    # \b on both sides restricts matches to whole words (e.g. not the "us" in "trust").
    pattern = r'\b(?:{})\b'.format('|'.join(pronouns))
    regex = re.compile(pattern, re.IGNORECASE)

    return sum(1 for match in regex.findall(text) if match != "US")

def process_articles_and_save_pronoun_count(articles_file, output_file):
    """Write each article's personal-pronoun count into column N.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in N1.

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending Personal Pronoun Counts to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Readability Metrics'
        sheet['N1'] = 'Personal Pronoun Count'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        sheet.cell(row=row, column=14, value=count_personal_pronouns(text))

    try:
        workbook.save(output_file)
        print(f'Personal Pronoun Counts have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the personal-pronoun count of every article and store it in the workbook.
# NOTE(review): the workbook name here ("... (4) (2) (18).xlsx") differs from the
# "... (4) (2).xlsx" file used by the preceding readability cells - confirm that
# all metrics are meant to end up in the same file.
articles_file = "/content/articles.txt"
output_file_pronoun_count = "/content/Output Data Structure (4) (2) (18).xlsx"

process_articles_and_save_pronoun_count(articles_file, output_file_pronoun_count)






Appending Personal Pronoun Counts to existing file: /content/Output Data Structure (4) (2) (18).xlsx
Personal Pronoun Counts have been saved to /content/Output Data Structure (4) (2) (18).xlsx


In [None]:
from nltk.tokenize import word_tokenize
import openpyxl
from openpyxl import load_workbook

def calculate_average_word_length(text):
    """Return the mean character length of the ``word_tokenize`` tokens of ``text``.

    Yields 0 when the text produces no tokens.
    """
    tokens = word_tokenize(text)
    if len(tokens) <= 0:
        return 0

    character_total = sum(map(len, tokens))
    return character_total / len(tokens)

def process_articles_and_save_avg_word_length(articles_file, output_file):
    """Write each article's average word length into column O.

    The articles file is split on blank lines. A segment starting with the
    next expected ``"N."`` marker is an article title; the segments after it,
    up to the next title, form that article's body. The metric is computed on
    the body with its leading ``"Content:"`` label removed and written to
    rows 2, 3, ... in article order.

    Args:
        articles_file: UTF-8 text file with the numbered articles.
        output_file: .xlsx workbook; loaded if present, otherwise created
            with a header in O1.

    A PermissionError while saving is printed instead of raised.
    """
    with open(articles_file, 'r', encoding='utf-8') as handle:
        segments = [segment.strip() for segment in handle.read().split('\n\n')]

    # Group the segments per article. Measuring the "N. title" segment itself
    # (as was done before) never looks at the article text.
    bodies = []
    for segment in segments:
        if segment.startswith(f"{len(bodies) + 1}."):
            bodies.append([])
        elif bodies and segment:
            bodies[-1].append(segment)

    try:
        workbook = load_workbook(output_file)
        sheet = workbook.active
        print(f'Appending Average Word Length to existing file: {output_file}')
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Readability Metrics'
        sheet['O1'] = 'Average Word Length'
        print(f'Creating new file: {output_file}')

    for row, parts in enumerate(bodies, start=2):
        text = '\n\n'.join(parts)
        if text.startswith("Content:"):
            text = text[len("Content:"):].lstrip()
        sheet.cell(row=row, column=15, value=calculate_average_word_length(text))

    try:
        workbook.save(output_file)
        print(f'Average Word Lengths have been saved to {output_file}')
    except PermissionError as e:
        print(f'Error: Could not save file {output_file}. Permission denied.')
        print(e)

# Compute the average word length of every article and store it in the workbook.
articles_file = "/content/articles.txt"
output_file_avg_word_length = "/content/Output Data Structure (4) (2) (18).xlsx"

process_articles_and_save_avg_word_length(articles_file, output_file_avg_word_length)


Appending Average Word Length to existing file: /content/Output Data Structure (4) (2) (18).xlsx
Average Word Lengths have been saved to /content/Output Data Structure (4) (2) (18).xlsx
