In [None]:
from google.colab import drive
drive.mount('/content/drive')

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on: 'punkt' is the tokenizer
# model behind word_tokenize, 'stopwords' provides the nltk.corpus.stopwords lists.
nltk.download('punkt')
nltk.download('stopwords')

import re

# Project folder on the mounted Drive. The trailing slash matters: file names
# are appended to it by plain string concatenation.
dp = "/content/drive/MyDrive/project/"
# Input sheet; the scraping cell reads its 'URL' and 'URL_ID' columns row by row.
df = pd.read_excel(dp + "Input.xlsx")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Scrape every article listed in the input sheet and store it as
# <URL_ID>.txt (title on the first line, body text after it).
#
# The folder must live under the Drive mount point used by drive.mount()
# (/content/drive): the analysis cells read the articles back from
# /content/drive/MyDrive/project/test/.
base_dir = "/content/drive/MyDrive/project"
test_folder = os.path.join(base_dir, "test")

# Create the test folder if it doesn't exist
os.makedirs(test_folder, exist_ok=True)

# Browser-like User-Agent sent with every request; built once, outside the loop.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}

# Seconds to wait for a server before giving up on a URL. Without a timeout a
# single unresponsive host would block the whole run indefinitely.
request_timeout = 30

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    url = row.get('URL')
    url_id = row.get('URL_ID')

    # Empty spreadsheet cells arrive as NaN, which is truthy, so a plain
    # `not url` test would let them through.
    if pd.isna(url) or pd.isna(url_id) or not url or not url_id:
        print("Missing URL or URL_ID for index:", index)
        continue

    # Make a request to the URL
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()  # raise exception for bad status codes
    except requests.exceptions.RequestException as e:
        print("Error fetching URL:", e)
        continue

    # Create a BeautifulSoup object
    try:
        soup = BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        print("Error creating BeautifulSoup object:", e)
        continue

    # Find title
    title = soup.find('h1')
    if not title:
        print("Title not found for URL_ID:", url_id)
        continue
    title_text = title.get_text()

    # Find text. Paragraphs are joined with a newline so the last word of one
    # paragraph is not glued to the first word of the next.
    article = "\n".join(p.get_text() for p in soup.find_all('p'))

    # Write title and text to the file. The encoding is explicit so the saved
    # text does not depend on the runtime's locale.
    file_name = os.path.join(test_folder, '{}.txt'.format(url_id))
    try:
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(title_text + '\n' + article)
        print("File saved:", file_name)  # Add logging for successful file save
    except (OSError, UnicodeError) as e:
        print("Error writing to file:", e)

File saved: /gdrive/MyDrive/project/test/blackassign0001.txt
File saved: /gdrive/MyDrive/project/test/blackassign0002.txt
File saved: /gdrive/MyDrive/project/test/blackassign0003.txt
File saved: /gdrive/MyDrive/project/test/blackassign0004.txt
File saved: /gdrive/MyDrive/project/test/blackassign0005.txt
File saved: /gdrive/MyDrive/project/test/blackassign0006.txt
File saved: /gdrive/MyDrive/project/test/blackassign0007.txt
File saved: /gdrive/MyDrive/project/test/blackassign0008.txt
File saved: /gdrive/MyDrive/project/test/blackassign0009.txt
File saved: /gdrive/MyDrive/project/test/blackassign0010.txt
File saved: /gdrive/MyDrive/project/test/blackassign0011.txt
File saved: /gdrive/MyDrive/project/test/blackassign0012.txt
File saved: /gdrive/MyDrive/project/test/blackassign0013.txt
File saved: /gdrive/MyDrive/project/test/blackassign0014.txt
File saved: /gdrive/MyDrive/project/test/blackassign0015.txt
File saved: /gdrive/MyDrive/project/test/blackassign0016.txt
File saved: /gdrive/MyDr

In [None]:
# Directories
base_dir = "/content/drive/MyDrive/project/stopwords/"
text_dir = "/content/drive/MyDrive/project/test/"
stopwords_dir = "/content/drive/MyDrive/project/stopwords/combined_text.txt"

# Encodings to try, in order. 'latin-1' and 'ISO-8859-1' name the same codec,
# which can decode any byte sequence, so it is the fallback after utf-8.
encodings = ['utf-8', 'latin-1', 'ISO-8859-1']

# List to store content of each stop-word file
all_text = []

# The combined file is written into base_dir itself, so it has to be skipped
# when scanning; otherwise every re-run would fold the previous output back in.
combined_name = os.path.basename(stopwords_dir)

# Load stop-word files from the base directory (sorted for a reproducible
# combined file; the order does not affect the resulting set).
for file_name in sorted(os.listdir(base_dir)):
    if not file_name.endswith('.txt') or file_name == combined_name:
        continue
    file_path = os.path.join(base_dir, file_name)
    text = None
    # Attempt to open the file with different encodings
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read()
            break  # Stop trying other encodings if successful
        except UnicodeDecodeError:
            print(f"Failed to decode {file_path} with encoding {encoding}")
    if text is not None:
        all_text.append(text)

# Combine text from all files into a single string
combined_text = '\n'.join(all_text)

# Save combined text to a new file. It is written and read back with the same
# explicit encoding, so non-ASCII entries survive the round trip.
combined_file_path = os.path.join(base_dir, "combined_text.txt")
with open(combined_file_path, 'w', encoding='utf-8') as combined_file:
    combined_file.write(combined_text)

# Load stop words from the combined file. Entries are stripped and lower-cased
# because the filter below compares them against word.lower().
stop_words = set()
with open(stopwords_dir, 'r', encoding='utf-8') as f:
    stop_words.update(
        line.strip().lower() for line in f.read().splitlines() if line.strip()
    )

# Load all text files from the text directory and store their filtered tokens
# in a list (docs). The list follows os.listdir(text_dir) order, the same
# listing the per-file metric loops iterate, so results line up by position.
docs = []
for text_file in os.listdir(text_dir):
    with open(os.path.join(text_dir, text_file), 'r', encoding='utf-8') as f:
        text = f.read()
    # Tokenize the text file
    words = word_tokenize(text)
    # Remove stop words from the tokens
    filtered_text = [word for word in words if word.lower() not in stop_words]
    # Add filtered tokens of each file into a list
    docs.append(filtered_text)

Failed to decode /content/drive/MyDrive/project/stopwords/StopWords_Currencies.txt with encoding utf-8


In [None]:
# Store positive and negative words from the directory
sentment_dir = "/content/drive/MyDrive/project/MasterDictionary/"
pos = set()
neg = set()

for filename in os.listdir(sentment_dir):
    with open(os.path.join(sentment_dir, filename), 'r', encoding='ISO-8859-1') as f:
        # Normalise entries the same way tokens are looked up (word.lower()).
        words = [line.strip().lower() for line in f.read().splitlines() if line.strip()]
    # Only 'positive-words.txt' feeds the positive set; every other file in
    # the folder is treated as negative words.
    if filename == 'positive-words.txt':
        pos.update(words)
    else:
        neg.update(words)

# Collect positive and negative words from each file and calculate scores
positive_words = []
negative_words = []
positive_score = []
negative_score = []
polarity_score = []
subjectivity_score = []

# Iterate through the list of docs
for doc in docs:
    positive_words.append([word for word in doc if word.lower() in pos])
    negative_words.append([word for word in doc if word.lower() in neg])
    positive_score.append(len(positive_words[-1]))
    negative_score.append(len(negative_words[-1]))
    denominator = len(doc) + 0.000001  # Small value added to avoid division by zero
    polarity_score.append((positive_score[-1] - negative_score[-1]) / denominator)
    subjectivity_score.append((positive_score[-1] + negative_score[-1]) / denominator)

# `stopwords` starts out as the nltk corpus reader imported at the top of the
# notebook and is rebound here to a plain set of English stop words. The guard
# keeps the cell re-runnable: on a second run the name already holds the set,
# which has no .words() method.
if not isinstance(stopwords, set):
    stopwords = set(stopwords.words('english'))

In [None]:
def measure(file, directory=None, stop_words=None):
    """Compute readability metrics for one text file.

    Parameters
    ----------
    file : str
        File name, resolved relative to ``directory``.
    directory : str, optional
        Folder containing ``file``. Defaults to the notebook-level ``text_dir``.
    stop_words : collection of str, optional
        Lower-case words to leave out of every count. Defaults to the
        notebook-level ``stopwords`` set.

    Returns
    -------
    tuple
        ``(avg_sentence_len, percent_complex_words, fog_index,
        complex_word_count, avg_syllable_word_count)``.
        ``percent_complex_words`` is a fraction of the word count (not
        multiplied by 100). Ratios whose denominator would be zero (for
        example an empty file) are reported as ``0.0`` instead of raising.
    """
    if directory is None:
        directory = text_dir
    if stop_words is None:
        stop_words = stopwords

    with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
        text = f.read()

    # Drop punctuation but keep periods, which delimit sentences.
    text = re.sub(r'[^\w\s.]', '', text)

    # split('.') yields an empty piece after the final period (and between
    # consecutive periods); only pieces with actual content are sentences.
    num_sentences = sum(1 for piece in text.split('.') if piece.strip())

    # Periods are removed before tokenising so that a token such as "end."
    # still matches the stop-word list and the 'es'/'ed' suffix rules below.
    # This yields the same tokens that cleaned_words() counts.
    words = [word for word in text.replace('.', '').split()
             if word.lower() not in stop_words]
    num_words = len(words)

    def vowel_count(token):
        # Vowel letters are used as the syllable approximation throughout.
        return sum(1 for letter in token if letter.lower() in 'aeiou')

    # A word is "complex" when it contains more than two vowels.
    complex_words = [word for word in words if vowel_count(word) > 2]

    syllable_count = 0
    syllable_words = []
    for word in words:
        # Trailing 'es' / 'ed' are not counted as syllables.
        if word.endswith('es') or word.endswith('ed'):
            word = word[:-2]
        syllable_count_word = vowel_count(word)
        if syllable_count_word >= 1:
            syllable_words.append(word)
            syllable_count += syllable_count_word

    avg_sentence_len = num_words / num_sentences if num_sentences else 0.0
    avg_syllable_word_count = syllable_count / len(syllable_words) if syllable_words else 0.0
    percent_complex_words = len(complex_words) / num_words if num_words else 0.0
    fog_index = 0.4 * (avg_sentence_len + percent_complex_words)

    return avg_sentence_len, percent_complex_words, fog_index, len(complex_words), avg_syllable_word_count

# Run measure() on every file in text_dir (os.listdir order) and split the
# returned 5-tuples into one list per metric.
readability_rows = []
for file in os.listdir(text_dir):
    readability_rows.append(measure(file))

avg_sentence_length = [metrics[0] for metrics in readability_rows]
percentage_of_complex_words = [metrics[1] for metrics in readability_rows]
fog_index = [metrics[2] for metrics in readability_rows]
complex_word_count = [metrics[3] for metrics in readability_rows]
avg_syllable_word_count = [metrics[4] for metrics in readability_rows]

def cleaned_words(file, directory=None, stop_words=None):
    """Count the non-stop-word tokens of a file and their average length.

    Parameters
    ----------
    file : str
        File name, resolved relative to ``directory``.
    directory : str, optional
        Folder containing ``file``. Defaults to the notebook-level ``text_dir``.
    stop_words : collection of str, optional
        Lower-case words to exclude. Defaults to the notebook-level
        ``stopwords`` set.

    Returns
    -------
    tuple
        ``(word_count, average_word_length)``. The average is ``0.0`` when no
        words remain after cleaning, rather than raising ZeroDivisionError.
    """
    if directory is None:
        directory = text_dir
    if stop_words is None:
        stop_words = stopwords

    with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
        text = f.read()

    # Remove everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    words = [word for word in text.split() if word.lower() not in stop_words]
    length = sum(len(word) for word in words)
    average_word_length = length / len(words) if words else 0.0
    return len(words), average_word_length

# Per-file word count and mean word length, in os.listdir(text_dir) order.
word_count, average_word_length = [], []

for file in os.listdir(text_dir):
    stats = cleaned_words(file)
    x, y = stats
    word_count += [x]
    average_word_length += [y]

# Matches the pronouns I, we, my, ours, us as whole words. Every pronoun except
# "I" is also accepted with a capital first letter, so sentence-initial "We" /
# "My" are counted. The all-caps "US" and a lowercase "i" (as in "i.e.") are
# deliberately not matched.
_PERSONAL_PRONOUN_RE = re.compile(r"\b(?:I|[Ww]e|[Mm]y|[Oo]urs|[Uu]s)\b")


def count_personal_pronouns(file, directory=None):
    """Count personal pronouns (I, we, my, ours, us) in a text file.

    Parameters
    ----------
    file : str
        File name, resolved relative to ``directory``.
    directory : str, optional
        Folder containing ``file``. Defaults to the notebook-level ``text_dir``.

    Returns
    -------
    int
        Number of whole-word pronoun occurrences in the file.
    """
    if directory is None:
        directory = text_dir
    with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
        text = f.read()
    return len(_PERSONAL_PRONOUN_RE.findall(text))

# Personal-pronoun count for every file, in os.listdir(text_dir) order.
pp_count = []

for file in os.listdir(text_dir):
    pp_count.append(count_personal_pronouns(file))

# Load the output template and fill its metric columns positionally.
output_df = pd.read_excel(dp + 'Output Data Structure.xlsx')

# Row labels 7, 20 and 107 are removed from the template.
# NOTE(review): presumably the rows whose articles could not be scraped - confirm.
indices_to_drop = [44-37, 57-37, 144-37]

# Check if the indices exist in the DataFrame before dropping
indices_to_drop = [idx for idx in indices_to_drop if idx in output_df.index]

# Drop the rows from the DataFrame (rebinding instead of mutating in place)
if indices_to_drop:
    output_df = output_df.drop(indices_to_drop, axis=0)

# One list per output column, in template column order starting at position 2.
# NOTE(review): avg_sentence_length appears twice (column positions 6 and 9);
# presumably the template has two sentence-length columns - confirm.
variables = [positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
             percentage_of_complex_words, fog_index, avg_sentence_length, complex_word_count, word_count,
             avg_syllable_word_count, pp_count, average_word_length]

for i, var in enumerate(variables):
    column_index = i + 2  # Adjust column index based on your DataFrame structure
    if len(var) != len(output_df):
        # A mismatch means the column stays empty in the output; say so
        # instead of silently writing an incomplete file.
        print("Skipping column {!r}: {} values for {} rows".format(
            output_df.columns[column_index], len(var), len(output_df)))
        continue
    output_df.iloc[:, column_index] = var

# index=False keeps the CSV to the template's columns; the default would add
# the (gapped, after the drop) row index as an extra unnamed first column.
output_df.to_csv('Output_Data.csv', index=False)