In [1]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import string
# The analysis below calls nltk.word_tokenize / nltk.sent_tokenize, which need
# the Punkt tokenizer models in addition to the stop word corpus; fetch both so
# the notebook also runs on a machine where they are not installed yet.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alasp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Folder that receives one text file per scraped article
article_dir = 'All_Articles'
os.makedirs(article_dir, exist_ok=True)

# Load the spreadsheet listing the URL_ID / URL pairs to scrape
df_Input = pd.read_excel('Input.xlsx')

# Function to extract and save the articles

def Extract_Article(url, url_id, timeout=30):
    """Download one web page and save its title and article text to a file.

    The text is written to ``<article_dir>/<url_id>.txt`` (title on the first
    line, followed by a blank line and the article body). Failures are
    reported with ``print`` and never raise, so one bad URL does not stop a
    loop over many URLs.

    Parameters
    ----------
    url : str
        Address of the page to download.
    url_id
        Identifier used as the output file name (formatted with ``str``).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
    """
    # Send an HTTP request to the URL; without a timeout a stalled server
    # would block the notebook forever.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print('Failed to retrieve the web page. Error:', error)
        return

    # Check if the request was successful before spending time on parsing
    if response.status_code != 200:
        print('Failed to retrieve the web page. Status code:', response.status_code)
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title
    title = soup.title.text if soup.title is not None else "Title not found"

    # The main text is the container that holds the first <p> of <article>.
    # Each lookup can return None, so walk the chain step by step instead of
    # chaining attribute access (which would raise AttributeError).
    article_tag = soup.article
    first_paragraph = article_tag.p if article_tag is not None else None
    article = first_paragraph.parent if first_paragraph is not None else None

    if article is None:
        print('Main article not found on the page.')
        return

    # Remove footer
    for footer in article.find_all(class_="wp-block-preformatted"):
        footer.extract()

    # Extract the text from the article
    main_article = article.get_text(' ', strip=True)

    # Create a text file with the url_id as the file name
    output_file = os.path.join(article_dir, f'{url_id}.txt')

    # Save the extracted title and text in the text file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(title + '.\n\n')
        file.write(main_article)

    print(f'Saved: {output_file}')  # Reached only when the article was extracted and saved
   

In [4]:
# Scrape every (URL_ID, URL) pair listed in the input spreadsheet

for record in df_Input[['URL_ID', 'URL']].itertuples(index=False):
    Extract_Article(record.URL, record.URL_ID)

print('Extraction Completed.')

Saved: All_Articles\123.0.txt
Saved: All_Articles\321.0.txt
Saved: All_Articles\2345.0.txt
Saved: All_Articles\4321.0.txt
Saved: All_Articles\432.0.txt
Saved: All_Articles\2893.8.txt
Saved: All_Articles\3355.6.txt
Saved: All_Articles\3817.4.txt
Saved: All_Articles\4279.2.txt
Saved: All_Articles\4741.0.txt
Saved: All_Articles\5202.8.txt
Saved: All_Articles\5664.6.txt
Saved: All_Articles\6126.4.txt
Saved: All_Articles\6588.2.txt
Saved: All_Articles\7050.0.txt
Saved: All_Articles\7511.8.txt
Saved: All_Articles\7973.6.txt
Saved: All_Articles\8435.4.txt
Saved: All_Articles\8897.2.txt
Saved: All_Articles\9359.0.txt
Saved: All_Articles\9820.8.txt
Saved: All_Articles\10282.6.txt
Saved: All_Articles\10744.4.txt
Saved: All_Articles\11206.2.txt
Failed to retrieve the web page. Status code: 404
Saved: All_Articles\12129.8.txt
Saved: All_Articles\12591.6.txt
Saved: All_Articles\13053.4.txt
Saved: All_Articles\13515.2.txt
Saved: All_Articles\13977.0.txt
Saved: All_Articles\14438.8.txt
Saved: All_Art

In [5]:
# Load the custom stop words and the positive / negative sentiment dictionaries

def _load_word_set(path, delimiter=None):
    """Read one word per line from ``path`` and return the words as a set.

    When ``delimiter`` is given, only the text before its first occurrence on
    each line is kept (some stop word files carry extra information after a
    pipe that is not part of the word). Blank entries are skipped.
    """
    words = set()
    # NOTE(review): the file is opened with the platform default encoding, as
    # before; confirm the real encoding of the dictionaries and pass it
    # explicitly so the notebook behaves the same on every operating system.
    with open(path, 'r') as handle:
        for line in handle:
            if delimiter is not None:
                line = line.split(delimiter)[0]
            word = line.strip()
            if word:                       # Skip empty lines / empty entries
                words.add(word)
    return words


# Get all the stop word files in a list. The dot is escaped and the whole
# name must match, so unrelated files are not picked up.
all_files = os.listdir('StopWords')

stop_word_files = [name for name in all_files if re.fullmatch(r'StopWords_\w+\.txt', name)]

# Collect all stop words from all stop word text files in a set.
# Some text files contain extra info after the pipe (|) character which is not
# a stop word, so only the part before the pipe is kept.
stop_words = set()

for stop_file in stop_word_files:
    stop_words |= _load_word_set(os.path.join('StopWords', stop_file), delimiter='|')

# Upper-cased copy used for case-insensitive stop word filtering
stop_words_upper = {word.upper() for word in stop_words}


# Extract and collect all positive words in a set.
positive_words = _load_word_set(os.path.join('MasterDictionary', 'positive-words.txt'))

# Extract and collect all negative words in a set.
negative_words = _load_word_set(os.path.join('MasterDictionary', 'negative-words.txt'))

In [6]:
# Main function that performs all assignment operations

_VOWELS = frozenset('aeiou')

# Whole-word match that does not consume the surrounding whitespace, so
# adjacent pronouns and pronouns next to punctuation are all found. The
# trailing lookahead skips dotted abbreviations such as "i.e.".
_PERSONAL_PRONOUN_PATTERN = re.compile(r'\b(I|we|my|ours|us)\b(?!\.\w)', flags=re.IGNORECASE)


def _is_punctuation(token):
    """Return True when every character of ``token`` is ASCII punctuation."""
    return bool(token) and all(char in string.punctuation for char in token)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _syllable_count(word):
    """Count the vowels in ``word``, minus one when it ends in "es" or "ed"."""
    lowered = word.lower()
    count = sum(1 for char in lowered if char in _VOWELS)
    if lowered.endswith(('es', 'ed')):
        count -= 1
    return count


def _sentiment_counts(tokens):
    """Return ``(positive, negative)`` dictionary hits for ``tokens``, ignoring case."""
    positive = 0
    negative = 0
    for token in tokens:
        lowered = token.lower()
        if token in positive_words or lowered in positive_words:
            positive += 1
        elif token in negative_words or lowered in negative_words:
            negative += 1
    return positive, negative


def _count_personal_pronouns(text):
    """Count I / we / my / ours / us in ``text``, skipping the upper-case "US"."""
    return sum(1 for match in _PERSONAL_PRONOUN_PATTERN.findall(text) if match != 'US')


def main(file, df):
    """Analyse one saved article and write its metrics into ``df`` in place.

    Parameters
    ----------
    file : str
        Name of a text file inside ``All_Articles``; the part before the
        ``.txt`` extension must be the numeric URL_ID (e.g. ``'123.0.txt'``).
    df : pandas.DataFrame
        Output table. Every row whose ``URL_ID`` equals the ID parsed from the
        file name gets its sentiment and readability columns filled in. When
        no row matches, ``df`` is left unchanged.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the file name does not contain a numeric URL_ID.
    """
    # Take the ID from the file name itself rather than from the repr of an
    # open file handle.
    stem = os.path.splitext(os.path.basename(file))[0]
    try:
        article_id = float(stem)
    except ValueError:
        raise ValueError(f'Cannot read a numeric URL_ID from file name {file!r}') from None

    # Read the text file
    with open(os.path.join('All_Articles', file), 'r', encoding='utf-8') as handle:
        main_text = handle.read()

    # Tokenize once and reuse the result for every metric
    tokens = nltk.word_tokenize(main_text)
    sentences = nltk.sent_tokenize(main_text)

    words_without_punctuations = [token for token in tokens if not _is_punctuation(token)]
    total_words_without_punctuations = len(words_without_punctuations)

    # Sentiment analysis on the words left after removing the custom stop
    # words (punctuation tokens are not words, so they are excluded as well)
    cleaned_text = [
        token for token in words_without_punctuations
        if token.upper() not in stop_words_upper
    ]
    positive_score, negative_score = _sentiment_counts(cleaned_text)

    # The small constant keeps the ratios defined when there are no hits / words
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_text) + 0.000001)

    # Analysis of readability; an empty article yields 0 instead of a
    # ZeroDivisionError
    average_sentence_length = _safe_ratio(total_words_without_punctuations, len(sentences))

    # A word is "complex" when it has more than two syllables
    total_complex_words = sum(
        1 for token in words_without_punctuations if _syllable_count(token) > 2
    )
    percentage_of_complex_words = _safe_ratio(total_complex_words, total_words_without_punctuations) * 100

    Fog_Index = 0.4 * (average_sentence_length + percentage_of_complex_words)

    # Word count: words left after removing NLTK stop words and punctuation
    nltk_stop_words = set(stopwords.words('english'))
    word_count = sum(
        1 for token in words_without_punctuations if token.lower() not in nltk_stop_words
    )

    # Syllable count per word
    total_syllables = sum(_syllable_count(token) for token in words_without_punctuations)
    syllable_count_per_word = _safe_ratio(total_syllables, total_words_without_punctuations)

    # Personal pronouns
    total_personal_pronouns = _count_personal_pronouns(main_text)

    # Average word length
    total_char = sum(len(token) for token in words_without_punctuations)
    average_word_length = _safe_ratio(total_char, total_words_without_punctuations)

    # Update the output table with our results
    results = {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': average_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_of_complex_words,
        'FOG INDEX': Fog_Index,
        'AVG NUMBER OF WORDS PER SENTENCE': average_sentence_length,
        'COMPLEX WORD COUNT': total_complex_words,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_count_per_word,
        'PERSONAL PRONOUNS': total_personal_pronouns,
        'AVG WORD LENGTH': average_word_length,
    }

    row_mask = df['URL_ID'] == article_id
    for column, value in results.items():
        df.loc[row_mask, column] = value


In [7]:
# Access this Directory 
all_docs = os.listdir('All_Articles')

# Collect all article text files in a list
docs = []
for file in all_docs:
    if file.endswith('.txt'):
        docs.append(file)

# Read the Output Data Structure file
df = pd.read_excel('Output Data Structure.xlsx')

# Iterate over each article for analysis
for file in docs:
    main(file, df)



In [8]:
# Articles that could not be downloaded (2 URLs are not working) have no
# metrics, so their empty cells are replaced with -1.
df.fillna(value=-1, inplace=True)

# The Excel file must be closed in other programs before it is overwritten.
df.to_excel('Output Data Structure.xlsx', index=False)

print('Analysis Completed')

Analysis Completed
