In [1]:
#importing necessary libraries
import os
import pandas as pd
from textblob import TextBlob
import syllables

In [2]:
# Helper that estimates how many syllables a single word contains
def count_syllables(word):
    """Return the syllable estimate that ``syllables.estimate`` gives for ``word``."""
    estimated_count = syllables.estimate(word)
    return estimated_count
# Function to perform textual analysis on the article text
def perform_textual_analysis(article_text):
    """Compute sentiment and readability metrics for one article.

    Parameters
    ----------
    article_text : str
        Raw text of the article.

    Returns
    -------
    tuple
        Thirteen values, in this order: positive_score, negative_score,
        polarity_score, subjectivity_score, avg_sentence_length,
        percentage_complex_words, fog_index, avg_words_per_sentence,
        complex_word_count, word_count, syllable_per_word,
        personal_pronouns, avg_word_length.

    Notes
    -----
    * Sentences are the non-empty fragments obtained by splitting on '.';
      words are the whitespace-separated tokens left after the periods are
      removed.
    * A word is "complex" when ``count_syllables`` reports 3 or more
      syllables for it.
    * Text without any words yields 0 for every ratio instead of raising
      ``ZeroDivisionError``.
    """
    blob = TextBlob(article_text)
    sentiment = blob.sentiment
    # NOTE(review): positive_score is the raw polarity and negative_score is
    # simply its negation, so positive_score can itself be negative --
    # confirm this is the intended definition of the two scores.
    positive_score = sentiment.polarity
    negative_score = -positive_score
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    # Split on periods, dropping blank fragments (e.g. the empty string that
    # follows the final '.') so they are not counted as sentences.
    sentences = [part for part in article_text.split('.') if part.strip()]
    words = article_text.replace('.', ' ').split()
    sentence_count = len(sentences)
    word_count = len(words)

    # Estimate syllables once per word and reuse the counts below.
    syllable_counts = [count_syllables(word) for word in words]
    complex_word_count = sum(1 for count in syllable_counts if count >= 3)
    personal_pronouns = sum(
        1 for word in words
        if word.lower() in ('i', 'me', 'my', 'mine', 'myself')
    )

    if word_count and sentence_count:
        avg_sentence_length = word_count / sentence_count
        percentage_complex_words = complex_word_count / word_count * 100
        syllable_per_word = sum(syllable_counts) / word_count
        avg_word_length = sum(len(word) for word in words) / word_count
    else:
        # No words at all: every ratio is defined as zero.
        avg_sentence_length = percentage_complex_words = 0.0
        syllable_per_word = avg_word_length = 0.0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    # Same quantity as avg_sentence_length; both are part of the output.
    avg_words_per_sentence = avg_sentence_length

    # Returning all the variables as a tuple
    return positive_score, negative_score, polarity_score, subjectivity_score, \
           avg_sentence_length, percentage_complex_words, fog_index, \
           avg_words_per_sentence, complex_word_count, word_count, \
           syllable_per_word, personal_pronouns, avg_word_length


In [3]:
# Folder that holds the extracted article texts
extracted_texts_folder = 'output_folder'
# Load the Excel sheet that defines the layout of the output
output_structure_file = 'Output Data Structure.xlsx'
output_df = pd.read_excel(output_structure_file)
# Empty dataframe with the same columns, ready to hold the computed variables
computed_variablesX_df = pd.DataFrame(columns=output_df.columns)

In [4]:
# Collect one record (dict) of computed variables per processed article
computed_variables_list = []
# Walk over every row of the output structure sheet
for index, row in output_df.iterrows():
    url_id = row['URL_ID']
    # The extracted text for a row is looked up as <extracted_texts_folder>/<URL_ID>.txt
    text_file_path = os.path.join(extracted_texts_folder, f'{url_id}.txt')
    if not os.path.exists(text_file_path):
        # No extracted text for this row: report it and move on to the next one
        print(f"Text file not found for URL_ID: {url_id}")
        continue
    with open(text_file_path, 'r', encoding='utf-8') as file:
        article_text = file.read()
    # Run the textual analysis; the result is a tuple of computed variables
    results = perform_textual_analysis(article_text)
    # Pair each computed value with the column names that follow the first two columns
    record = {'URL_ID': url_id, 'URL': row['URL']}
    record.update(zip(output_df.columns[2:], results))
    computed_variables_list.append(record)

In [5]:
# Build the output table directly from the collected records.
# Passing columns= keeps the column order of the output structure sheet and
# still yields an empty, correctly shaped table when no records were
# collected. This avoids concatenating an empty DataFrame, which recent
# pandas releases deprecate (FutureWarning) and which is unnecessary here.
computed_variablesX_df = pd.DataFrame(computed_variables_list, columns=output_df.columns)
# Save the computed variables to a new Excel file
output_file = 'Computed_VariablesX_Output.xlsx'
computed_variablesX_df.to_excel(output_file, index=False)
print(f"Textual analysis complete. Computed variables saved to {output_file}")

Textual analysis complete. Computed variables saved to Computed_VariablesX_Output.xlsx


### Note:- 
    - Two functions are defined: count_syllables, which estimates the number of syllables in a word, and perform_textual_analysis, which computes the sentiment and readability variables for an article text.
    - perform_textual_analysis returns the computed variables as a tuple.
    - The notebook then reads the Excel file to obtain the structure (columns) of the output dataframe.
    - A for loop iterates through each row of the output dataframe and analyses the matching text file.
    The results are stored in a new dataframe called computed_variablesX_df, which is then saved to a new Excel file.
 - How to Run:- Place the script in the same folder as "Output Data Structure.xlsx" and the output_folder directory containing the extracted text files.
    Run the objective-2(data analysis).py script in Jupyter Notebook or VS Code.
 - Install the pandas, textblob and syllables packages before running.