In [39]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import syllable

In [40]:
# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
!pip install syllapy

Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [60]:
import syllapy

In [61]:
# Function to extract article text from URL
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('article')
    if article:
        title = article.find('h1').get_text().strip()
        text = ' '.join([p.get_text().strip() for p in article.find_all('p')])
        return title, text
    else:
        return None, None


# Function to calculate variables
def calculate_variables(text):
    # Tokenization
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    # Stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # POS Tagging
    pos_tags = nltk.pos_tag(filtered_words)
    personal_pronouns = sum(1 for word, pos in pos_tags if pos == 'PRP')

    # Complexity
    complex_words = [word for word in filtered_words if len(word) > 2 and len(set(word)) > 2]
    avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)
    percentage_complex_words = (len(complex_words) / len(filtered_words)) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Syllable count using syllapy library
    total_syllables = sum(syllapy.count(word) for word in filtered_words)

    # Variables
    word_count = len(filtered_words)
    avg_word_length = sum(len(word) for word in filtered_words) / word_count
    avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)

    return personal_pronouns, avg_sentence_length, percentage_complex_words, fog_index, word_count, total_syllables, avg_word_length



In [62]:
# Read input file
input_data = pd.read_excel('inputb.xlsx')


In [64]:
# Iterate over each URL
output_data = []
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    title, text = extract_text_from_url(url)
    if text:
        personal_pronouns, avg_sentence_length, percentage_complex_words, fog_index, word_count, syllable_count, avg_word_length = calculate_variables(text)
        output_data.append([url_id, title, personal_pronouns, avg_sentence_length, percentage_complex_words, fog_index, word_count, syllable_count, avg_word_length])


In [65]:
# Create DataFrame for output data
output_df = pd.DataFrame(output_data, columns=['URL_ID', 'Title', 'Personal_Pronouns', 'Avg_Sentence_Length', 'Percentage_Complex_Words', 'Fog_Index', 'Word_Count', 'Syllable_Count', 'Avg_Word_Length'])


In [66]:
# Write output DataFrame to Excel file
output_df.to_excel('output.xlsx', index=False)

In [67]:
import pandas as pd

# Read the output data from the Excel file
output_data = pd.read_excel('output.xlsx')

# Display the output data
output_data


Unnamed: 0,URL_ID,Title,Personal_Pronouns,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Word_Count,Syllable_Count,Avg_Word_Length
0,blackassign0001,Rising IT cities and its impact on the economy...,0,15.541667,98.863636,45.762121,176,362,6.221591
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,3,20.909091,98.671498,47.832235,828,1977,7.143720
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...",1,21.535714,99.534884,48.428239,645,1760,8.009302
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,0,23.568627,99.544073,49.245080,658,1684,7.767477
4,blackassign0005,OTT platform and its impact on the entertainme...,0,19.789474,94.402036,45.676604,393,892,7.094148
...,...,...,...,...,...,...,...,...,...
93,blackassign0096,Due to the COVID-19 the repercussion of the en...,1,24.100000,98.715891,49.126356,623,1418,6.963082
94,blackassign0097,Impact of COVID-19 pandemic on office space an...,1,32.270270,98.821218,52.436595,509,1042,6.400786
95,blackassign0098,Contribution of handicrafts (Visual Arts & Lit...,0,32.200000,100.000000,52.880000,92,195,6.500000
96,blackassign0099,How COVID-19 is impacting payment preferences?,0,22.185185,96.938776,47.649584,294,566,6.078231


In [81]:
# Provide instructions for running the .py file
instructions = """
Instructions:
1. Ensure Python is installed on your system.
2. Install required libraries by running: pip install pandas beautifulsoup4 requests nltk syllapy
3. Place the input.xlsx file containing URLs in the same directory as this script.
4. Run this script using Python: python text_analysis.py
5. After execution, the output will be saved in output.xlsx file.
"""

# Save instructions to a text file
with open('instructions.txt', 'w') as file:
    file.write(instructions)