## Data loading

In [1]:
import pandas as pd

# Read the input Excel file; later cells use its URL_ID and URL columns.
# NOTE(review): the path contains a doubled slash ("MyDrive//black_offer") —
# confirm the folder name is spelled as intended.
file_path = '/content/drive/MyDrive//black_offer/Input.xlsx'
df = pd.read_excel(file_path)

# Display the dataframe to understand its structure
df.head()


Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


## Data extraction

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
def extract_article_text(url, timeout=30):
    """Download an article page and return its title and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The text of the first ``<h1>`` (an empty string when the page has
        none), a blank line, and the text of every ``<p>`` inside the
        ``td-post-content`` div joined by newlines.
        ``None`` when the request fails, the response status is not 200, or
        the page has no ``td-post-content`` div.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network failures the same way as a bad status code, so one
        # unreachable URL does not abort a loop over many URLs.
        return None

    if response.status_code != 200:
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look the heading up once and reuse it
    heading = soup.find('h1')
    title = heading.get_text() if heading else ''

    article_body = soup.find('div', class_='td-post-content')
    if not article_body:
        return None

    paragraphs = article_body.find_all('p')
    article_text = '\n'.join(para.get_text() for para in paragraphs)

    return title + '\n\n' + article_text



In [4]:
# Example usage for a single URL: print the extracted text (None is printed
# when extraction fails) as a spot check before processing every row.
url = 'https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/'
print(extract_article_text(url))

Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040.

We have seen a huge development and dependence of people on technology in recent years. We have also seen the development of AI and ChatGPT in recent years. So it is a normal thing that we will become fully dependent on technology by 2040. Information technology will be a major power for all the developing nations. As a member of a developing nation, India is rapidly growing its IT base. It has also grown some IT cities which will be the major control centres for Information technology by 2040.
Rising IT cities
Kolkata:- Kolkata in West Bengal is an emerging major IT hub. The new Kolkata i.e. Saltlake Sector  5, New town, Rajarhat area of Kolkata is a major IT hub. The government is giving the software companies land at almost free of cost to set up the companies there. Many large companies like Google, Microsoft, IBM, Infosys and others have set up their companies here. Kolkat

In [5]:
import os

# Create an output directory if it doesn't exist; exist_ok=True keeps this
# cell safe to re-run. Later cells write and read <URL_ID>.txt files here.
output_dir = '/content/drive/MyDrive/articles'

os.makedirs(output_dir, exist_ok=True)



In [6]:
# Fetch every article listed in the input sheet and store it as <URL_ID>.txt
for url_id, url in zip(df['URL_ID'], df['URL']):
    article_text = extract_article_text(url)
    file_path = os.path.join(output_dir, f'{url_id}.txt')

    # A failed extraction (None or empty text) still produces an empty file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(article_text or '')




## Analysis

In [7]:
!pip install syllapy

Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2


In [14]:
!pip install textblob



In [16]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import syllapy
from textblob import TextBlob # Import TextBlob

# Download NLTK data
nltk.download('punkt')
# Newer NLTK releases load the sentence/word tokenizer data from 'punkt_tab'
# rather than 'punkt'; fetching both keeps the tokenizers working either way.
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.corpus import stopwords


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Load the English stop words into a set; clean_text() does a membership
# test against it for every token.
stop_words = set(stopwords.words('english'))

# Cleaning text using stop words
def clean_text(text):
    """Tokenize ``text`` and keep only alphabetic tokens that are not stop words.

    Tokens keep their original order and casing; only the stop-word
    comparison is done on the lower-cased token.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Custom function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Each run of consecutive vowels (a, e, i, o, u) counts as one syllable.
    One is subtracted when the word ends in "es" or "ed", and any non-empty
    word is reported as having at least one syllable.

    Args:
        word: The word to examine; casing is ignored.

    Returns:
        The estimated syllable count, or 0 for an empty string.
    """
    if not word:
        # Nothing to inspect; also avoids indexing into an empty string.
        return 0

    word = word.lower()
    vowels = "aeiou"

    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    if word.endswith(("es", "ed")):
        count -= 1

    return max(count, 1)

# Function to calculate the required textual analysis metrics
def analyze_text(text):
    """Compute sentiment and readability metrics for one article.

    Args:
        text: Raw article text.

    Returns:
        A list of 13 values, in this order: positive score, negative score,
        polarity score, subjectivity score, average sentence length,
        percentage of complex words, fog index, average words per sentence,
        complex word count, word count, syllables per word, personal pronoun
        count, average word length. Ratios whose denominator would be zero
        (no cleaned words, no sentences) are reported as 0.
    """
    cleaned_words = clean_text(text)
    word_count = len(cleaned_words)

    # Score every cleaned word once with TextBlob and reuse the result for
    # both the positive and the negative tally.
    polarities = [TextBlob(word).sentiment.polarity for word in cleaned_words]
    positive_score = sum(1 for polarity in polarities if polarity > 0)
    negative_score = sum(1 for polarity in polarities if polarity < 0)

    # The small constant keeps the divisions defined when a count is zero
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Average sentence length counts every token word_tokenize returns for a
    # sentence, not only the cleaned words.
    sentences = sent_tokenize(text)
    if sentences:
        avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)
    else:
        avg_sentence_length = 0

    # Count syllables once per word; a word with three or more is "complex"
    syllable_counts = [count_syllables(word) for word in cleaned_words]
    complex_word_count = sum(1 for syllables in syllable_counts if syllables >= 3)
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count else 0

    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Reported as a separate metric but identical to the average sentence length
    avg_words_per_sentence = avg_sentence_length

    syllable_per_word = sum(syllable_counts) / word_count if word_count else 0

    # Pronouns are matched case-insensitively, but the all-caps form "US" is
    # skipped: it is normally the country abbreviation, not the pronoun "us".
    pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')

    # Calculate average word length
    avg_word_length = sum(len(word) for word in cleaned_words) / word_count if word_count else 0

    return [
        positive_score, negative_score, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex_words, fog_index,
        avg_words_per_sentence, complex_word_count, word_count,
        syllable_per_word, personal_pronouns, avg_word_length
    ]


In [18]:
# Collect one metrics row per article file that exists on disk
results = []

for url_id, url in zip(df['URL_ID'], df['URL']):
    file_path = os.path.join(output_dir, f'{url_id}.txt')
    if not os.path.exists(file_path):
        # No saved article for this row, so it gets no entry in the results
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    analysis_results = analyze_text(text)
    results.append([url_id, url, *analysis_results])

# Column headers, in the same order as the values in each result row
columns = [
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]
results_df = pd.DataFrame(results, columns=columns)



In [19]:
# Show the metrics table as the cell output
results_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,21,2,0.826087,0.127072,15.800000,25.414365,16.485746,15.800000,46,181,2.055249,4,6.320442
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,59,21,0.475000,0.097324,21.155844,37.591241,23.498834,21.155844,309,822,2.335766,6,7.216545
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,33,12,0.466667,0.069876,21.803571,51.086957,29.156211,21.803571,329,644,2.678571,13,8.072981
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,32,22,0.185185,0.083077,23.745098,46.307692,28.021116,23.745098,301,650,2.513846,5,7.835385
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,17,1,0.888889,0.045570,19.589744,32.658228,20.899189,19.589744,129,395,2.230380,6,7.129114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,43,25,0.264706,0.110211,24.280000,36.628849,24.363540,24.280000,226,617,2.243112,4,7.025932
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,24,24,0.000000,0.094303,31.710526,23.772102,22.193051,31.710526,121,509,2.033399,7,6.449902
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,2,0,1.000000,0.020202,34.800000,28.282828,25.233131,34.800000,28,99,2.010101,0,6.606061
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,15,3,0.666667,0.062069,21.642857,27.241379,19.553695,21.642857,79,290,2.093103,4,6.213793


In [20]:
# Save the results to an Excel file; index=False leaves the DataFrame's row
# index out of the sheet.
output_file_path = '/content/drive/MyDrive/output.xlsx'
results_df.to_excel(output_file_path, index=False)
