In [1]:
import pandas as pd  # for data preprocessing and read csv files
import numpy as np  # for numerical computation and manipulation 

from bs4 import BeautifulSoup   # Importing BeautifulSoup from the bs4 library for parsing HTML and XML documents.
import requests        # Importing the requests library for making HTTP requests to retrieve web content.

import os       # Importing the os library for interacting with the operating system, such as file and directory operations.
import nltk     # Importing the nltk library Natural Language Toolkit, a powerful tool for working with human language data (text) in Python.
from nltk.tokenize import RegexpTokenizer, sent_tokenize     # Importing RegexpTokenizer from nltk.tokenize to tokenize text using regular expressions.
                                    # This is useful for creating custom tokenization patterns, such as splitting text by specific characters or patterns.

In [2]:
# Google Sheets link
sheet_url = "https://docs.google.com/spreadsheets/d/1D7QkDHxUSKnQhR--q0BAwKMxQlUyoJTQ/edit?usp=drive_link&ouid=115129819942252505059&rtpof=true&sd=true"

# Convert it to a CSV export link
csv_export_url = sheet_url.replace('/edit?usp=drive_link&ouid=115129819942252505059&rtpof=true&sd=true', '/export?format=csv')

# Read the CSV into a DataFrame
df = pd.read_csv(csv_export_url)
df

Unnamed: 0,URL_ID,URL
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...
3,bctech2014,https://insights.blackcoffer.com/effective-man...
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...
...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...


In [3]:
def extract(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text,"html.parser")
    title = soup.find("h1").text.strip()
    content = [i.text.strip() for i in soup.find_all("p",class_ = None)]
    return title, content


# # notes -------------------------------------------------------------------- just uper wale ka -----

# - parsing the HTML Content:
# soup = BeautifulSoup(page.text, 'html.parser'):

# This line uses BeautifulSoup to parse the HTML content of the page.
# page.text contains the raw HTML of the page.
# 'html.parser' is specified as the parser, which is a built-in parser in Python. It helps in navigating and manipulating HTML data.
# - Why 'html.parser'?:
# The html.parser tells BeautifulSoup how to parse the raw HTML text. There are other parsers available, like lxml or html5lib, but html.parser is a simple and effective choice for basic tasks.

# - Extracting the Title:
# title = soup.find('h1').text.strip():

# This line finds the first <h1> tag in the HTML document.
# The text attribute extracts the text content inside the <h1> tag.
# .strip() is used to remove any leading or trailing whitespace from the text.
# - Why <h1>?:
# The <h1> tag is typically used to define the main heading or title of a web page. By extracting the text inside this tag, you often get the primary title of the page.

# - Extracting the Paragraphs-
# content = [i.text.strip() for i in soup.find_all('p', class_=None)]:

# This line finds all <p> tags in the HTML that do not have a class attribute.
# The find_all('p', class_=None) function returns a list of all <p> elements matching the criteria.
# The for loop iterates over these elements, extracts the text with i.text.strip(), and stores them in the list content.
# - Why <p> and class_=None?:
# <p> tags generally contain paragraphs of text, which are key parts of web content.
# Specifying class_=None ensures that only paragraphs without a specific class are selected, potentially filtering out less relevant content.


In [None]:
for index, row in df.iterrows(): # For Loop with iterrows()
    url = row["URL"]   #Extracting URL and URL_ID
    url_id = row['URL_ID']

    
    # Print the URL ID
    print(url_id)

    
    # Extract title and text from the URL
    title, text = extract(url)

    
    # Check if the URL was reachable 
    if title is None:
        print(f"{j} is not reachable")    #formatted string literal, commonly known as an f-string. F-strings 
        continue
        file = f"{j}.txt"    # agr title none hai to not reachable print hoga , oor agr snone nahi hooga to continue rakhega....file ka naam dega....Create the filename based on the URL ID -       


    # Write the title and text to the file - file ke ander title or text -
    try:
        with open(file, 'w+', encoding='utf-8') as f:         # write mode ('w+')
            f.write(title + '\n')        # Writing the Title -This writes the title of the page to the file, followed by a newline ('\n') to separate it from the text.
           
            for line in text:       #  Writing Each Line of Text
                f.write(line + '\n')    # Purpose: This loop writes each paragraph of text (each item in the text list) to the file, followed by a newline
                
    except IOError as e:    # Exception Handling
        print(f"Failed to write file {file}: {e}")   # ye error message dega ...


# The term utf-8 refers to a character encoding that is widely used in computing ythe text in a unique no. for every characters. It stands for "8-bit Unicode Transformation Format"

#  try is used to test a block of code for errors.
# If an error occurs, the code inside the except block runs instead of crashing the program.
# In your case, it's used to safely handle potential errors when writing to a file, ensuring the program can continue or provide a meaningful error message if something goes wrong.

In [4]:
# Initialize NLTK resources
stop_words = set(nltk.corpus.stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
pronouns = {"i", "you", "he", "she", "it", "we", "they", "them", "us", "him", "her", "his", "hers", "its", "theirs", "our", "your"}

# Directory path where the text files are located
directory_path = 'C:\\Users\\Admin\\Desktop\\Blackcoffer_internship'                  

# Define paths for the positive and negative words files
positive_words_path = os.path.join(directory_path, 'positive-words.txt')
negative_words_path = os.path.join(directory_path, 'negative-words.txt')

# Global Variables: Variables declared outside functions that can be accessed anywhere in the code......isi liye function me ham kahi bhi dubara nahi diye hai file ka path ...ek baar upar de do syster automaticaly...sab kar lega....


def read_words(file_path): # in place of file_path we can give any sting.
    """Helper function to read and tokenize words from a file."""
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    return set(tokenizer.tokenize(text.lower()))

def positive_score(file):  # in place of file we can give any sting.
    """Calculate the positive score for the given file."""
    if not os.path.isfile(file):
        print(f"File not found: {file}")
        return 0
    
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    words = tokenizer.tokenize(text.lower())
    
    positive_words = read_words(positive_words_path)
    
    count = sum(1 for word in words if word in positive_words)
    return count

def negative_score(file):
    """Calculate the negative score for the given file."""
    if not os.path.isfile(file):
        print(f"File not found: {file}")
        return 0
    
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    words = tokenizer.tokenize(text.lower())
    
    negative_words = read_words(negative_words_path)
    
    count = sum(-1 for word in words if word in negative_words)
    return count

def polarity(file):
    """Calculate the polarity score for the given file."""
    pos_score = positive_score(file)
    neg_score = negative_score(file)
    return (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)

def subjectivity(file):
    """Calculate the subjectivity score for the given file."""
    pos_score = positive_score(file)
    neg_score = negative_score(file)
    total_words = len(tokenizer.tokenize(open(file, 'r', encoding='utf-8', errors='ignore').read().lower()))
    return (pos_score + neg_score) / (total_words + 0.000001)

def calculate_metrics(file_id):
    """Calculate all metrics for a given file ID."""
    file_path = os.path.join(directory_path, file_id + '.txt')  # Ensure .txt extension
    
    # Initialize metrics
    pos_score = positive_score(file_path)
    neg_score = negative_score(file_path)
    pol_score = polarity(file_path)
    subj_score = subjectivity(file_path)
    
    # Read the file
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    
    # Tokenize words and sentences
    words = tokenizer.tokenize(text)
    filtered_words = [w for w in words if w.lower() not in stop_words]
    sentences = sent_tokenize(text)
    
    # Calculate average sentence length
    sentence_count = len(sentences)
    word_count = len(filtered_words)
    avg_sentence_length = word_count / sentence_count if sentence_count else 0
    
    # Calculate complex words
    vowels = 'aeiouy'
    count_complex_words = 0
    for word in filtered_words:
        word = word.lower()
        count = 0
        if word[0] in vowels:
            count += 1
        for i in range(1, len(word)):
            if word[i] in vowels and word[i - 1] not in vowels:
                count += 1
        if word.endswith("e"):
            count -= 1
        if count == 0:
            count += 1
        if count > 2:
            count_complex_words += 1
    
    percentage_complex_words = count_complex_words / len(filtered_words) if len(filtered_words) else 0
    
    # Calculate FOG index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words) if avg_sentence_length else 0
    
    # Calculate average number of words per sentence
    total_words = len(words)
    avg_words_per_sentence = total_words / sentence_count if sentence_count else 0
    
    # Calculate syllables per word
    total_syllables = 0
    for word in words:
        word = word.lower()
        count = 0
        if word[0] in vowels:
            count += 1
        for i in range(1, len(word)):
            if word[i] in vowels and word[i - 1] not in vowels:
                count += 1
        if word.endswith("e"):
            count -= 1
        if count == 0:
            count += 1
        total_syllables += count
    
    syllables_per_word = total_syllables / len(words) if words else 0
    
    # Calculate personal pronouns count
    personal_pronouns_count = sum(1 for word in words if word.lower() in pronouns)
    
    # Calculate average word length
    total_length = sum(len(word) for word in words)
    avg_word_length = total_length / len(words) if words else 0
    
    # Return metrics as a dictionary
    return {
        'POSITIVE SCORE': pos_score,
        'NEGATIVE SCORE': neg_score,
        'POLARITY SCORE': pol_score,
        'SUBJECTIVITY SCORE': subj_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': count_complex_words,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns_count,
        'AVG WORD LENGTH': avg_word_length
    }

# Initialize an empty DataFrame for output
df_output = pd.DataFrame(columns=[
    'URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
    'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
])

def calculate_set(file_id, index):
    metrics = calculate_metrics(file_id)  # Get metrics for the file
    
    # Store the calculated values in the df_output DataFrame
    df_output.loc[index, 'URL_ID'] = file_id
    df_output.loc[index, 'URL'] = df.loc[index, 'URL']
    df_output.loc[index, 'POSITIVE SCORE'] = metrics['POSITIVE SCORE']
    df_output.loc[index, 'NEGATIVE SCORE'] = metrics['NEGATIVE SCORE']
    df_output.loc[index, 'POLARITY SCORE'] = metrics['POLARITY SCORE']
    df_output.loc[index, 'SUBJECTIVITY SCORE'] = metrics['SUBJECTIVITY SCORE']
    df_output.loc[index, 'AVG SENTENCE LENGTH'] = metrics['AVG SENTENCE LENGTH']
    df_output.loc[index, 'PERCENTAGE OF COMPLEX WORDS'] = metrics['PERCENTAGE OF COMPLEX WORDS']
    df_output.loc[index, 'FOG INDEX'] = metrics['FOG INDEX']
    df_output.loc[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = metrics['AVG NUMBER OF WORDS PER SENTENCE']
    df_output.loc[index, 'COMPLEX WORD COUNT'] = metrics['COMPLEX WORD COUNT']
    df_output.loc[index, 'WORD COUNT'] = metrics['WORD COUNT']
    df_output.loc[index, 'SYLLABLE PER WORD'] = metrics['SYLLABLE PER WORD']
    df_output.loc[index, 'PERSONAL PRONOUNS'] = metrics['PERSONAL PRONOUNS']
    df_output.loc[index, 'AVG WORD LENGTH'] = metrics['AVG WORD LENGTH']

# Process each file
for index, row in df.iterrows():
    file_id = row.get('URL_ID', 'Default_ID')  # Use .get() with a default value if key might be missing
    print(f"Processing URL_ID: {file_id}")
    try:
        calculate_set(file_id, index)
    except FileNotFoundError as e:
        print(e)

# Display the DataFrame
#print(df_output)

# Save the DataFrame to an Excel file
#df_output.to_excel('Output.xlsx', index=False)  # Save as Excel file

# Optionally, save as CSV file
df_output.to_csv('Output.csv', index=False)  # Save as CSV file


Processing URL_ID: bctech2011
Processing URL_ID: bctech2012
Processing URL_ID: bctech2013
Processing URL_ID: bctech2014
Processing URL_ID: bctech2015
Processing URL_ID: bctech2016
Processing URL_ID: bctech2017
Processing URL_ID: bctech2018
Processing URL_ID: bctech2019
Processing URL_ID: bctech2020
Processing URL_ID: bctech2021
Processing URL_ID: bctech2022
Processing URL_ID: bctech2023
Processing URL_ID: bctech2024
Processing URL_ID: bctech2025
Processing URL_ID: bctech2026
Processing URL_ID: bctech2027
Processing URL_ID: bctech2028
Processing URL_ID: bctech2029
Processing URL_ID: bctech2030
Processing URL_ID: bctech2031
Processing URL_ID: bctech2032
Processing URL_ID: bctech2033
Processing URL_ID: bctech2034
Processing URL_ID: bctech2035
Processing URL_ID: bctech2036
Processing URL_ID: bctech2037
Processing URL_ID: bctech2038
Processing URL_ID: bctech2039
Processing URL_ID: bctech2040
Processing URL_ID: bctech2041
Processing URL_ID: bctech2042
Processing URL_ID: bctech2043
Processing

In [5]:
df_output

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,159,-26,1.390977,0.167506,28.0,0.421429,11.368571,39.7,236,560,1.982368,6,5.647355
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,8,-3,2.2,0.05618,24.0,0.319444,9.727778,29.666667,23,72,1.921348,1,6.067416
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,11,-3,1.75,0.087912,24.0,0.375,9.75,30.333333,27,72,1.934066,3,5.989011
3,bctech2014,https://insights.blackcoffer.com/effective-man...,9,-4,2.599999,0.054945,24.0,0.375,9.75,30.333333,27,72,1.967033,2,6.021978
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,9,-3,2.0,0.066667,24.333333,0.383562,9.886758,30.0,28,73,1.988889,1,6.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,138,-40,1.816327,0.115159,17.323529,0.317487,7.056407,25.029412,187,589,1.836663,5,5.204465
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,254,-57,1.57868,0.148679,13.609375,0.299656,5.563612,20.703125,261,871,1.792453,13,5.273208
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,51,-17,2.0,0.141667,12.363636,0.183824,5.018984,21.818182,25,136,1.525,11,4.5875
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,0,0,0.0,0.0,16.0,0.375,6.55,16.0,6,16,3.125,0,9.1875
