In [1]:
# Installing required libraries

import requests
from bs4 import BeautifulSoup
import nltk as nltk
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import re

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ual-laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Downloading NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\ual-
[nltk_data]     laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\ual-
[nltk_data]     laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup

# the health website which I have chosen
base_url = 'https://www.1mg.com/'

def get_page_content(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        response = session.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        text_content = soup.get_text()
        return text_content
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None

def get_all_links(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', href=True)
        new_links = []
        for link in links:
            href = link['href']
            if href.startswith('http'):
                new_links.append(href)
            elif href.startswith('/'):
                new_links.append('https://www.1mg.com' + href)
        return new_links
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return []


def scrape_website(base_url, max_pages=100):
    visited_urls = set()
    pages_to_visit = [base_url]
    pages_scraped = 0
    text_collection = []

    while pages_to_visit and pages_scraped < max_pages:
        current_url = pages_to_visit.pop(0)

        if current_url in visited_urls:
            continue

        text_content = get_page_content(current_url)
        if text_content:
            print(f"Scraping {current_url}")
            text_collection.append(text_content)

            # Marking the current URLs as visited only if successfully scraped - important
            visited_urls.add(current_url)

            page_links = get_all_links(current_url)
            if page_links:
                pages_to_visit.extend(page_links)

            pages_scraped += 1

    return text_collection

result_text_collection = scrape_website(base_url, max_pages=100)

Scraping https://www.1mg.com/
Scraping https://www.1mg.com/labs
Scraping https://www.1mg.com/online-doctor-consultation
Scraping https://www.1mg.com/cancer-care/home
Scraping https://www.1mg.com/ayurveda
Scraping https://www.1mg.com/subscription-plan/subscriptions
Scraping https://www.1mg.com/offers
Scraping https://www.1mg.com/help
Scraping https://www.1mg.com/order-with-prescription
Scraping https://www.1mg.com/all-diseases
Scraping https://www.1mg.com/drugs-all-medicines
Scraping https://www.1mg.com/drugs-therapeutic-classes
Scraping https://www.1mg.com/categories/fitness-supplements-5
Scraping https://www.1mg.com/categories/fitness-supplements/vitamins-minerals-9
Scraping https://www.1mg.com/categories/fitness-supplements/vitamins-minerals/multivitamins-162
Scraping https://www.1mg.com/categories/vitamin-supplements/vitamin-d-121
Scraping https://www.1mg.com/categories/exclusive/immunity-boosters/vitamin-c-734
Scraping https://www.1mg.com/categories/fitness-supplements/vitamins-min

In [9]:
def scrape_and_calculate_statistics(urls_and_texts):
    all_statistics = []
    total_statistics = {
        "num_sentences": 0,
        "num_words": 0,
        "avg_word_length": 0,
        "lexical_diversity": 0,
        "percent_stop_words": 0,
        "noun_count": 0,
    }

    for url, text1 in urls_and_texts:
        # Skip if text1 is None (indicating an issue with fetching the page content)
        if text1 is None:
            print(f"Skipping {url} due to missing page content.")
            continue
        original_sentence = sent_tokenize(text1)
        text1_clean= re.sub(r'[^\w\s]', '', text1 ) # Remove punctuation

        # Tokenize the text
        sentences = sent_tokenize(text1_clean.lower())
        words = word_tokenize(text1_clean.lower())



        # Remove stop words and calculate the percentage of stop words
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.lower() not in stop_words]
        percent_stop_words = (len(words) - len(filtered_words)) / len(words) * 100 if len(words) > 0 else 0

        # Calculate NLP statistics
        num_sentences = len(original_sentence)
        num_words = len(words)

        # Calculate average word length
        avg_word_length = sum(len(word) for word in words) / num_words if num_words > 0 else 0

        # Calculate lexical diversity
        lexical_diversity = len(set(words)) / num_words if num_words > 0 else 0

        # Calculate the frequency distribution of words
        fdist = FreqDist(filtered_words)
        common_words = fdist.most_common(5)  # Get the 5 most common words

        # # Remove stop words and calculate the percentage of stop words
        # stop_words = set(stopwords.words('english'))
        # filtered_words = [word for word in words if word.lower() not in stop_words]
        # percent_stop_words = (len(words) - len(filtered_words)) / len(words) * 100 if len(words) > 0 else 0

        # Part of Speech tagging
        pos_tags = pos_tag(filtered_words)
        noun_count = sum(1 for word, pos in pos_tags if pos in ['NN', 'NNS'])

        # Store the results in a file
        with open('nlp_statistics.txt', 'a') as file:
            file.write(f"URL: {url}\n")
            file.write(f"Number of Sentences: {num_sentences}\n")
            file.write(f"Number of Words: {num_words}\n")
            file.write(f"Average Word Length: {avg_word_length:.2f}\n")
            file.write(f"Lexical Diversity: {lexical_diversity:.2%}\n")
            file.write(f"Common Words: {common_words}\n")
            file.write(f"Percentage of Stop Words: {percent_stop_words:.2f}%\n")
            file.write(f"Noun Count: {noun_count}\n")
            file.write("\n")  # Add a separator between entries

        statistics = {
            "url": url,
            "num_sentences": num_sentences,
            "num_words": num_words,
            "avg_word_length": avg_word_length,
            "lexical_diversity": lexical_diversity,
            "common_words": common_words,
            "percent_stop_words": percent_stop_words,
            "noun_count": noun_count,
        }

        all_statistics.append(statistics)

        total_statistics["num_sentences"] += statistics["num_sentences"]
        total_statistics["num_words"] += statistics["num_words"]
        total_statistics["avg_word_length"] += statistics["avg_word_length"]
        total_statistics["lexical_diversity"] += statistics["lexical_diversity"]
        total_statistics["percent_stop_words"] += statistics["percent_stop_words"]
        total_statistics["noun_count"] += statistics["noun_count"]

        num_pages = len(urls_and_texts)
        if num_pages > 0:
            total_statistics["avg_word_length"] /= num_pages
            total_statistics["lexical_diversity"] /= num_pages
            total_statistics["percent_stop_words"] /= num_pages

    return all_statistics,total_statistics


def write_aggregated_results_to_file(filename, total_statistics):
    with open(filename, 'w') as file:
        file.write("Aggregated Results\n")
        file.write("==================\n")
        file.write(f"Average Number of Sentences: {total_statistics['num_sentences']}\n")
        file.write(f"Average Number of Words: {total_statistics['num_words']}\n")
        file.write(f"Average Average Word Length: {total_statistics['avg_word_length']:.2f}\n")
        file.write(f"Average Lexical Diversity: {total_statistics['lexical_diversity']:.2%}\n")
        file.write(f"Average Percentage of Stop Words: {total_statistics['percent_stop_words']:.2f}%\n")
        file.write(f"Average Noun Count: {total_statistics['noun_count']}\n")

In [10]:
# Assuming result_text_collection is a list of text content
text_collection_size = len(result_text_collection)
urls = [f"{base_url}{i}" for i in range(1, text_collection_size + 1)]

# Combine URLs and corresponding text content into a list of tuples
urls_and_texts = list(zip(urls, result_text_collection))

# Call the function with the list of tuples
all_statistics,total_stats = scrape_and_calculate_statistics(urls_and_texts)
write_aggregated_results_to_file('aggregated_results.txt', total_stats)

print(all_statistics)


# Now 'all_statistics' contains a list of dictionaries with the calculated statistics for each URL.
# You can use this list as needed.

[{'url': 'https://www.1mg.com/1', 'num_sentences': 4, 'num_words': 881, 'avg_word_length': 16.68104426787741, 'lexical_diversity': 0.7911464245175936, 'common_words': [('skin', 9), ('health', 9), ('care', 8), ('10pm', 7), ('collagen', 6)], 'percent_stop_words': 7.604994324631101, 'noun_count': 471}, {'url': 'https://www.1mg.com/2', 'num_sentences': 56, 'num_words': 1574, 'avg_word_length': 5.517789072426938, 'lexical_diversity': 0.3761118170266836, 'common_words': [('lab', 36), ('1mg', 30), ('tests', 28), ('test', 19), ('health', 17)], 'percent_stop_words': 34.49809402795426, 'noun_count': 533}, {'url': 'https://www.1mg.com/3', 'num_sentences': 1, 'num_words': 17, 'avg_word_length': 7.647058823529412, 'lexical_diversity': 0.9411764705882353, 'common_words': [('doctor', 2), ('online', 1), ('consultation', 1), ('india', 1), ('247', 1)], 'percent_stop_words': 5.88235294117647, 'noun_count': 11}, {'url': 'https://www.1mg.com/4', 'num_sentences': 29, 'num_words': 1033, 'avg_word_length': 7.

In [11]:
print(total_stats)

{'num_sentences': 1798, 'num_words': 160511, 'avg_word_length': 0.1305124752700487, 'lexical_diversity': 0.006477044665663199, 'percent_stop_words': 0.12954536659820068, 'noun_count': 78037}


In [10]:
#THE END