In [1]:
# Showing which pip executable (and which Python it belongs to) the shell resolves
!pip --version

pip 21.0.1 from /shared/centos7/anaconda3/2021.05/lib/python3.8/site-packages/pip (python 3.8)


In [2]:
# Installing the langid library for language identification

!pip install langid

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import langid  
import pandas as pd 
import json  
from bs4 import BeautifulSoup, Comment  
import requests  
import threading  

In [6]:
# Loading the dataset containing URLs and their categories.
# NOTE(review): later cells refer to columns named 'Adult' and
# 'http://www.liquidgeneration.com/', which suggests the CSV has no header row
# and pandas used its first data row as the column names -- confirm, and
# consider reading with header=None plus explicit column names.

df = pd.read_csv("URL Classification.csv")

In [11]:
# Keeping only the rows whose category is not "Adult"
# (the category column is still called "Adult" at this point)

df = df.loc[df["Adult"] != "Adult"]

In [15]:
# Listing the distinct category labels that remain in the dataset

df["Adult"].unique()

array(['Arts', 'Business', 'Computers', 'Games', 'Health', 'Home', 'Kids',
       'News', 'Recreation', 'Reference', 'Science', 'Shopping',
       'Society', 'Sports'], dtype=object)

In [16]:
# Total number of elements in the dataset.
# DataFrame.size counts individual cells (rows x columns), not rows.

df.size

4582959

In [17]:
# Number of rows in the dataset (first entry of the (rows, columns) shape)

df.shape[0]

1527653

In [18]:
# Giving the URL and category columns descriptive names

df = df.rename(
    columns={
        "http://www.liquidgeneration.com/": "URL",  # column holding the page address
        "Adult": "category",                        # column holding the category label
    }
)

In [19]:
# Defining custom headers for HTTP requests.
# extract_html_from_doc sends this dict with every request: a browser-style
# User-Agent string and a fixed Referer.
# NOTE(review): presumably intended to avoid sites rejecting the default
# python-requests client -- confirm.

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.9999.999 Safari/537.36',
    'Referer': 'https://example.com'
}

# Function to extract HTML content from a webpage

def extract_html_from_doc(url, category, timeout=10):
    """Download a page and turn it into a prompt/output record.

    A record is produced only when all of the following hold:
      * the request succeeds with a non-error HTTP status,
      * the prettified HTML (comments removed) is longer than 10,000 characters,
      * langid classifies the page text as English ('en'),
      * the page has at least one <style> tag and a non-empty <title>.

    Args:
        url: Address of the page to download.
        category: Category label; concatenated into the prompt string.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the calling thread forever.

    Returns:
        A dict with keys 'prompt' and 'output' (the prettified HTML with
        newlines removed), or None when the page is rejected or any error
        occurs while fetching or parsing it.
    """
    try:
        # Sending HTTP request to the URL with custom headers
        response = requests.get(url, headers=headers, timeout=timeout)

        # Error pages (4xx/5xx) are not useful samples
        if not response.ok:
            return None

        # Parsing the HTML response using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # get_cleaned_html strips comments in place and returns the same soup
        clean_soup = get_cleaned_html(soup)

        # Prettify once; the result is used for both the size check and the output
        pretty_html = clean_soup.prettify()
        if len(pretty_html) <= 10000:
            return None

        # Proceeding only if the language is identified as English
        language, _ = langid.classify(clean_soup.get_text())
        if language != 'en':
            return None

        # Proceeding only if style tags and a non-empty title are present
        style_tags = clean_soup.find_all('style')
        title_tag = clean_soup.title
        title = title_tag.text.strip() if title_tag is not None else ''
        if not style_tags or not title:
            return None

        return {
            'prompt': "create a " + category + " website for " + title,
            'output': pretty_html.replace('\n', ''),
        }
    except Exception:
        # Best-effort scraping: one bad URL (network failure, invalid URL,
        # non-string category, parser error) must not abort a whole batch.
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        return None


# Function to remove HTML comments from parsed HTML content

def get_cleaned_html(soup):
    """Strip every HTML comment node from ``soup`` and return it.

    The tree is modified in place; the returned object is the same ``soup``
    that was passed in.
    """
    def _is_comment(node):
        return isinstance(node, Comment)

    for node in soup.find_all(string=_is_comment):
        node.extract()
    return soup

In [20]:
# Previewing the first rows to check the renamed columns
df.head()

Unnamed: 0,1,URL,category
35324,35326,http://www.awn.com,Arts
35325,35327,http://animation.about.com/,Arts
35326,35328,http://www.toonhound.com,Arts
35327,35329,http://www.geocities.com/d-patanella/,Arts
35328,35330,http://enculturation.gmu.edu/2_1/pisters.html,Arts


In [21]:
# Function to extract data from one batch of URLs (intended to be run as one thread per batch)

def getData(df, num, f, batch_size=1000):
    """Scrape one batch of rows from ``df`` and save the results as JSON.

    Rows ``num`` .. ``num + batch_size - 1`` (positional) are processed one
    after another; the batch is clipped at the end of the frame, so a batch
    that runs past the last row still writes what it collected.

    Args:
        df: DataFrame with 'URL' and 'category' columns.
        num: Positional index of the first row of the batch.
        f: Suffix for the output file name; results go to 'data<f>.json'.
        batch_size: Maximum number of rows to process (default 1000).
    """
    stop = min(num + batch_size, len(df))
    output_data = []
    for d in range(num, stop):
        row = df.iloc[d]
        output = extract_html_from_doc(row['URL'], row['category'])
        if output is not None:
            output_data.append(output)

        # Printing progress for every 25 URLs processed
        if d % 25 == 0:
            print(d)

    # Writing extracted data to a JSON file (serialized once, after the loop)
    with open('data' + f + '.json', 'w') as json_file:
        json.dump(output_data, json_file, indent=4)


In [22]:
# Creating five worker threads; thread k scrapes the batch that starts at
# row 1,000,000 + 1,000 * (k - 1) and writes its results to data<k>.json

t1, t2, t3, t4, t5 = (
    threading.Thread(target=getData, args=(df, 1000000 + 1000 * k, str(k + 1)))
    for k in range(5)
)

In [23]:
# Launching all worker threads, in order, so the batches are scraped concurrently

for worker in (t1, t2, t3, t4, t5):
    worker.start()