In [None]:
!pip install bs4


import requests
from bs4 import BeautifulSoup

import pandas as pd
import re

import time
from IPython.display import clear_output


clear_output()

In [None]:
# Base URL of the catalog listing; a '?page=N' query is appended per page.
url_prefix = 'https://catalog.data.gov/dataset'
# Target number of dataset titles to collect.
total_data = 22000
# Number of listing pages to crawl.
# assumes the catalog shows 20 datasets per page — TODO confirm
pages = total_data // 20

print(f'pages = {pages}')

In [None]:
def get_docs_by_page(url, timeout=30):
    """Fetch one catalog listing page and return the dataset titles found on it.

    Parameters
    ----------
    url : str
        Full URL of the listing page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Without a timeout a single stalled connection would
        block the whole crawl indefinitely.

    Returns
    -------
    list of str
        Text of every element carrying the ``dataset-heading`` class, in
        document order, with its first and last character removed.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    # Fixed pause after every request so consecutive page fetches are throttled.
    time.sleep(3)
    r.encoding = "UTF-8"

    soup = BeautifulSoup(r.text, 'html.parser')
    dataset_blocks = soup.find_all(class_="dataset-heading")

    # [1:-1] drops one character from each end of the heading text
    # (presumably the newlines wrapping the title in the markup — TODO confirm).
    return [dataset_block.get_text()[1:-1] for dataset_block in dataset_blocks]


def get_all_pages(first_page, last_page, url):
    """Collect dataset titles from a contiguous range of listing pages.

    Parameters
    ----------
    first_page : int
        Number of the first page to fetch (inclusive).
    last_page : int
        Number of the last page to fetch (inclusive).
    url : str
        Base listing URL; ``'?page=<n>'`` is appended for each page.

    Returns
    -------
    list
        All titles returned by ``get_docs_by_page`` for the requested pages,
        concatenated in page order. Empty when ``last_page < first_page``.
    """
    data = []
    print(f'page number from {first_page} to {(last_page)}')
    for page_number in range(first_page, last_page+1):
        # '\r' keeps the progress counter on a single, overwritten line.
        print(f'page number: {page_number}', end='\r')
        # Build the page URL from the ``url`` argument rather than from a
        # module-level global, so the caller really controls what is crawled.
        page_url = url + '?page=' + str(page_number)
        data.extend(get_docs_by_page(page_url))

    # Finish the progress line with the last page fetched. Skipped for an
    # empty range, where no page number exists to report.
    if last_page >= first_page:
        print(f'page number: {last_page}')
    print(f'len(data) = {len(data)}')

    return data

In [None]:
%%time

# Crawl listing pages 1..pages and gather every dataset title as a plain list.
popular_titles = get_all_pages(first_page=1, last_page=pages, url=url_prefix)

# Wrap the titles in a single-column frame for the cleaning and export steps.
additional_gov_datasets_popular = pd.DataFrame(popular_titles, columns=['title'])
additional_gov_datasets_popular.head()

In [None]:
def text_cleaning(text):
    '''
    Normalise a piece of text for matching.

    The input is converted with ``str()``, lower-cased, every run of
    characters other than ASCII letters and digits is replaced by a single
    space, and leading/trailing spaces are removed.

    text - Sentence that needs to be cleaned (any object; non-strings are
           stringified first, so ``None`` becomes ``'none'``)

    Returns the cleaned string, containing only ``a-z``, ``0-9`` and single
    spaces.
    '''
    # One substitution is sufficient: special characters and emojis all fall
    # outside [A-Za-z0-9], and because '+' consumes each whole run, the
    # result can never contain two adjacent spaces. Separate space-collapsing
    # and emoji-stripping passes would therefore never change anything.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

In [None]:
# Normalise every scraped title, overwriting the column with the cleaned text.
cleaned_titles = additional_gov_datasets_popular['title'].apply(text_cleaning)
additional_gov_datasets_popular['title'] = cleaned_titles

# Persist the cleaned titles; the file name records the configured target count.
additional_gov_datasets_popular.to_csv(
    f'additional_gov_datasets_{total_data}popular.csv',
    index=False,
)
additional_gov_datasets_popular.head()

In [None]:
# Existing label list that the scraped titles will be appended to.
labels_csv_path = '../input/bigger-govt-dataset-list/data_set_800.csv'
adnl_govt_labels = pd.read_csv(labels_csv_path)
adnl_govt_labels.head()

In [None]:
# Write cumulative snapshots: the existing labels followed by the first
# `final_row` scraped titles, growing in steps of 2000 up to `total_data`.
for final_row in range(2000, total_data+1, 2000):
    popular_subset = additional_gov_datasets_popular.iloc[:final_row]
    # ignore_index=True renumbers the combined rows from 0.
    data_set_800_with_popular = pd.concat(
        [adnl_govt_labels, popular_subset],
        ignore_index=True,
    )
    data_set_800_with_popular.to_csv(f'data_set_800_with{final_row}popular.csv', index=False)
    print(f'data_set_800_with{final_row}popular.csv is saved with len = {len(data_set_800_with_popular)}')

# Display the largest (last written) snapshot.
data_set_800_with_popular