# Dataset Creation
In this notebook, I will create a dataset that will be used for training the website classifier.
The website collection with labelled categories is collected from https://www.kaggle.com/patanjalichin1/dmoztoolsnet-categorical-data  
I will use BeautifulSoup to parse the HTML content of the page and use requests library to fetch the page

In [1]:
from bs4 import BeautifulSoup
import bs4 as bs4
from urllib.parse import urlparse
import requests
from collections import Counter
import pandas as pd
import os

## Building a Scraper Class that matches our requirement

In [13]:
class ScraperTool:
    def visit_url(self, website_url):
        '''
        Visit URL. Download the Content. Initialize the beautifulsoup object. Call parsing methods. Return Series object.
        '''
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
        content = requests.get(website_url,headers=headers,timeout=10).content
        
        #lxml is apparently faster than other settings.
        soup = BeautifulSoup(content, "lxml")
        result = {
            "website_url": website_url,
            "website_name": self.get_website_name(website_url),
            "title_tag_content": self.get_html_title_tag(soup),
            "meta_tag_content": self.get_html_meta_tags(soup),
            "headings_content": self.get_html_heading_tags(soup),
            "html_text_content": self.get_text_content(soup)
        }
        #get_tag_count returns a dynamic dictionary. That is why we update the dictionary with a separate update command.
        result.update(self.get_tag_count(soup))
        
        #Convert to Series object and return
        return pd.Series(result)
    
    def get_website_name(self,website_url):
        '''
        Example: returns "google" from "www.google.com"
        '''
        return "".join(urlparse(website_url).netloc.split(".")[-2])
    
    def get_html_title_tag(self,soup):
        '''Return the text content of <title> tag from a webpage'''
        return '. '.join(soup.title.contents)
    
    def get_html_meta_tags(self,soup):
        '''Returns the text content of <meta> tags related to keywords and description from a webpage'''
        tags = soup.find_all(lambda tag: (tag.name=="meta") & (tag.has_attr('name') & (tag.has_attr('content'))))
        content = [str(tag["content"]) for tag in tags if tag["name"] in ['keywords','description']]
        return ' '.join(content)
    
    def get_html_heading_tags(self,soup):
        '''returns the text content of heading tags. The assumption is that headings might contain relatively important text.'''
        tags = soup.find_all(["h1","h2","h3","h4","h5","h6"])
        content = [" ".join(tag.stripped_strings) for tag in tags]
        return ' '.join(content)
    
    def get_text_content(self,soup):
        '''returns the text content of the whole page with some exception to tags. See tags_to_ignore.'''
        tags_to_ignore = ['style', 'script', 'head', 'title', 'meta', '[document]',"h1","h2","h3","h4","h5","h6","noscript"]
        tags = soup.find_all(text=True)
        result = []
        for tag in tags:
            stripped_tag = tag.strip()
            if tag.parent.name not in tags_to_ignore\
                and isinstance(tag, bs4.element.Comment)==False\
                and not stripped_tag.isnumeric()\
                and len(stripped_tag)>0:
                result.append(stripped_tag)
        return ' '.join(result)

    def get_tag_count(self,soup):
        '''returns a dictionary with the frequency of tag for every unique tag found in the page.'''
        tags = soup.find_all()
        return dict(Counter(["tag_count_" + tag.name for tag in tags]))

## Example of using the class to scrape some websites

In [14]:
scraperTool = ScraperTool()
urls = ["https://cnn.com/","https://sastodeal.com"]
series_list = []
for url in urls:
    series_list.append(scraperTool.visit_url(url))
pd.DataFrame(pd.concat(series_list,axis=1,sort=False)).T

Unnamed: 0,website_url,website_name,title_tag_content,meta_tag_content,headings_content,html_text_content,tag_count_html,tag_count_head,tag_count_meta,tag_count_link,...,tag_count_main,tag_count_h2,tag_count_fieldset,tag_count_select,tag_count_option,tag_count_figure,tag_count_h3,tag_count_small,tag_count_h4,tag_count_br
0,https://cnn.com/,cnn,"CNN International - Breaking News, US News, Wo...",Find the latest breaking news and information ...,,World US Politics Business Health Entertainmen...,1,1,22,58,...,,,,,,,,,,
1,https://sastodeal.com,sastodeal,Online shopping in Nepal | Buy online in Nepal...,The biggest online shopping store in kathmandu...,My Cart My Account Real email address is requi...,JavaScript seems to be disabled in your browse...,1,1,9,5,...,1.0,4.0,5.0,1.0,4.0,32.0,2.0,9.0,13.0,3.0


# Combining the list of websites into a dataframe

In [760]:
data_directory = "data/"
files = os.listdir(data_directory)
df_list = []
for file in files:
    df_list.append(pd.read_csv(data_directory+file))
websites_df = pd.concat(df_list,sort=False)

In [761]:
websites_df.shape, websites_df.columns

((825848, 7),
 Index(['Description', 'Title', 'URL', 'Sites', 'CCat', 'MCat', 'SCat'], dtype='object'))

### Saving to Pickle for later user

In [763]:
websites_df.to_pickle("dmoz_websites_collection.pkl")

## Loading data from pickle

In [4]:
websites_df = pd.read_pickle("dmoz_websites_collection.pkl")

In [8]:
websites_df.shape

(825848, 7)

## Let's create a subset of the dataset. Randomly selecting 100 websites from each Main category.

In [5]:
website_main_categories = websites_df["MCat"].unique()

In [20]:
websites_df_subset = []
for category in website_main_categories:
    websites_df_subset.append(websites_df[websites_df["MCat"]==category].sample(100))
websites_df_subset = pd.concat(websites_df_subset,sort=False).reset_index()

In [21]:
len(websites_df_subset["Sites"].unique())

1100

## Let's loop through these websites, scrape them and add them to the dataframe

In [22]:
websites_df_subset.shape

(1100, 8)

In [23]:
series_list = []
for index,row in websites_df_subset.iterrows():
    print("Scraping URL: ",row["Sites"])
    try:
        series_list.append(scraperTool.visit_url(row["Sites"]))
        print("Scraping success = True. Url: ",row["Sites"])
    except Exception as e:
        print("Scraping success = False.Url: ",row["Sites"])
        print(str(e))
        print("\n")

Scraping URL:  http://allreaders.com/movie-review-summary/independence-day-3602
Scraping success = True. Url:  http://allreaders.com/movie-review-summary/independence-day-3602
Scraping URL:  http://www.imdb.com/name/nm0177933/
Scraping success = True. Url:  http://www.imdb.com/name/nm0177933/
Scraping URL:  http://www.carterusm.co.uk/
Scraping success = True. Url:  http://www.carterusm.co.uk/
Scraping URL:  http://www.voicesofafrica.net/
Scraping success = True. Url:  http://www.voicesofafrica.net/
Scraping URL:  http://www.jessebaxter.com/
Scraping success = True. Url:  http://www.jessebaxter.com/
Scraping URL:  http://www.tupac.be/
Scraping success = True. Url:  http://www.tupac.be/
Scraping URL:  http://www.melodymenchorus.org/
Scraping success = True. Url:  http://www.melodymenchorus.org/
Scraping URL:  http://bulgakov.stormloader.com/
Scraping success = True. Url:  http://bulgakov.stormloader.com/
Scraping URL:  http://www.idapostle.com/
Scraping success = True. Url:  http://www.i

Scraping success = False.Url:  http://www.washingtonpost.com/wp-srv/style/longterm/movies/videos/pumpupthevolumerkempley_a0a14d.htm
HTTPConnectionPool(host='www.washingtonpost.com', port=80): Read timed out. (read timeout=10)


Scraping URL:  http://www.haro-online.com/movies/fight_club.html
Scraping success = True. Url:  http://www.haro-online.com/movies/fight_club.html
Scraping URL:  http://www.paperwishes.com/pwideas/
Scraping success = True. Url:  http://www.paperwishes.com/pwideas/
Scraping URL:  http://www.radar-love.net/
Scraping success = True. Url:  http://www.radar-love.net/
Scraping URL:  http://lyrics.rockmagic.net/lyrics/king_diamond/
Scraping success = True. Url:  http://lyrics.rockmagic.net/lyrics/king_diamond/
Scraping URL:  http://markarayner.com/emily/
Scraping success = True. Url:  http://markarayner.com/emily/
Scraping URL:  http://www.aibc.ca/
Scraping success = True. Url:  http://www.aibc.ca/
Scraping URL:  http://www.sapphireswan.com/dance/
Scraping success = Tru

Scraping success = True. Url:  http://www.dataperceptions.co.uk/
Scraping URL:  http://www.iaf-bs.de/
Scraping success = True. Url:  http://www.iaf-bs.de/
Scraping URL:  http://www.datalex.com/
Scraping success = False.Url:  http://www.datalex.com/
HTTPSConnectionPool(host='www.datalex.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.foamtechniques.co.uk/
Scraping success = True. Url:  http://www.foamtechniques.co.uk/
Scraping URL:  http://www.chester-jensen.com/
Scraping success = True. Url:  http://www.chester-jensen.com/
Scraping URL:  http://www.jana-williams.com/
Scraping success = True. Url:  http://www.jana-williams.com/
Scraping URL:  http://www.robomedia.com/
Scraping success = True. Url:  http://www.robomedia.com/
Scraping URL:  http://www.firstib.com/
Scraping success = True. Url:  http://www.firstib.com/
Scrap

Scraping success = True. Url:  http://www.naplesvideography.com/
Scraping URL:  http://www.spongobongo.com/her9997.htm
Scraping success = False.Url:  http://www.spongobongo.com/her9997.htm
HTTPSConnectionPool(host='www.furniture.com', port=443): Max retries exceeded with url: /rugs/guide/oriental/turkmen/ersari (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.sunriseequipments.com/
Scraping success = True. Url:  http://www.sunriseequipments.com/
Scraping URL:  http://www.davidjamesonarchitect.com/
Scraping success = True. Url:  http://www.davidjamesonarchitect.com/
Scraping URL:  http://home.btclick.com/energbco/index.htm
Scraping success = False.Url:  http://home.btclick.com/energbco/index.htm
'NoneType' object has no attribute 'contents'


Scraping URL:  http://www.elwoodstudio.com/
Scraping success = True. Url:  http://www.elwoodstudio.com/
Scraping URL:  http://www.l

Scraping success = True. Url:  http://cristal.inria.fr/whizzytex/
Scraping URL:  http://www.lowgmarketing.com/
Scraping success = True. Url:  http://www.lowgmarketing.com/
Scraping URL:  http://www.pnotepad.org/
Scraping success = True. Url:  http://www.pnotepad.org/
Scraping URL:  http://www.michelleotoole.com/
Scraping success = True. Url:  http://www.michelleotoole.com/
Scraping URL:  http://www.must.de/default.html?cameleon.html
Scraping success = True. Url:  http://www.must.de/default.html?cameleon.html
Scraping URL:  http://www.prgdesign.com/
Scraping success = True. Url:  http://www.prgdesign.com/
Scraping URL:  http://www.iisprotect.com/
Scraping success = True. Url:  http://www.iisprotect.com/
Scraping URL:  http://tools.ietf.org/html/rfc2668
Scraping success = True. Url:  http://tools.ietf.org/html/rfc2668
Scraping URL:  http://www.serveradmins.biz/
Scraping success = False.Url:  http://www.serveradmins.biz/
HTTPConnectionPool(host='www.serveradmins.biz', port=80): Max retrie

Scraping success = True. Url:  http://www.gamespot.com/call-of-duty-2/summary/
Scraping URL:  http://www.eurogamer.net/articles/wolfenstein-preview
Scraping success = True. Url:  http://www.eurogamer.net/articles/wolfenstein-preview
Scraping URL:  http://www.mobygames.com/game/sheet/gameId=1373/
Scraping success = True. Url:  http://www.mobygames.com/game/sheet/gameId=1373/
Scraping URL:  http://www.pro.or.jp/~fuji/java/puzzle/slide/index-eng.html
Scraping success = True. Url:  http://www.pro.or.jp/~fuji/java/puzzle/slide/index-eng.html
Scraping URL:  http://www.zsnes.com/
Scraping success = True. Url:  http://www.zsnes.com/
Scraping URL:  http://www.neoseeker.com/wcwbackstage/
Scraping success = True. Url:  http://www.neoseeker.com/wcwbackstage/
Scraping URL:  http://www.testors.com/
Scraping success = False.Url:  http://www.testors.com/
HTTPSConnectionPool(host='www.testors.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error([('SSL rou

Scraping success = True. Url:  http://www.hamumu.com/
Scraping URL:  http://activewin.com/xbox/
Scraping success = False.Url:  http://activewin.com/xbox/
HTTPConnectionPool(host='activewin.com', port=80): Read timed out. (read timeout=10)


Scraping URL:  http://www.ukpinballleague.co.uk/
Scraping success = True. Url:  http://www.ukpinballleague.co.uk/
Scraping URL:  http://thunder.prohosting.com/~paranoiz/coding.html#BlackHole
Scraping success = False.Url:  http://thunder.prohosting.com/~paranoiz/coding.html#BlackHole
HTTPConnectionPool(host='thunder.prohosting.com', port=80): Max retries exceeded with url: /~paranoiz/coding.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001686BCCFF28>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))


Scraping URL:  http://www.angelfire.com/fl/chessninja/
Scraping success = False.Url:  http://www.angelfire.com/fl/chessninja/
'NoneType' object has no attribute 'contents'


Scraping URL:

Scraping success = True. Url:  http://cosmixinc.com/
Scraping URL:  http://news.bbc.co.uk/1/hi/health/background_briefings/smoking/281167.stm
Scraping success = True. Url:  http://news.bbc.co.uk/1/hi/health/background_briefings/smoking/281167.stm
Scraping URL:  http://www.lymphomainfo.net/nhl/
Scraping success = True. Url:  http://www.lymphomainfo.net/nhl/
Scraping URL:  http://www.einstein.edu/
Scraping success = True. Url:  http://www.einstein.edu/
Scraping URL:  http://www.multiplesclerosis.com/
Scraping success = True. Url:  http://www.multiplesclerosis.com/
Scraping URL:  http://www.osh.org.il/
Scraping success = True. Url:  http://www.osh.org.il/
Scraping URL:  http://www.simcoemuskokahealth.org/
Scraping success = True. Url:  http://www.simcoemuskokahealth.org/
Scraping URL:  http://www.jackbuckbook.com/homepage.asp
Scraping success = False.Url:  http://www.jackbuckbook.com/homepage.asp
HTTPConnectionPool(host='www.jackbuckbook.com', port=80): Max retries exceeded with url: /hom

Scraping success = True. Url:  http://tosnipornot.blogspot.com/
Scraping URL:  http://www.emedicinehealth.com/heart_attack/article_em.htm
Scraping success = True. Url:  http://www.emedicinehealth.com/heart_attack/article_em.htm
Scraping URL:  http://www.mainenephrology.com/
Scraping success = True. Url:  http://www.mainenephrology.com/
Scraping URL:  http://www.nmdental.org/
Scraping success = True. Url:  http://www.nmdental.org/
Scraping URL:  http://autismcanada.org/
Scraping success = True. Url:  http://autismcanada.org/
Scraping URL:  http://www.abrint.net/
Scraping success = True. Url:  http://www.abrint.net/
Scraping URL:  http://www.healnhc.org/
Scraping success = True. Url:  http://www.healnhc.org/
Scraping URL:  http://www.drbasko.com/
Scraping success = True. Url:  http://www.drbasko.com/
Scraping URL:  http://www.pacificawellness.com/
Scraping success = True. Url:  http://www.pacificawellness.com/
Scraping URL:  http://tmjtalk.yuku.com/
Scraping success = True. Url:  http://

Scraping success = True. Url:  http://www.thenewhomemaker.com/node/555
Scraping URL:  http://www.recipesource.com/soups/soups/09/rec0908.html
Scraping success = False.Url:  http://www.recipesource.com/soups/soups/09/rec0908.html
HTTPSConnectionPool(host='www.recipesource.com', port=443): Max retries exceeded with url: /soups/soups/09/rec0908.html (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.recipesource.com/soups/stews/04/rec0442.html
Scraping success = False.Url:  http://www.recipesource.com/soups/stews/04/rec0442.html
HTTPSConnectionPool(host='www.recipesource.com', port=443): Max retries exceeded with url: /soups/stews/04/rec0442.html (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.gregorylee.com/
Scraping success = True. Url:  http://www.gregoryle

Scraping success = True. Url:  http://www.vrg.org/recipes/egypt.htm
Scraping URL:  http://www.autoblog.com/chrysler/reviews/
Scraping success = True. Url:  http://www.autoblog.com/chrysler/reviews/
Scraping URL:  http://drink.betterrecipes.com/tearecipes.html
Scraping success = True. Url:  http://drink.betterrecipes.com/tearecipes.html
Scraping URL:  http://www.checkdepot.net/
Scraping success = True. Url:  http://www.checkdepot.net/
Scraping URL:  http://www.flowershow.com/
Scraping success = True. Url:  http://www.flowershow.com/
Scraping URL:  http://www.picnicfood-ideas.com/
Scraping success = True. Url:  http://www.picnicfood-ideas.com/
Scraping URL:  http://www.mediterrasian.com/
Scraping success = True. Url:  http://www.mediterrasian.com/
Scraping URL:  http://www.e-rcps.com/pasta/rcp/q_e.shtml
Scraping success = True. Url:  http://www.e-rcps.com/pasta/rcp/q_e.shtml
Scraping URL:  http://www.mildensteins.com/
Scraping success = False.Url:  http://www.mildensteins.com/
HTTPSConne

Scraping success = True. Url:  http://www.mountainx.com/
Scraping URL:  http://www.jacksonsun.com/
Scraping success = True. Url:  http://www.jacksonsun.com/
Scraping URL:  http://www.njherald.com/
Scraping success = True. Url:  http://www.njherald.com/
Scraping URL:  http://www.thehoya.com/
Scraping success = True. Url:  http://www.thehoya.com/
Scraping URL:  http://viewnews.com.au/
Scraping success = True. Url:  http://viewnews.com.au/
Scraping URL:  http://www.martlet.ca/
Scraping success = True. Url:  http://www.martlet.ca/
Scraping URL:  http://www.markovits.com/journalism/jschools.shtml
Scraping success = True. Url:  http://www.markovits.com/journalism/jschools.shtml
Scraping URL:  http://www.thesmokymountaintimes.com/
Scraping success = True. Url:  http://www.thesmokymountaintimes.com/
Scraping URL:  http://www.almanian.org/
Scraping success = False.Url:  http://www.almanian.org/
HTTPConnectionPool(host='www.almanian.org', port=80): Max retries exceeded with url: / (Caused by New

Scraping success = True. Url:  http://greenmfg.me.berkeley.edu/
Scraping URL:  http://www.uwex.edu/disted/
Scraping success = False.Url:  http://www.uwex.edu/disted/
HTTPSConnectionPool(host='www.uwex.edu', port=443): Max retries exceeded with url: /disted/ (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.bandaschool.com/
Scraping success = False.Url:  http://www.bandaschool.com/
HTTPSConnectionPool(host='www.bandaschool.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.biu.ac.il/
Scraping success = True. Url:  http://www.biu.ac.il/
Scraping URL:  http://www.henley.ac.uk/school/real-estate-and-planning/
Scraping success = True. Url:  http://www.henley.ac.uk/school/real-estate-and-planning/
Scraping URL:  htt

Scraping success = True. Url:  http://www.zeuslearning.com/
Scraping URL:  http://www.northeaststate.edu/
Scraping success = True. Url:  http://www.northeaststate.edu/
Scraping URL:  http://www.theatre.msu.edu/
Scraping success = True. Url:  http://www.theatre.msu.edu/
Scraping URL:  http://www.centerforlearning.org/
Scraping success = True. Url:  http://www.centerforlearning.org/
Scraping URL:  http://www.tau.ac.il/law/josebrunner/
Scraping success = True. Url:  http://www.tau.ac.il/law/josebrunner/
Scraping URL:  http://www.uwcad.it/
Scraping success = True. Url:  http://www.uwcad.it/
Scraping URL:  http://www.econ.boun.edu.tr/
Scraping success = False.Url:  http://www.econ.boun.edu.tr/
HTTPSConnectionPool(host='econ.boun.edu.tr', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.capitalcc.edu/
Scraping success = True. Url:  h

Scraping success = True. Url:  http://www.fs.fed.us/database/feis/plants/shrub/salgey/all.html
Scraping URL:  http://northjerseypaleoworld.yuku.com/
Scraping success = False.Url:  http://northjerseypaleoworld.yuku.com/
'NoneType' object has no attribute 'contents'


Scraping URL:  http://www.artifacts.com/
Scraping success = True. Url:  http://www.artifacts.com/
Scraping URL:  http://www.invasivespeciesinfo.gov/
Scraping success = True. Url:  http://www.invasivespeciesinfo.gov/
Scraping URL:  http://en.wikipedia.org/wiki/Cucujidae
Scraping success = True. Url:  http://en.wikipedia.org/wiki/Cucujidae
Scraping URL:  http://www.archaeology.ca/
Scraping success = True. Url:  http://www.archaeology.ca/
Scraping URL:  http://en.wikipedia.org/wiki/Erysiphales
Scraping success = True. Url:  http://en.wikipedia.org/wiki/Erysiphales
Scraping URL:  http://www.tsowell.com/
Scraping success = True. Url:  http://www.tsowell.com/
Scraping URL:  http://www.csuohio.edu/urban/
Scraping success = True. U

Scraping success = True. Url:  http://www.arizonaskiesmeteorites.com/
Scraping URL:  http://curator.jsc.nasa.gov/
Scraping success = True. Url:  http://curator.jsc.nasa.gov/
Scraping URL:  http://cepaosreview.tripod.com/
Scraping success = True. Url:  http://cepaosreview.tripod.com/
Scraping URL:  http://accelrys.com/
Scraping success = True. Url:  http://accelrys.com/
Scraping URL:  http://animaldiversity.org/site/accounts/information/Myrmecophagidae.html
Scraping success = True. Url:  http://animaldiversity.org/site/accounts/information/Myrmecophagidae.html
Scraping URL:  http://www.biodyne-world.com/
Scraping success = True. Url:  http://www.biodyne-world.com/
Scraping URL:  http://www.rricorp.com/
Scraping success = True. Url:  http://www.rricorp.com/
Scraping URL:  http://plants.usda.gov/core/profile?symbol=BIDI
Scraping success = True. Url:  http://plants.usda.gov/core/profile?symbol=BIDI
Scraping URL:  http://www.rook.org/heritage/euro/prehistoric.html
Scraping success = True. U

Scraping success = True. Url:  http://www.candlesetcetera.com/
Scraping URL:  http://www.denimndaisies.com/
Scraping success = True. Url:  http://www.denimndaisies.com/
Scraping URL:  http://www.plantijnmaps.com/
Scraping success = True. Url:  http://www.plantijnmaps.com/
Scraping URL:  http://stores.ebay.co.uk/Subculture-Wear
Scraping success = True. Url:  http://stores.ebay.co.uk/Subculture-Wear
Scraping URL:  http://www.spintastics.com/
Scraping success = True. Url:  http://www.spintastics.com/
Scraping URL:  http://www.dvd.net.au/
Scraping success = True. Url:  http://www.dvd.net.au/
Scraping URL:  http://www.nwwoodgallery.com/
Scraping success = True. Url:  http://www.nwwoodgallery.com/
Scraping URL:  http://www.paradisecoin.com/
Scraping success = True. Url:  http://www.paradisecoin.com/
Scraping URL:  http://www.garagecabinetsonline.com/
Scraping success = True. Url:  http://www.garagecabinetsonline.com/
Scraping URL:  http://www.cvlinens.com/
Scraping success = True. Url:  http

Scraping success = True. Url:  http://idjnow.com/
Scraping URL:  http://www.fairhavenbaptist.org/
Scraping success = True. Url:  http://www.fairhavenbaptist.org/
Scraping URL:  http://leonia.church/
Scraping success = False.Url:  http://leonia.church/
HTTPSConnectionPool(host='leonia.church', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.hungersolutions.org/
Scraping success = True. Url:  http://www.hungersolutions.org/
Scraping URL:  http://calvarychapelnorristown.org/
Scraping success = True. Url:  http://calvarychapelnorristown.org/
Scraping URL:  http://www.jm3.net/
Scraping success = True. Url:  http://www.jm3.net/
Scraping URL:  http://www.goatrance.de/goabase/
Scraping success = False.Url:  http://www.goatrance.de/goabase/
HTTPSConnectionPool(host='www.goatrance.de', port=443): Max retries exceeded with url: /goabase/

Scraping success = False.Url:  http://plato.stanford.edu/entries/feminism-femhist/
HTTPSConnectionPool(host='plato.stanford.edu', port=443): Max retries exceeded with url: /entries/feminism-femhist/ (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))


Scraping URL:  http://www.armeniangenocidedebate.com/
Scraping success = True. Url:  http://www.armeniangenocidedebate.com/
Scraping URL:  http://www.gourd.com/123A.HTML
Scraping success = True. Url:  http://www.gourd.com/123A.HTML
Scraping URL:  http://www.sunsetlodge915.com/
Scraping success = True. Url:  http://www.sunsetlodge915.com/
Scraping URL:  http://www.spfbc.com/
Scraping success = True. Url:  http://www.spfbc.com/
Scraping URL:  http://www.legalstaff.com/
Scraping success = True. Url:  http://www.legalstaff.com/
Scraping URL:  http://cbctexas.org/
Scraping success = True. Url:  http://cbctexas.org/
Scraping URL:  http://www.ectorbaptist.org/

In [38]:
scraped_websites_df = pd.DataFrame(pd.concat(series_list,axis=1,sort=False)).T

## Let's join the scraped table with the dmoz website dataframe

In [39]:
scraped_websites_df = websites_df_subset.merge(scraped_websites_df,left_on="Sites",right_on="website_url")

In [41]:
scraped_websites_df.to_pickle("./pkl_files/dmoz_subset_956_sites_scraped.pkl")

In [40]:
scraped_websites_df.columns

Index(['index', 'Description', 'Title', 'URL', 'Sites', 'CCat', 'MCat', 'SCat',
       'website_url', 'website_name',
       ...
       'tag_count_mark', 'tag_count_v:shape', 'tag_count_v:imagedata',
       'tag_count_w:wrap', 'tag_count_href', 'tag_count_audio',
       'tag_count_hardpoint', 'tag_count_nolayer', 'tag_count_ilayer',
       'tag_count_align'],
      dtype='object', length=253)

### This is the final dataframe for the website.