In [28]:
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
import re
import numpy as np
import pandas as pd

In [29]:
proptypes = ["flat", "apartment", "penthouse", "house", "maisonette", "farmhouse", "villa", "field"]
adID = []
adloc = []
adprice = []
adproptype = []
processedfiles = []

In [None]:
# Set up progress bar
pbar = tqdm(total=480) 
count = 0
# Looping through all 480 files
for filename in os.listdir("./data"):
    # Only consider .html files
    if filename.endswith(".html"):
    # Parse HTML into BeutifulSoup object
        html = BeautifulSoup(open("data/20150423_prop_for_sale_tom_classifieds.html"), "html.parser")
        # Get list of all <li> tags
        lis = html.find_all("li")
        for li in lis:
            # If this <li> tag is an ad listing, it will contain the string 'Property For Sale'
            if "Property For Sale" in li.text:
                # Convert all ad text to lowercase for consistency
                adtext = li.text.lower()

                # Extract 'name' attribute of each <li> tag. This will occasionally fail since
                # there exist other <li> tags without the 'name' attribute. In that case, 
                # I ignore the <li> tag and search for the next one. 
                # The 'name' attribute identifies each ad listing. Fortunately, if an 
                # previously parsed <li> tag reoccurs, the 'name' attribute can be used to 
                # recognize such repetition. This makes it easy to filter out overlapping data
                # later by calling drop_duplicates().
                try:
                    adID.append(li['name'])
                except:
                    continue

                # Search for municipality. In the <li> tag's raw text, 
                # these town names occur in a new line. Therefore, I 
                # use regex to match \n first. Any following characters
                # except \n are matched. Positive lookahead asserts that 
                # the characters are followed by a fullstop. First capturing
                # groups longer than 30 characters will be ignored
                loc = re.search("\n(.+?(?=\.))\.", adtext)
                if loc is None or len(loc.group(1)) > 30:
                    adloc.append(np.nan)
                else:
                    adloc.append(loc.group(1))

                # Search for property types matching a list of 8 predefined
                # strings. If no matches are found in the ad listing, then 
                # nan is appended to the adproptype list. It is very important
                # to always append something to the list (matched string or nan)
                # otherwise the elements in the 4 lists will not correspond to
                # the same <li> tag, and drop_duplicates() will not be effective.
                match = False
                for string in proptypes:
                    if string in adtext:
                        adproptype.append(string)
                        match = True
                        break    
                if match is False:
                    adproptype.append(np.nan)

                # Search for property price. The price consistently occurs
                # after the property description sentence(s). Therefore, I 
                # exploit this by matching any characters after a fullstop
                # which are followed by any number of digits, a comma, any
                # number of digits, and finally another fullstop.
                pri = re.search("\.(.*\d+\,\d+)\.", adtext)
                if pri is None:
                    adprice.append(np.nan)
                elif pri.group(1) is None:
                    adprice.append(np.nan)
                else:
                    pri = re.search("([\d+|\,]+$)", pri.group(1))
                    adprice.append(int(pri.group(1).replace(",", "")))

    # Data has been collected for this HTML file. Now I store it 
    # as a pandas dataframe in a list for all 280 processed files
    processedfiles.append(pd.DataFrame({'id': adID,
            'price': adprice,
            'location': adloc,
            'type': adproptype}))
    
    pbar.update(1)
pbar.close()

 87%|████████▋ | 416/480 [05:14<00:55,  1.14it/s]

In [32]:
processedfiles

[             id     price           location        type
 0       1080953       NaN                NaN         NaN
 1       1080954  152000.0             balzan   apartment
 2       1080955  185000.0  gozo, għajnsielem       house
 3       1080956  293000.0                NaN       house
 4       1080957  145000.0                NaN  maisonette
 ...         ...       ...                ...         ...
 196576  1079753  120000.0              msida   penthouse
 196577  1079754       NaN           siġġiewi  maisonette
 196578  1079755  330000.0         st julians   apartment
 196579  1079756  375000.0         st julians       house
 196580  1079757  950000.0             żebbuġ       house
 
 [196581 rows x 4 columns]]

In [37]:
data = pd.concat(processedfiles)

In [38]:
data

Unnamed: 0,id,price,location,type
0,1080953,,,
1,1080954,152000.0,balzan,apartment
2,1080955,185000.0,"gozo, għajnsielem",house
3,1080956,293000.0,,house
4,1080957,145000.0,,maisonette
...,...,...,...,...
196576,1079753,120000.0,msida,penthouse
196577,1079754,,siġġiewi,maisonette
196578,1079755,330000.0,st julians,apartment
196579,1079756,375000.0,st julians,house


In [None]:
# CODE FROM FIRST DATA COLLECTION ATTEMPT - NOT FINAL SOLUTION

scrape = [] # List to store all ads (contains duplicates)
historybuffer = []

# Set up progress bar
pbar = tqdm(total=480) 
count = 0
# Looping through all 480 files
for filename in os.listdir("./data"):
    # Only consider .html files
    if filename.endswith(".html"):
        # Parse HTML into BeutifulSoup object
        html = BeautifulSoup(open("data/" + filename), "html.parser")
        # Get list of all <h2> tags matching the class name
        h2s = html.find_all("h2", class_ = "classified_date default_top_margin")
    
        # Website layout varies through time. Some files give an empty h2s list.
        # Therefore, I cater for both eventualities, starting with the newer layout.
        if len(h2s) == 0:
            # Get list of all <li> tags
            lis = html.find_all("li")
            for li in lis: # Append all ad text to scrape list
                adtext = li.text.replace("\n", "")
                if adtext not in historybuffer:
                    scrape.append(adtext)
                    historybuffer.append(adtext)
                    
                    
        # Catering for the older website style
        else:
            # Get list of all <ul> tags
            uls = html.find_all("ul", class_ = "classified_list")
            for ul in uls: 
                # Get list of all <li> tags
                lis = ul.find_all("li")
                for li in lis: # Append all ad text to scrape list
                    adtext = li.text.replace("\n", "")
                    if adtext not in historybuffer:
                        scrape.append(adtext)
                        historybuffer.append(adtext)
                    
    pbar.update(1)
pbar.close()