In [204]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
import os
import numpy as np
import pandas as pd
import math

In [205]:
processedfiles = []
adID = []
adloc = []
adproptype = []
adprice = []
proptypes = ["flat", "apartment", "penthouse", "house", "maisonette", "farmhouse", "villa", "field"]

In [206]:
# Set up progress bar
pbar = tqdm(total=480) 
# Looping through all 480 files
for filename in os.listdir("./data"):
    # Only consider .html files
    if filename.endswith(".html"):
    # Parse HTML into BeutifulSoup object
        html = BeautifulSoup(open("data/" + filename), "html.parser")
        # Get list of all <li> tags
        lis = html.find_all("li")
        for li in lis:
            # If this <li> tag is an ad listing, it will contain the string 'Property For Sale'
            if "Property For Sale" in li.text:
                # Convert all ad text to lowercase for consistency
                adtext = li.text.lower()

                # Extract 'name' attribute of each <li> tag. This will occasionally fail since
                # there exist other <li> tags without the 'name' attribute. In that case, 
                # I ignore the <li> tag and search for the next one. 
                # The 'name' attribute identifies each ad listing. Fortunately, if an 
                # previously parsed <li> tag reoccurs, the 'name' attribute can be used to 
                # recognize such repetition. This makes it easy to filter out overlapping data
                # later by calling drop_duplicates().
                try:
                    adID.append(li['name'])
                except:
                    continue

                # Search for municipality. In the <li> tag's raw text, 
                # these town names occur in a new line. Therefore, I 
                # use regex to match \n first. Any following characters
                # except \n are matched. Positive lookahead asserts that 
                # the characters are followed by a fullstop. First capturing
                # groups longer than 30 characters will be ignored
                loc = re.search("\n(.+?(?=\.))\.", adtext)
                if loc is None or len(loc.group(1)) > 30:
                    adloc.append("na")
                else:
                    adloc.append(loc.group(1))

                # Search for property types matching a list of 8 predefined
                # strings. If no matches are found in the ad listing, then 
                # nan is appended to the adproptype list. It is very important
                # to always append something to the list (matched string or nan)
                # otherwise the elements in the 4 lists will not correspond to
                # the same <li> tag, and drop_duplicates() will not be effective.
                match = False
                for string in proptypes:
                    if string in adtext:
                        adproptype.append(string)
                        match = True
                        break    
                if match is False:
                    adproptype.append("na")

                # Search for property price. The price consistently occurs
                # after the property description sentence(s). Therefore, I 
                # exploit this by matching any characters after a fullstop
                # which are followed by any number of digits, a comma, any
                # number of digits, and finally another fullstop.
                pri = re.search("\.(.*\d+\,\d+)\.", adtext)
                if pri is None:
                    adprice.append("na")
                elif pri.group(1) is None:
                    adprice.append("na")
                else:
                    pri = re.search("([\d+|\,]+$)", pri.group(1))
                    adprice.append(int(pri.group(1).replace(",", "")))

        # Data has been collected for this HTML file. Now I store it 
        # as a pandas dataframe in a list for all 280 processed files
        processedfiles.append(pd.DataFrame({'id': adID,
                'price': adprice,
                'location': adloc,
                'type': adproptype}))
        adID = []
        adprice = []
        adloc = []
        adproptype = []
    
    pbar.update(1)
pbar.close()

  4%|▍         | 21/480 [00:06<02:17,  3.34it/s]
100%|██████████| 480/480 [01:21<00:00,  5.90it/s]


In [231]:
# Ensure all HTML files have been processed. 
# The list length should be 240, the number of HTML files in /data.
len(processedfiles)

240

In [232]:
# Concatenate the list of dataframes into one large dataframe.
data = pd.concat(processedfiles)
data

Unnamed: 0,id,price,location,type
0,1101104,na,100% upmarket properties,na
1,1101105,500000,"bungalow converted, with land",na
2,1101106,105000,"gozo, marsalforn",apartment
3,1101107,220000,"gozo, xewkija",na
4,1101108,260000,guardamangia / pietà,flat
...,...,...,...,...
725,1169458,na,na,na
726,1169459,875000,buskett,na
727,1169460,150000,pietà,na
728,1169461,na,"st julians, just off spinola",apartment


In [233]:
# Get rid of any duplicate records
data = data.drop_duplicates(subset=['id'])
data

Unnamed: 0,id,price,location,type
0,1101104,na,100% upmarket properties,na
1,1101105,500000,"bungalow converted, with land",na
2,1101106,105000,"gozo, marsalforn",apartment
3,1101107,220000,"gozo, xewkija",na
4,1101108,260000,guardamangia / pietà,flat
...,...,...,...,...
725,1169458,na,na,na
726,1169459,875000,buskett,na
727,1169460,150000,pietà,na
728,1169461,na,"st julians, just off spinola",apartment


In [234]:
# Function for snapping strings containing location keywords to slightly more generalized 
# locations. If no keyword is matched, location is set to nan

pbar = tqdm(total=1) 
def snaptolocation(df):
    templist = []    
    for i in range(len(df.location)):    

        cmp = str(df.location.iloc[i]) # Convert iterable to comparable string
            
        if "attard" in cmp:
            templist.append("attard")
        elif "bahar" in cmp or "baħar" in cmp:
            templist.append("baħar iċ-ċagħaq")
        elif "baħrija" in cmp:
            templist.append("baħrija")
        elif "balzvalan" in cmp:
            templist.append("balzan")
        elif "birgu" in cmp or "vittoriosa" in cmp:
            templist.append("birgu")
        elif "birkirkara" in cmp:
            templist.append("birkirkara")
        elif "birzebuġġia" in cmp or "birżebbuġa" in cmp or "qajjenza" in cmp:
            templist.append("birżebbuġa")
        elif "buġibba" in cmp:
            templist.append("buġibba")
        elif "cospicua" in cmp or "bormla" in cmp:
            templist.append("bormla")
        elif "gozo" in cmp:
            templist.append("gozo")
        elif "guardamangia" in cmp or "pieta" in cmp or "pietà" in cmp:
            templist.append("pietà")
        elif "għarghur" in cmp:
            templist.append("għarghur")
        elif "gżira" in cmp or "gzira" in cmp:
            templist.append("gżira")
        elif "hamrun" in cmp:
            templist.append("hamrun")
        elif "ibrvalag" in cmp or "ibraġ" in cmp:
            templist.append("ibraġ")
        elif "isla" in cmp or "senglea" in cmp:
            templist.append("isla")
        elif "lija" in cmp:
            templist.append("lija")
        elif "manikata" in cmp or "mellieha" in cmp or "mellieħa" in cmp:
            templist.append("mellieħa")
        elif "mosta" in cmp:
            templist.append("mostacmp")
        elif "msida" in cmp:
            templist.append("msida")
        elif "naxxar" in cmp:
            templist.append("naxxar")
        elif "qawra" in cmp:
            templist.append("qawra")
        elif "qormi" in cmp:
            templist.append("qormi")
        elif "sliema" in cmp:
            templist.append("sliema")
        elif "st julians" in cmp or "saint julians" in cmp:
            templist.append("st julians")
        elif "st lucia" in cmp or "santa lucia" in cmp:
            templist.append("st lucia")
        elif "st paul" in cmp or "saint paul" in cmp:
            templist.append("st paul's bay")
        elif "venera" in cmp:
            templist.append("st venera")
        elif "swatar" in cmp:
            templist.append("swatar")
        elif "xbievalx" in cmp:
            templist.append("ta' xbiex")
        elif "valletta" in cmp:
            templist.append("valletta")
        elif "xemxija" in cmp:
            templist.append("xemxija")
        elif "żabbar" in cmp or "zabbar" in cmp:
            templist.append("żabbar")
        elif "żebbug" in cmp:
            templist.append("żebbug")
        else:
            templist.append("na")
        pbar.update(1)
    pbar.close()
    df.insert(3, "newlocs", templist)
    return df

  0%|          | 0/1 [00:00<?, ?it/s]

In [235]:
data = snaptolocation(data)

109132it [00:05, 18972.27it/s]               


In [236]:
len(data.location)

109132

In [237]:
del data['location']
data.rename(columns = {"newlocs": "location"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [238]:
data

Unnamed: 0,id,price,location,type
0,1101104,na,na,na
1,1101105,500000,na,na
2,1101106,105000,gozo,apartment
3,1101107,220000,gozo,na
4,1101108,260000,pietà,flat
...,...,...,...,...
725,1169458,na,na,na
726,1169459,875000,na,na
727,1169460,150000,pietà,na
728,1169461,na,st julians,apartment


In [243]:
count = 0
for loc in data.location:
    if str(loc) == "isla":
        count += 1
print(count)

582


In [None]:
# CODE FROM FIRST DATA COLLECTION ATTEMPT - NOT FINAL SOLUTION

scrape = [] # List to store all ads (contains duplicates)
historybuffer = []

# Set up progress bar
pbar = tqdm(total=480) 
count = 0
# Looping through all 480 files
for filename in os.listdir("./data"):
    # Only consider .html files
    if filename.endswith(".html"):
        # Parse HTML into BeutifulSoup object
        html = BeautifulSoup(open("data/" + filename), "html.parser")
        # Get list of all <h2> tags matching the class name
        h2s = html.find_all("h2", class_ = "classified_date default_top_margin")
    
        # Website layout varies through time. Some files give an empty h2s list.
        # Therefore, I cater for both eventualities, starting with the newer layout.
        if len(h2s) == 0:
            # Get list of all <li> tags
            lis = html.find_all("li")
            for li in lis: # Append all ad text to scrape list
                adtext = li.text.replace("\n", "")
                if adtext not in historybuffer:
                    scrape.append(adtext)
                    historybuffer.append(adtext)
                    
                    
        # Catering for the older website style
        else:
            # Get list of all <ul> tags
            uls = html.find_all("ul", class_ = "classified_list")
            for ul in uls: 
                # Get list of all <li> tags
                lis = ul.find_all("li")
                for li in lis: # Append all ad text to scrape list
                    adtext = li.text.replace("\n", "")
                    if adtext not in historybuffer:
                        scrape.append(adtext)
                        historybuffer.append(adtext)
                    
    pbar.update(1)
pbar.close()