### Importing needed libraries 

In [None]:
import pandas as pd 
from pdb import set_trace
import requests
import os 
import json
from typing import Dict
from time import sleep
import random
import bs4
from typing import List
from datetime import datetime
#pip install bs4
#pip install dictionaries

## 1. Data collection

### 1.1. Get the list of places

In [3]:
# Collect the URLs of the places listed on the first 400 listing pages (sorted by likes count)

class urlDownloader():
    """Collect the name and URL of the places listed on Atlas Obscura.

    Listing pages ``1..number_of_pages`` of ``WEBSITE`` are fetched one by
    one, every place card on a page is reduced to a ``(title, link)`` pair,
    and the pairs are written to ``datas/urls.csv``.
    """

    WEBSITE = "https://www.atlasobscura.com/places?page={}&sort=likes_count"

    def __init__(self,number_of_pages:int=400) -> None:
        """Store how many listing pages ``download_urls`` should visit."""
        self.number_of_pages = number_of_pages

    def _retriveTitle(self,soup_tags:"bs4.element.Tag") -> str:
        """Return the text of the first ``<h3>`` in the card, or "" if absent."""
        heading = soup_tags.find("h3")
        # ``find`` returns None when nothing matches; the tag itself has to be
        # checked, because ``.text`` on None raises before any check can run.
        if heading is None:
            return ""
        return heading.text

    def _retrieveLink(self,soup_tags:"bs4.element.Tag") -> str:
        """Return the ``href`` of the first ``<a>`` in the card, or "" if absent."""
        anchor = soup_tags.find("a")
        if anchor is None:
            return ""
        # An anchor without an href yields "" rather than a KeyError.
        return anchor.attrs.get("href") or ""

    def saver_format(self,data:List[tuple],format:str=None) -> pd.DataFrame:
        """Build the NAME/URL table and optionally persist it.

        Args:
            data: ``(name, relative_link)`` pairs as collected from the cards.
            format: ``"csv"`` writes ``datas/urls.csv``, ``"txt"`` writes
                ``datas/urls.txt`` (one ``NAME URL`` line per place); any
                other value writes nothing. Defaults to None.

        Returns:
            The table with columns ``NAME`` and ``URL``, where ``URL`` has
            been prefixed with the site root.
        """
        frame = pd.DataFrame(data,columns=["NAME","URL"])
        frame["URL"] = "https://www.atlasobscura.com" + frame.URL

        if format in ("csv","txt"):
            # The output directory is not guaranteed to exist on a fresh run.
            os.makedirs("datas",exist_ok=True)

        if format == "csv":
            frame.to_csv("datas/urls.csv",index=False)
        elif format == "txt":
            # Explicit encoding: place names contain non-ASCII characters.
            with open("datas/urls.txt","w",encoding="utf-8") as f:
                f.writelines(
                    f"{name} {url}\n" for name,url in zip(frame.NAME,frame.URL)
                )
        return frame

    def download_urls(self) -> pd.DataFrame:
        """Scrape every listing page and save the result as CSV.

        A page whose request fails is reported and skipped, so one network
        error does not discard the pages that were already collected.

        Returns:
            The NAME/URL table that was written to ``datas/urls.csv``.
        """
        data_holder:List[tuple] = []

        for page_number in range(1,self.number_of_pages+1):
            sleep(random.randint(0,2))  #to prevent blocking, because of many requests.
            try:
                response = requests.get(
                    urlDownloader.WEBSITE.format(page_number),timeout=30
                )
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"ERROR page {page_number}: {e}")
                continue

            soup = bs4.BeautifulSoup(response.content,features="lxml")
            # Each place card on the listing page sits in one grid cell.
            roi_places = soup.find_all("div", {"class": "col-md-4 col-sm-6 col-xs-12"})
            for place in roi_places:
                data_holder.append(
                    (self._retriveTitle(place),self._retrieveLink(place))
                )
        return self.saver_format(data=data_holder,format="csv")

            
# Script entry point: collect the place URLs from the first 400 listing pages.
if __name__ == "__main__":
    downloader = urlDownloader(number_of_pages=400)
    downloader.download_urls()

### 1.2. Crawl places

In [None]:
class htmlDownloader():
    """Download the HTML page of every place listed in the URL file.

    Pages are stored as ``datas/htmls/<NAME>.html``. Progress is kept in
    ``config.json`` under the key ``last_downloaded`` so that a later run
    continues from that row offset instead of starting over.
    """

    def __init__(self,file_path:str="datas/urls.csv") -> None:
        """Load the NAME/URL table from ``file_path``."""
        self.urls = self._url_reader(file_path)

    def _url_reader(self,path:str,format_file="csv") -> pd.DataFrame :
        """Read the URL table; ``format_file="txt"`` reads it space-separated."""
        if format_file =="txt" :
            return pd.read_csv(path,sep=" ")
        return pd.read_csv(path)

    @staticmethod
    def _safe_file_name(name:str) -> str:
        """Replace path separators in ``name`` so it is a single file name."""
        return str(name).replace("/","_").replace("\\","_")

    def download_htmls(self) -> None :
        """Fetch and store every page from the saved offset onwards.

        ``last_downloaded`` is incremented once per stored page and is used
        as the ``iloc`` start offset on the next run. A failing row is
        reported and the loop moves on to the next one.
        NOTE(review): because the counter counts successes, a row that fails
        in the middle of a run is not specifically retried on resume.
        """
        config = self.last_download_control()
        # The target directory is not guaranteed to exist on a fresh run.
        os.makedirs("datas/htmls",exist_ok=True)
        roi_urls = self.urls.iloc[config["last_downloaded"]:]
        try:
            for idx,row in roi_urls.iterrows():
                try:
                    print(f"Downloading {row.NAME}")
                    sleep(random.randint(0,2))
                    htmls = requests.get(row.URL,timeout=30).content
                    # A "/" in the place name would otherwise be read as a
                    # sub-directory and the write would fail.
                    saving_path = f"datas/htmls/{self._safe_file_name(row.NAME)}.html"
                    with open(saving_path,"wb") as f:
                        f.write(htmls)
                    config["last_downloaded"] += 1

                    if idx % 10 == 0 :
                        self.save_config_file(config)
                except Exception as e:
                    print(f"ERROR {e}")
                    self.save_config_file(config=config)
        finally:
            # Flush the checkpoint when the loop ends or is interrupted, so
            # progress made since the last periodic save is not lost.
            self.save_config_file(config)

    def last_download_control(self) -> Dict[str,int]:
        """Return the saved progress, creating ``config.json`` if missing."""
        if os.path.isfile("config.json"):
            with open("config.json","rb") as f :
                config = json.load(f)
        else:
            config = {"last_downloaded":0}
            with open("config.json","w") as f:
                json.dump(config,f)
        return config

    def save_config_file(self,config:Dict[str,int]):
        """Overwrite ``config.json`` with ``config``."""
        with open("config.json","w") as f:
            json.dump(config,f)

    
# Script entry point: download the HTML page of every URL in datas/urls.csv.
if __name__ == "__main__":
    downloader = htmlDownloader()
    downloader.download_htmls()

Downloading Mount Moriah Cemetery
Downloading New Orleans Historic Voodoo Museum
Downloading The Neon Museum
Downloading Museum of Vampires 
Downloading Wisteria Tunnel
Downloading Explorers Club Headquarters
Downloading Mapparium
Downloading House of Eternal Return
Downloading Sagano Bamboo Forest
Downloading The Viktor Wynd Museum of Curiosities, Fine Art & Natural History
Downloading 'Akhob'
Downloading Paint Mines Interpretive Park
Downloading Puzzlewood
Downloading Hamilton Pool
Downloading Abandoned Jazzland
Downloading Fairy Pools
Downloading 5 Beekman Street
Downloading Havasupai Falls
Downloading Rakotzbrücke Devil's Bridge
Downloading Lexington Candy Shop 
Downloading Horseshoe Bend
Downloading The Hardy Tree
Downloading Electric Ladyland: The Museum of Fluorescent Art
Downloading Cat Island
Downloading Sunken City 
Downloading Alcatraz Island
Downloading Père Lachaise Cemetery
Downloading Yayoi Kusama Firefly Infinity Mirror Room
Downloading The Witch House of Salem
Download

### 1.3 Parse downloaded pages


In [None]:
class htmlParser():
    """Parse the downloaded place pages into one TSV file per place.

    Instantiating the class lists ``datas/htmls/``, loads ``datas/urls.csv``
    and immediately processes every page (see ``process_pages``). The fields
    extracted from the page currently being processed are kept as instance
    attributes (``placeName``, ``placeTags``, ...).
    """

    HTML_FILES = "datas/htmls/"

    # Characters that would break the one-row, tab-separated layout.
    _TSV_UNSAFE = str.maketrans({"\t":" ","\n":" ","\r":" "})

    def __init__(self) -> None:
        # Sorted so that the place_{idx} numbering is the same on every run;
        # os.listdir returns entries in arbitrary order.
        self.html_files = sorted(os.listdir(htmlParser.HTML_FILES))
        self.dataframe = pd.read_csv("datas/urls.csv")
        self.dataframe.NAME = self.dataframe.NAME.str.strip()
        self.process_pages()

    def read_file(self,file_name:str) -> "bs4.BeautifulSoup":
        """Load ``file_name`` from ``HTML_FILES`` and parse it."""
        file_path = htmlParser.HTML_FILES + file_name
        # The pages were saved as raw bytes; hand the bytes to BeautifulSoup
        # so decoding does not depend on the platform's default encoding.
        with open(file_path,"rb") as f:
            html = f.read()
        return bs4.BeautifulSoup(html,"html.parser")

    def extract_header_information(self,soup:"bs4.BeautifulSoup"):
        """Set placeName, numPeopleVisited, numPeopleWant and placeShortDesc."""
        roi_header = soup.find_all("div", {"class": "DDPage__header-container grid-row"})[0]
        self.placeName = roi_header.find("h1", {"class":"DDPage__header-title"}).text.strip()

        # The first counter is stored as "visited", the second as "want".
        counters = roi_header.find_all("div",{"class":"title-md item-action-count"})
        self.numPeopleVisited = int(counters[0].text.strip())
        self.numPeopleWant = int(counters[1].text.strip())
        self.placeShortDesc = roi_header.find("h3", {"class":"DDPage__header-dek"}).text.strip()

    def extract_descriptions(self,soup:"bs4.BeautifulSoup"):
        """Set placeDesc to the text of all paragraphs of the main body."""
        body = soup.find_all("div", {"class": "DDP__body-copy"})[0]
        # Joined with a space so the last word of one paragraph is not fused
        # with the first word of the next.
        self.placeDesc = " ".join(p.text.strip() for p in body.find_all("p"))

    def extract_sidebar(self,soup:"bs4.BeautifulSoup"):
        """Set placeAlt, placeLong, placeNearby and placeAddress."""
        sidebar = soup.find("div", {"class":"DDPageSiderail"})
        nearby_places = sidebar.find_all("div",{"class":"DDPageSiderailRecirc__item-title"})
        positions = sidebar.find(
            "div",{"class":"DDPageSiderail__coordinates js-copy-coordinates"}
        ).attrs["data-coordinates"].split(",")

        # The first coordinate goes to placeAlt and the second to placeLong.
        # NOTE(review): presumably "latitude, longitude" - confirm on a page.
        self.placeAlt = float(positions[0].strip())
        self.placeLong = float(positions[1].strip())
        self.placeNearby = [place.text for place in nearby_places]
        self.placeAddress = sidebar.find(
            "address",{"class":"DDPageSiderail__address"}
        ).find("div",recursive=False).get_text(" ").replace("\n","")

    def extract_footer(self,soup:"bs4.BeautifulSoup"):
        """Set placeEditors, placePubDate and placeTags."""
        footer = soup.find("div", {"id":"ugc-module"})
        contributors = footer.find_all("a",{"class":"DDPContributorsList__contributor"})
        pub_date_text = footer.find("div",{"class":"DDPContributor__name"}).text

        # Prefer the inner <span> of a contributor link when there is one.
        self.placeEditors = [
            link.find("span").text if link.find("span") else link.text
            for link in contributors
        ]
        self.placePubDate = datetime.strptime(pub_date_text,"%B %d, %Y")
        self.placeTags = [
            item.get_text("").replace("\n","")
            for item in soup.find_all("a",{"class":"itemTags__link js-item-tags-link"})
        ]

    @staticmethod
    def _card_titles(section) -> list:
        """Return the stripped <h3><span> titles of a card grid ([] if None)."""
        if section is None:
            return []
        titles = []
        for heading in section.find_all("h3"):
            label = heading.find("span")
            if label is not None:
                titles.append(label.text.strip())
        return titles

    def extract_related_places(self,soup:"bs4.BeautifulSoup"):
        """Set placeRelatedLists and placeRelatedPlaces.

        A page without one of the two sections gets an empty list for it
        instead of being rejected as a whole.
        """
        grid_class = "card-grid CardRecircSection__card-grid js-inject-gtm-data-in-child-links"
        related_list = soup.find("div",{
            "class":grid_class,
            "data-gtm-template":"DDP Footer Recirc Lists"
        })
        related_places = soup.find("div",{
            "class":grid_class,
            "data-gtm-template":"DDP Footer Recirc Related"
        })
        self.placeRelatedLists = self._card_titles(related_list)
        self.placeRelatedPlaces = self._card_titles(related_places)

    def process_pages(self):
        """Parse every downloaded page and write its TSV file.

        A page that cannot be parsed is reported together with its file name
        and skipped; the remaining pages are still processed.
        """
        for idx,file_name in enumerate(self.html_files):
            try:
                soup = self.read_file(file_name=file_name)
                self.extract_header_information(soup=soup)
                self.extract_descriptions(soup=soup)
                self.extract_sidebar(soup=soup)
                self.extract_footer(soup=soup)
                self.extract_related_places(soup=soup)
                # The URL comes from the listing table, matched on the name.
                self.placeUrl = self.dataframe.loc[self.dataframe.NAME == self.placeName]["URL"]
                if self.placeUrl.empty:
                    raise LookupError(f"no URL listed for {self.placeName!r}")
                self.save_tsv_file(idx)
            except Exception as e :
                # Report and continue; dropping into the debugger here would
                # block an unattended run on the first malformed page.
                print(f"ERROR {file_name}: {e}")

    def save_tsv_file(self,idx:int):
        """Write the current place as one tab-separated row.

        The file is ``datas/tsv_files/place_{idx}.tsv``. Tabs and line breaks
        inside a field are replaced by spaces so the row keeps exactly one
        column per field.
        """
        whole_data = [
            self.placeName,str(self.placeTags),str(self.numPeopleVisited),
            str(self.numPeopleWant),self.placeDesc,self.placeShortDesc,
            str(self.placeNearby),self.placeAddress,str(self.placeAlt),
            str(self.placeLong),str(self.placeEditors),self.placePubDate.strftime("%Y-%m-%d"),
            str(self.placeRelatedLists),
            str(self.placeRelatedPlaces),self.placeUrl.values[0]]
        row = "\t".join(field.translate(htmlParser._TSV_UNSAFE) for field in whole_data)

        os.makedirs("datas/tsv_files",exist_ok=True)
        with open(f"datas/tsv_files/place_{idx}.tsv","w",encoding="utf-8") as f:
            f.write(row)
            


# Script entry point: constructing htmlParser parses every downloaded page,
# because its __init__ calls process_pages().
if __name__ == "__main__":
    htmlParser()


## 2. Search Engine

### 2.1. Conjunctive query

###  2.1. Conjunctive query

### 2.1.1) Create your index!

### 2.1.2) Execute the query

### 2.2) Conjunctive query & Ranking score

### 2.2.1) Inverted index

### 2.2.2) Execute the query


## 3. Define a new score!

## 4. Visualizing the most relevant places

## 5. BONUS: More complex search engine


## 6. Command line question


## 7. Theoretical question