# Step 1: Data

### Import libraries

In [120]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime 
import numpy as np
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import json
import collections
import itertools
import math
import heapq
import folium
from geopy import distance
#nltk.download('punkt')
#nltk.download('stopwords')
import library #library of functions

# Reload the local `library` module so that edits made to it are picked up without restarting the kernel
from importlib import reload
reload(library)

# ANSI escape sequences used when printing results: BOLD switches bold text on, END resets the formatting
BOLD = '\033[1m'
END = '\033[0m'

We only need some of the columns of Airbnb_Texas_Rentals.csv, so we load just those:

In [2]:
# Columns of the CSV that the rest of the notebook relies on
cols = [
    'average_rate_per_night',
    'bedrooms_count',
    'city',
    'date_of_listing',
    'description',
    'latitude',
    'longitude',
    'title',
    'url',
]
# Load the dataset, restricted to the columns listed above
airbnbrentals = pd.read_csv(
    'Airbnb_Texas_Rentals.csv',
    usecols=cols,
    sep=',',
    encoding='utf-8',
)

In [3]:
# Preview the first 3 rows of the dataset
airbnbrentals.head(3)

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,May 2016,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,November 2010,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...


# Step 2: Create documents

In this step we create one .tsv file per row of the dataset (`doc_i.tsv` for row `i`) to save all the information.

In [4]:
# Directory that holds one .tsv document per row of the dataset.
# exist_ok=True makes the call a no-op when the directory is already there,
# which replaces the separate (and racy) exists() check.
directory = "documents"
os.makedirs(directory, exist_ok=True)

# Number of rows in the dataset, which is also the number of documents written
nRowsOrFiles = airbnbrentals.shape[0]
for i in range(0, nRowsOrFiles):
    # Write row i (header line + one data line) to <directory>/doc_<i>.tsv.
    # The row is selected by position so it always matches the positional loop counter,
    # and the path is built from `directory` instead of repeating the folder name.
    airbnbrentals.iloc[[i]].to_csv(os.path.join(directory, 'doc_' + str(i) + '.tsv'), sep='\t', encoding='utf-8', index=False)

In [5]:
# Load one of the generated documents back to check its contents
pd.read_csv(
    'documents/doc_2.tsv',
    encoding='utf-8',
    sep='\t',
)

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...


# Step 3: Search Engine

As a first common step, we have to preprocess the documents by:

- Removing stopwords
- Removing punctuation
- Stemming
- Anything else you think it's needed

In [6]:
# Directory for the preprocessed copies of the documents.
# exist_ok=True makes the call a no-op when the directory is already there,
# which replaces the separate (and racy) exists() check.
directory = "documentsCleaned"
os.makedirs(directory, exist_ok=True)
# Preprocess all documents; per the notebook's description this removes punctuation
# and stop words and applies stemming
library.modifyDocs(nRowsOrFiles)

Test if it works reading one document:

In [7]:
# Load one cleaned document to check the preprocessing result
pd.read_csv(
    'documentsCleaned/doc_2.tsv',
    encoding='utf-8',
    sep='\t',
)

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$59,1,Houston,January 2017,river hous island close citi nA well maintain ...,29.829352,-95.081549,river hous near citi,https://www.airbnb.com/rooms/16926307?location...


## 3.1) Conjunctive query

At this moment, we are taking into account the description and title of each document. It means that the first Search Engine will evaluate queries with respect to the aforementioned information.

### 3.1.1) Create your index!

Create an inverted index of this format:

{
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
...}

Here we start by creating a dictionary, `invertedIndex`, to store all the values.
Then we loop through all the documents, adding each one to the inverted index with the `invertedIndexAdd` function.

In [8]:
# Empty index, filled document by document below
invertedIndex = {}

# Feed the description and then the title of every cleaned document into the index
for i in range(nRowsOrFiles):
    docName = 'doc_' + str(i)
    doc = pd.read_csv('documentsCleaned/' + docName + '.tsv', sep='\t', encoding='utf-8')
    for column in ('description', 'title'):
        invertedIndex = library.invertedIndexAdd(invertedIndex, docName, doc, column)

Let's see the inverted index created:

In [9]:
# For a few sample words, show up to 20 postings of the cleaned term,
# with an empty line between consecutive words
sampleWords = ['garden', 'expensive', 'beautiful']
for position, word in enumerate(sampleWords):
    if position > 0:
        print("")
    print(BOLD + word + END, "=>", *invertedIndex[library.cleanString(word)][0:20])

[1mgarden[0m => doc_1 doc_3 doc_63 doc_77 doc_111 doc_120 doc_127 doc_184 doc_184 doc_217 doc_217 doc_220 doc_300 doc_300 doc_314 doc_371 doc_396 doc_456 doc_496 doc_496

[1mexpensive[0m => doc_3795 doc_3862 doc_3960 doc_4443 doc_4588 doc_6960 doc_7314 doc_7829 doc_8664 doc_8809 doc_10897 doc_12725 doc_12811 doc_13508 doc_13546 doc_15986 doc_16850 doc_17089 doc_17238

[1mbeautiful[0m => doc_1 doc_7 doc_11 doc_13 doc_14 doc_21 doc_30 doc_34 doc_35 doc_51 doc_65 doc_68 doc_76 doc_98 doc_107 doc_108 doc_109 doc_117 doc_136 doc_138


Writing invertedIndex to inverted_index.txt

In [10]:
# Persist the inverted index as JSON. The context manager closes (and therefore flushes)
# the file as soon as the dump finishes, so the next cell reads a complete file.
with open("inverted_index.txt", 'w') as indexFile:
    json.dump(invertedIndex, indexFile)

Reading inverted_index.txt as dictionary

In [11]:
# Read the inverted index back from its JSON file; the context manager closes the
# file handle instead of leaving it open for the rest of the session
with open("inverted_index.txt") as indexFile:
    invertedIndex = json.load(indexFile)

### 3.1.2) Execute the query

Given a query: *a beautiful house with garden and bedroom* (note: the same query with *beach* instead of *bedroom* found no results)

In [12]:
# Free-text query evaluated by the conjunctive search engine
query = "a beautiful house with garden and bedroom"

We search the documents that contain the query through our invertedIndex

In [13]:
# Run the search for `query` against the inverted index.
# The result is used below as a mapping doc_id -> number of matches (it is iterated with .items());
# the call also prints the original and the cleaned query, as seen in the cell output.
searchedResults = library.searchQueryConjunctive(invertedIndex, query)

[1mQuery intruduced: [0ma beautiful house with garden and bedroom
[1mCleaned query: [0mbeauti hous garden bedroom


We sort the documents by number of matches to see whether there are conjunctive results

In [14]:
# Rank the documents from most to fewest matches
sortedResults = sorted(searchedResults.items(), key=lambda pair: pair[1], reverse=True)
# Show the first 12 ranked entries
topEntries = sortedResults[0:12]
print(BOLD + 'Sorted Documents and Matches:' + END, *topEntries)
# Count the words that remain in the query after cleaning
cleanedQueryWords = library.cleanString(query).split()
numberOfQueryWords = len(cleanedQueryWords)
print(BOLD + 'Number of cleaned words:' + END, numberOfQueryWords)

[1mSorted Documents and Matches:[0m ('doc_14552', 4) ('doc_14254', 4) ('doc_2823', 4) ('doc_11603', 4) ('doc_5509', 4) ('doc_1', 4) ('doc_566', 4) ('doc_11331', 4) ('doc_6447', 4) ('doc_15035', 4) ('doc_6225', 4) ('doc_12312', 3)
[1mNumber of cleaned words:[0m 4


Check whether each document has as many matches as the query has words, and show only those documents

In [15]:
# Collect the matches for the documents that hit every query word
listMatches = library.listOfConjunctiveMatches(searchedResults, numberOfQueryWords)

# Report when nothing matched every word; otherwise show the matches as one table
if len(listMatches) == 0:
    print(BOLD + "NO CONJUCTIVE RESULTS FOUND" + END)
else:
    print(BOLD + "CONJUCTIVE RESULTS FOUND" + END)
    # Stack all matches into a single DataFrame
    big_frame = pd.concat(listMatches, ignore_index=True)
    # Keep only the columns to show, in display order
    shownColumns = ['title', 'description', 'city', 'url']
    df = big_frame.loc[:, shownColumns]
    # Render the result of the query
    display(df)

[1mCONJUCTIVE RESULTS FOUND[0m


Unnamed: 0,title,description,city,url
0,New!3BR Fort Worth House near AT&T Stadium w/ ...,Explore the City of Cowboys and Culture from t...,Fort Worth,https://www.airbnb.com/rooms/19193017?location...
1,Garden Bedroom,"Comfortable and private, the garden bedroom is...",Austin,https://www.airbnb.com/rooms/5096161?location=...
2,Unique Location! Alamo Heights - Designer Insp...,"Stylish, fully remodeled home in upscale NW – ...",San Antonio,https://www.airbnb.com/rooms/17481455?location...
3,Rooms at Rehoboth,Step back into yesterday in this old Victorian...,Temple,https://www.airbnb.com/rooms/18055961?location...
4,Beautiful queen bedroom in NW Austin,"My house is close to Lakeline Mall, highways a...",Austin,https://www.airbnb.com/rooms/16755710?location...
5,Unique Location! Alamo Heights - Designer Insp...,"Stylish, fully remodeled home in upscale NW – ...",San Antonio,https://www.airbnb.com/rooms/17481455?location...
6,Beautiful king bedroom in NW Austin,"My house is close to Lakeline Mall, highways a...",Austin,https://www.airbnb.com/rooms/15162593?location...
7,Rooms at Rehoboth,Step back into yesterday in this old Victorian...,Temple,https://www.airbnb.com/rooms/18055961?location...
8,New!3BR Fort Worth House near AT&T Stadium w/ ...,Explore the City of Cowboys and Culture from t...,Fort Worth,https://www.airbnb.com/rooms/19193017?location...
9,Unique Location! Alamo Heights - Designer Insp...,"Stylish, fully remodeled home in upscale NW – ...",San Antonio,https://www.airbnb.com/rooms/17481455?location...


## 3.2) Conjunctive query & Ranking score

In the new Search Engine, given a query, we want to get the top-k documents related to the query. In particular:

- Find all the documents that contains all the words in the query (as before...).
- Sort them by their similarity with the query
- Return in output k documents, or all the documents with non-zero similarity with the query when the results are less than k. You must use a heap data structure (you can use Python libraries) for maintaining the top-k documents.

To solve this task we are using *tfIdf score*, and the *Cosine similarity*.

### 3.2.1) Inverted index scored TF-IDF

{
term_id_1:[(document1, tfIdf_{term,document1}), (document2, tfIdf_{term,document2}), (document4, tfIdf_{term,document4}), ...],
term_id_2:[(document1, tfIdf_{term,document1}), (document3, tfIdf_{term,document3}), (document5, tfIdf_{term,document5}), (document6, tfIdf_{term,document6}), ...],
...}

In [16]:
# Empty scored index, filled document by document below
invertedIndexScored = {}

# Add the entries of every document to the scored index
for i in range(nRowsOrFiles):
    docName = 'doc_' + str(i)
    invertedIndexScored = library.invertedIndexScoredAdd(invertedIndexScored, docName, invertedIndex, nRowsOrFiles)

Let's see the inverted index scored created using tf-idf score

In [17]:
# For a few sample words, show up to 10 scored postings of the cleaned term,
# with an empty line between consecutive words
sampleWords = ['garden', 'expensive', 'beautiful']
for position, word in enumerate(sampleWords):
    if position > 0:
        print("")
    print(BOLD + word + END, "=>", *invertedIndexScored[library.cleanString(word)][0:10])

[1mgarden[0m => ('doc_1', 3.2670637278023134) ('doc_3', 3.2670637278023134) ('doc_63', 3.2670637278023134) ('doc_77', 3.2670637278023134) ('doc_111', 3.2670637278023134) ('doc_120', 3.2670637278023134) ('doc_127', 3.2670637278023134) ('doc_184', 6.534127455604627) ('doc_217', 6.534127455604627) ('doc_220', 3.2670637278023134)

[1mexpensive[0m => ('doc_3795', 6.8679744089702925) ('doc_3862', 6.8679744089702925) ('doc_3960', 6.8679744089702925) ('doc_4443', 6.8679744089702925) ('doc_4588', 6.8679744089702925) ('doc_6960', 6.8679744089702925) ('doc_7314', 6.8679744089702925) ('doc_7829', 6.8679744089702925) ('doc_8664', 6.8679744089702925) ('doc_8809', 6.8679744089702925)

[1mbeautiful[0m => ('doc_1', 1.6132240290586652) ('doc_7', 1.6132240290586652) ('doc_11', 1.6132240290586652) ('doc_13', 1.6132240290586652) ('doc_14', 1.6132240290586652) ('doc_21', 1.6132240290586652) ('doc_30', 1.6132240290586652) ('doc_34', 1.6132240290586652) ('doc_35', 1.6132240290586652) ('doc_51', 1.613224

Writing invertedIndexScored to inverted_index_scored.txt:

In [18]:
# Persist the scored inverted index as JSON. The context manager closes (and therefore
# flushes) the file as soon as the dump finishes instead of leaving the handle open.
with open("inverted_index_scored.txt", 'w') as scoredIndexFile:
    json.dump(invertedIndexScored, scoredIndexFile)

Reading inverted_index_scored.txt as dictionary

In [19]:
# Load invertedIndexScored from the txt file (left disabled)
# Note: JSON has no tuple type, so the (doc_id, tfIdf) tuples come back as [doc_id, tfIdf] lists
# invertedIndexScored = json.load(open("inverted_index_scored.txt"))

### 3.2.2) Execute the query cosine similarity

To know which are the most similar documents according to the query we are using scoring function "Cosine Similarity" with respect to the tfIdf representations of the documents.
Given a query entered by the user, for example: *a beautiful house with garden and bedroom*
The Search Engine return a list of documents, ranked by their Cosine Similarity with respect to the query entered in input.

This is the query for the search engine:

In [20]:
# Free-text query ranked by cosine similarity
queryCos = "a beautiful house with garden and bedroom"

This function finds the documents that match every word of the query (conjunctive matches) and saves their doc_id

In [21]:
# Collect the doc_ids of the conjunctive matches for `queryCos`; the call also prints the
# cleaned query, the number of query words and the number of matches (see the cell output)
conjunctiveDocId = library.getListOfConjunctiveDocIds(invertedIndex, queryCos)

[1mQuery intruduced: [0ma beautiful house with garden and bedroom
[1mCleaned query: [0mbeauti hous garden bedroom
[1mNumber of query words:[0m 4
[1mNumber of conjunctive matches:[0m 11


If there are conjunctive matches, the computeCosineSim function computes the cosine similarity only for those documents; otherwise it is computed for all the documents

In [22]:
# Cosine similarity per document; as the output shows, each entry is a (cos_sim, doc_id) tuple
cosSimResults = library.computeCosineSim(conjunctiveDocId, nRowsOrFiles, queryCos, invertedIndexScored)
# Show at most the first 20 entries
shownCosSim = cosSimResults[0:20]
print(BOLD + "Tuples 'doc_id' and 'cos_sim':" + END, *shownCosSim)

[1mTuples 'doc_id' and 'cos_sim':[0m (0.45691675698941003, 'doc_14552') (0.33258454004986665, 'doc_14254') (0.364744247752949, 'doc_2823') (0.4301237939159375, 'doc_11603') (0.4268023249509337, 'doc_5509') (0.364744247752949, 'doc_1') (0.4268023249509337, 'doc_566') (0.4301237939159375, 'doc_11331') (0.45691675698941003, 'doc_6447') (0.364744247752949, 'doc_15035') (0.6499198728162784, 'doc_6225')


Ordering results with heap algorithm

In [23]:
# Rank every result from highest to lowest cosine similarity using a heap;
# tuples with the same similarity are ordered by comparing their doc_id strings
resultCount = len(cosSimResults)
sortedCosSim = heapq.nlargest(resultCount, cosSimResults)
# Show at most the first 20 ranked entries
print(BOLD + "Ordered tuples 'doc_id' and 'cos_sim':" + END, *sortedCosSim[0:20])

[1mOrdered tuples 'doc_id' and 'cos_sim':[0m (0.6499198728162784, 'doc_6225') (0.45691675698941003, 'doc_6447') (0.45691675698941003, 'doc_14552') (0.4301237939159375, 'doc_11603') (0.4301237939159375, 'doc_11331') (0.4268023249509337, 'doc_566') (0.4268023249509337, 'doc_5509') (0.364744247752949, 'doc_2823') (0.364744247752949, 'doc_15035') (0.364744247752949, 'doc_1') (0.33258454004986665, 'doc_14254')


After that, we fetch the data of each document to build a dataframe and show the search results with the makeAndDisplayCosineSimilarityDataFrame function:

In [24]:
# Build and display the results table for the ranked documents; the rendered output
# has the columns title, description, city, url and similarity
library.makeAndDisplayCosineSimilarityDataFrame(sortedCosSim, conjunctiveDocId) 

[1mCONJUNCTIVE RESULTS[0m


Unnamed: 0,title,description,city,url,similarity
0,"Home Away from Home in Spring, TX",Beautiful 2100 sq. ft house! Three bedroom and...,Spring,https://www.airbnb.com/rooms/1521227?location=...,0.64992
1,New!3BR Fort Worth House near AT&T Stadium w/ ...,Explore the City of Cowboys and Culture from t...,Fort Worth,https://www.airbnb.com/rooms/19193017?location...,0.456917
2,New!3BR Fort Worth House near AT&T Stadium w/ ...,Explore the City of Cowboys and Culture from t...,Fort Worth,https://www.airbnb.com/rooms/19193017?location...,0.456917
3,Rooms at Rehoboth,Step back into yesterday in this old Victorian...,Temple,https://www.airbnb.com/rooms/18055961?location...,0.430124
4,Rooms at Rehoboth,Step back into yesterday in this old Victorian...,Temple,https://www.airbnb.com/rooms/18055961?location...,0.430124
5,Beautiful king bedroom in NW Austin,"My house is close to Lakeline Mall, highways a...",Austin,https://www.airbnb.com/rooms/15162593?location...,0.426802
6,Beautiful queen bedroom in NW Austin,"My house is close to Lakeline Mall, highways a...",Austin,https://www.airbnb.com/rooms/16755710?location...,0.426802
7,Unique Location! Alamo Heights - Designer Insp...,"Stylish, fully remodeled home in upscale NW – ...",San Antonio,https://www.airbnb.com/rooms/17481455?location...,0.364744
8,Unique Location! Alamo Heights - Designer Insp...,"Stylish, fully remodeled home in upscale NW – ...",San Antonio,https://www.airbnb.com/rooms/17481455?location...,0.364744
9,Unique Location! Alamo Heights - Designer Insp...,"Stylish, fully remodeled home in upscale NW – ...",San Antonio,https://www.airbnb.com/rooms/17481455?location...,0.364744


# Step 4: Define a new score!

As we have seen, we have all this data for each house:

- *average_rate_per_night, bedrooms_count, city, date_of_listing, description, latitude, longitude, title, url*

We are going to take these values into account to build the new score: *average_rate_per_night, bedrooms_count, city*

We start from the conjunctive results obtained in step 3.1, which are:

In [25]:
# Free-text query ranked with the custom score
queryNew = "a beautiful house with garden and bedroom"

In [26]:
# Collect the doc_ids of the conjunctive matches for `queryNew`; the call also prints the
# cleaned query, the number of query words and the number of matches (see the cell output)
conjunctiveDocId = library.getListOfConjunctiveDocIds(invertedIndex, queryNew)

[1mQuery intruduced: [0ma beautiful house with garden and bedroom
[1mCleaned query: [0mbeauti hous garden bedroom
[1mNumber of query words:[0m 4
[1mNumber of conjunctive matches:[0m 11


In [27]:
# Display the conjunctive matches and keep the returned table in `df`; the rendered output includes
# average_rate_per_night, bedrooms_count, city and doc_id, which the scoring cells below read
df = library.returnAndShowDatasetConjunctiveResults(conjunctiveDocId)

[1mCONJUNCTIVE RESULTS[0m


Unnamed: 0,average_rate_per_night,bedrooms_count,city,description,title,url,doc_id
0,245,3,Fort Worth,Explore the City of Cowboys and Culture from t...,New!3BR Fort Worth House near AT&T Stadium w/ ...,https://www.airbnb.com/rooms/19193017?location...,doc_14552
1,45,1,Austin,"Comfortable and private, the garden bedroom is...",Garden Bedroom,https://www.airbnb.com/rooms/5096161?location=...,doc_14254
2,149,4,San Antonio,"Stylish, fully remodeled home in upscale NW – ...",Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...,doc_2823
3,50,1,Temple,Step back into yesterday in this old Victorian...,Rooms at Rehoboth,https://www.airbnb.com/rooms/18055961?location...,doc_11603
4,32,1,Austin,"My house is close to Lakeline Mall, highways a...",Beautiful queen bedroom in NW Austin,https://www.airbnb.com/rooms/16755710?location...,doc_5509
5,149,4,San Antonio,"Stylish, fully remodeled home in upscale NW – ...",Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...,doc_1
6,37,1,Austin,"My house is close to Lakeline Mall, highways a...",Beautiful king bedroom in NW Austin,https://www.airbnb.com/rooms/15162593?location...,doc_566
7,50,1,Temple,Step back into yesterday in this old Victorian...,Rooms at Rehoboth,https://www.airbnb.com/rooms/18055961?location...,doc_11331
8,245,3,Fort Worth,Explore the City of Cowboys and Culture from t...,New!3BR Fort Worth House near AT&T Stadium w/ ...,https://www.airbnb.com/rooms/19193017?location...,doc_6447
9,149,4,San Antonio,"Stylish, fully remodeled home in upscale NW – ...",Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...,doc_15035


Here we compute the normalized score of the number of bedrooms and of the average price per night, and check whether the city appears in the query

In [28]:
# Per-document partial scores; each result is iterated with .items() below and is keyed by doc_id.
# In the printed output the bedroom and price values are negative (down to -1.0).
normalizedBedrooms = library.dicNormalized(conjunctiveDocId, df, "bedrooms_count")
normalizedPrices = library.dicNormalized(conjunctiveDocId, df, "average_rate_per_night")
# City component of the score; it is 0 for every document in the printed output of this query
normalizedCityQuery = library.dicMatchCityQuery(conjunctiveDocId, queryNew, df)

Let's print the results of the normalized scores of each category:

In [29]:
# Print each per-document score dictionary under its label
scoreReports = [
    ("Nomalized values of bedrooms", normalizedBedrooms),
    ("Nomalized values of prices", normalizedPrices),
    ("Coincidence query city", normalizedCityQuery),
]
for label, scores in scoreReports:
    print(BOLD + label + END, *scores.items())

[1mNomalized values of bedrooms[0m ('doc_14552', -0.75) ('doc_14254', -0.25) ('doc_2823', -1.0) ('doc_11603', -0.25) ('doc_5509', -0.25) ('doc_1', -1.0) ('doc_566', -0.25) ('doc_11331', -0.25) ('doc_6447', -0.75) ('doc_15035', -1.0) ('doc_6225', -0.75)
[1mNomalized values of prices[0m ('doc_14552', -1.0) ('doc_14254', -0.1836734693877551) ('doc_2823', -0.6081632653061224) ('doc_11603', -0.20408163265306123) ('doc_5509', -0.1306122448979592) ('doc_1', -0.6081632653061224) ('doc_566', -0.1510204081632653) ('doc_11331', -0.20408163265306123) ('doc_6447', -1.0) ('doc_15035', -0.6081632653061224) ('doc_6225', -0.689795918367347)
[1mCoincidence query city[0m ('doc_14552', 0) ('doc_14254', 0) ('doc_2823', 0) ('doc_11603', 0) ('doc_5509', 0) ('doc_1', 0) ('doc_566', 0) ('doc_11331', 0) ('doc_6447', 0) ('doc_15035', 0) ('doc_6225', 0)


We compute, for each document, the sum of the scores computed above:

In [30]:
# Combine the three partial scores of each document; as the printed output shows, the result
# is a sequence of (score, doc_id) tuples where score is the sum of the partial scores
scoredNewDocs = library.listOfComputeScores(conjunctiveDocId, normalizedBedrooms, normalizedCityQuery, normalizedPrices)
print(*scoredNewDocs)

(-1.75, 'doc_14552') (-0.4336734693877551, 'doc_14254') (-1.6081632653061224, 'doc_2823') (-0.45408163265306123, 'doc_11603') (-0.3806122448979592, 'doc_5509') (-1.6081632653061224, 'doc_1') (-0.40102040816326534, 'doc_566') (-0.45408163265306123, 'doc_11331') (-1.75, 'doc_6447') (-1.6081632653061224, 'doc_15035') (-1.439795918367347, 'doc_6225')


Sort the scores with heap algorithm of each doc:

In [31]:
# Rank the documents from highest to lowest custom score using a heap
docCount = len(scoredNewDocs)
sortedScoredNewDocs = heapq.nlargest(docCount, scoredNewDocs)
print(*sortedScoredNewDocs)

(-0.3806122448979592, 'doc_5509') (-0.40102040816326534, 'doc_566') (-0.4336734693877551, 'doc_14254') (-0.45408163265306123, 'doc_11603') (-0.45408163265306123, 'doc_11331') (-1.439795918367347, 'doc_6225') (-1.6081632653061224, 'doc_2823') (-1.6081632653061224, 'doc_15035') (-1.6081632653061224, 'doc_1') (-1.75, 'doc_6447') (-1.75, 'doc_14552')


Print the results. A house scores negatively when it has many bedrooms or is expensive, and positively when the query contains its city:

In [32]:
# Display the documents in ranked order and keep the returned table in `df`;
# the rendered output has the columns city, description, title and url
df = library.returnAndShowDatasetResultsOwnScore(sortedScoredNewDocs)

Unnamed: 0,city,description,title,url
0,Austin,"My house is close to Lakeline Mall, highways a...",Beautiful queen bedroom in NW Austin,https://www.airbnb.com/rooms/16755710?location...
1,Austin,"My house is close to Lakeline Mall, highways a...",Beautiful king bedroom in NW Austin,https://www.airbnb.com/rooms/15162593?location...
2,Austin,"Comfortable and private, the garden bedroom is...",Garden Bedroom,https://www.airbnb.com/rooms/5096161?location=...
3,Temple,Step back into yesterday in this old Victorian...,Rooms at Rehoboth,https://www.airbnb.com/rooms/18055961?location...
4,Temple,Step back into yesterday in this old Victorian...,Rooms at Rehoboth,https://www.airbnb.com/rooms/18055961?location...
5,Spring,Beautiful 2100 sq. ft house! Three bedroom and...,"Home Away from Home in Spring, TX",https://www.airbnb.com/rooms/1521227?location=...
6,San Antonio,"Stylish, fully remodeled home in upscale NW – ...",Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
7,San Antonio,"Stylish, fully remodeled home in upscale NW – ...",Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
8,San Antonio,"Stylish, fully remodeled home in upscale NW – ...",Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
9,Fort Worth,Explore the City of Cowboys and Culture from t...,New!3BR Fort Worth House near AT&T Stadium w/ ...,https://www.airbnb.com/rooms/19193017?location...


# Bonus Step


In this step we are going to plot the map of Texas and the houses within a given range. First, we set the latitude and longitude of the point on which the map is centred. Then we set the radius (`radio`) used to filter the search around that point.

In [155]:
# Search parameters: centre point of the search area and its radius
latitude, longitude = 29.8, -95.0
radio = 15000  # meters

Here we create some useful variables for our code:

In [156]:
# Helper values: the search centre as a (latitude, longitude) pair and the number of documents
startPoint = (latitude, longitude)
nRowsOrFiles = len(airbnbrentals.index)

At this point we want to plot a map starting from the latitude and longitude given before:

In [157]:
# Base map centred on the chosen coordinates
# NOTE(review): the built-in 'Stamen Terrain' tiles appear to have been dropped from newer
# folium releases — confirm against the installed version
m = folium.Map(
    location=[latitude, longitude],
    tiles='Stamen Terrain',
    zoom_start=11,
)

# Overlay the search area as a filled circle with radius `radio`
searchArea = folium.Circle(
    location=[latitude, longitude],
    radius=radio,
    color='#3186cc',
    fill=True,
    fill_color='#3186cc',
)
searchArea.add_to(m)




Now we need to find all the houses that lie within the given radius. We use this function to compute the distances and collect in a list the doc_id of every house inside it:

In [158]:
# doc_ids of the houses that lie within `radio` of `startPoint`
# (per the notebook text, the helper computes the distance of each house to that point)
docIdHousesInRadio = library.serchDocIdHousesInRadio(nRowsOrFiles, startPoint, radio)

Finally we are adding the doc_id obtained before to the map using this function:

In [159]:
# Add the houses found above (identified by doc_id) to the map `m`
library.addHouseToMap(docIdHousesInRadio, m)

Show map:

In [160]:
# Leaving the map as the cell's last expression renders it inline
m