# Step 1: Data

### Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime 
import numpy as np
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import json
import collections
import itertools
#nltk.download('punkt')
#nltk.download('stopwords')
import library #library of functions

# To reaload library import
from importlib import reload
reload(library)

# To print results
BOLD = '\033[1m'
END = '\033[0m'

We only need some columns values of the Airbnb_Texas_Rentals.csv so we filter the result by:

In [2]:
# List of column values needed
cols = ['average_rate_per_night', 'bedrooms_count', 'city', 'date_of_listing', 'description', 'latitude', 'longitude', 'title', 'url']
# Read the csv file
airbnbrentals = pd.read_csv('Airbnb_Texas_Rentals.csv', sep=',', encoding='utf-8', usecols=cols)

In [3]:
# Printed example of 3 rows
airbnbrentals.loc[0:2,]

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$27,2,Humble,May 2016,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,$149,4,San Antonio,November 2010,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...


# Step 2: Create documents

In this step we are creating i tsv files to save all the information. 

In [4]:
# Create directory to save the documents if doesn't exist
directory = "documents"
if not os.path.exists(directory):
    os.makedirs(directory)

# Create all tsv files, each per row
nRowsOrFiles = 100 #airbnbrentals.shape[0] #this variable is the number of rows or files
for i in range(0, nRowsOrFiles): 
    #Write the tsv file
    airbnbrentals.loc[i:i,].to_csv('documents/doc_'+str(i)+'.tsv', sep='\t', encoding='utf-8', index = False)

In [5]:
# Read one tsv file to see result
pd.read_csv('documents/doc_2.tsv', sep='\t', encoding='utf-8')

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...


# Step 3: Search Engine

As a first common step, we have to preprocess the documents by:

- Removing stopwords
- Removing punctuation
- Stemming
- Anything else you think it's needed

In [6]:
# Create directory to save the documents if doesn't exist
directory = "documentsCleaned"
if not os.path.exists(directory):
    os.makedirs(directory)
# This function takes all documents and removes punctuation, stop words and do stemming
library.modifyDocs(nRowsOrFiles)

Test if it works reading one document:

In [7]:
#Example of a tsv document cleaned
pd.read_csv('documents/doc_2.tsv', sep='\t', encoding='utf-8')

Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...


## 3.1) Conjunctive query

At this moment, we are taking into account the description and title of each document. It means that the first Search Engine will evaluate queries with respect to the aforementioned information.

### 3.1.1) Create your index!

Create a inverted index of this format:

{
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
...}

Here we start creating a dic --> invertedIndex to store all the values.
Then we loop trough all documents adding docs to the inverted index using inverted_index_add function

In [8]:
# Start inverted index
invertedIndex = {}

# Loop all documents and make the inverted index
for i in range(0, nRowsOrFiles): 
    
    doc = pd.read_csv('documentsCleaned/doc_'+str(i)+'.tsv', sep='\t', encoding='utf-8')
    invertedIndex = library.inverted_index_add(invertedIndex, 'doc_'+str(i), doc, 'description')
    invertedIndex = library.inverted_index_add(invertedIndex, 'doc_'+str(i), doc, 'title')

Let's see the inverted index created:

In [9]:
# Print some key and values of the inverted index

print(BOLD + 'hous' + END, "=>", ", ".join(invertedIndex['hous']))
print(BOLD + 'beautiful' + END, "=>", ", ".join(invertedIndex['beauti']))

[1mhous[0m => doc_1, doc_2, doc_2, doc_2, doc_7, doc_13, doc_15, doc_20, doc_24, doc_24, doc_35, doc_44, doc_46, doc_46, doc_46, doc_52, doc_52, doc_56, doc_64, doc_68, doc_71, doc_79, doc_84, doc_85, doc_85, doc_85, doc_90, doc_97, doc_98, doc_98
[1mbeautiful[0m => doc_1, doc_7, doc_11, doc_13, doc_14, doc_21, doc_30, doc_34, doc_35, doc_51, doc_65, doc_68, doc_76, doc_98


Writting invertedIndex into inverted_index.txt

In [10]:
# Write invertedIndex to a txt file
json.dump(invertedIndex, open("inverted_index.txt",'w'))

Reading inverted_index.txt as dictionary

In [11]:
# Load invertedIndex from a txt file
invertedIndex = json.load(open("inverted_index.txt"))

### 3.1.2) Execute the query

Given a query: *a beautiful house with garden and beach*

In [12]:
query = 'house location'

We search the documents that contain the query through our invertedIndex

In [13]:
searchedResults = library.searchQueryConjunctive(invertedIndex, query)

[1mQuery intruduced: [0mhouse location
[1mCleaned query: [0mhous locat


We short the documents by frecuency of appearance

In [14]:
sortedResults = sorted(searchedResults.items(), key=lambda kv: -kv[1])
# Print 5 shorted results
print(BOLD + 'Doc_id and number of coincidences: ' + END + str(sortedResults[0:5]))
# Find how many cleaned words contain the query
numberOfQueryWords = library.cleanString(query)
print(BOLD + 'Number of cleaned words: ' + END + str(len(numberOfQueryWords)))

[1mDoc_id and number of coincidences: [0m[('doc_24', 2), ('doc_15', 2), ('doc_56', 2), ('doc_90', 2), ('doc_68', 2)]
[1mNumber of cleaned words: [0m2


In [15]:
# List of pandas df
dfs = []
# Loop through all the sorted resulst and see if the number of coincidences is equal to the number of words of the query
for i in range(0, len(sortedResults)):
    if sortedResults[i][1] == len(numberOfQueryWords):
        dfs.append(pd.read_csv('documents/doc_'+str(i)+'.tsv', sep='\t', encoding='utf-8', usecols=['title', 'description', 'city', 'url']))
# Concatenate all data into one DataFrame
big_frame = pd.concat(dfs, ignore_index=True)
# Reorder columns 
df = big_frame.loc[:, ['title', 'description', 'city', 'url']]
# Print dataframe result of the query
df

Unnamed: 0,title,description,city,url
0,2 Private rooms/bathroom 10min from IAH airport,Welcome to stay in private room with queen bed...,Humble,https://www.airbnb.com/rooms/18520444?location...
1,Unique Location! Alamo Heights - Designer Insp...,"Stylish, fully remodeled home in upscale NW – ...",San Antonio,https://www.airbnb.com/rooms/17481455?location...
2,River house near the city,'River house on island close to the city' \nA ...,Houston,https://www.airbnb.com/rooms/16926307?location...
3,Private Room Close to Campus,Private bedroom in a cute little home situated...,Bryan,https://www.airbnb.com/rooms/11839729?location...
4,The Porch,Welcome to our original 1920's home. We recent...,Fort Worth,https://www.airbnb.com/rooms/17325114?location...
5,Gorgeous Home with Country Setting,"My place is close to Lake Conroe, family-frien...",Conroe,https://www.airbnb.com/rooms/14466133?location...
6,Cozy and Quaint Country Retreat with Acreage.,Rustic country retreat on 8 acres southeast of...,Cedar Creek,https://www.airbnb.com/rooms/12491762?location...
7,Friendly Private Room in َQuiet Neighborhood,This is a beautiful bedroom with a queen size ...,Fort Worth,https://www.airbnb.com/rooms/18977363?location...
8,608 - Bayfront Condos,First class and comfortable condo with the bes...,Rockport,https://www.airbnb.com/rooms/17559848?location...
9,Cozy Historic Private Studio,Private entrance to your own \,San Antonio,https://www.airbnb.com/rooms/1588127?location=...
