# Homework 3 - Find the perfect place to stay in Texas!

In [1]:
import csv
import pandas as pd
from os import listdir
from os.path import isfile, join
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import sent_tokenize, word_tokenize
import math
import numpy as np
import scipy
import heapq
from tqdm import tqdm

In [2]:
with open('Airbnb_Texas_Rentals.csv','r',encoding="utf8") as csv_file: #opening the csv file in read mode
    csv_reader=csv.reader(csv_file)  #csv reader allows us to iterate through its lines
    
    with open('Airbnb_Texas_Rentals.tsv','w',encoding="utf8") as new_file:  #opening a new tsv file (still empty)
        csv_writer=csv.writer(new_file,delimiter='\t')  #writer allows us to write rows, and we separate the fields in these
                                                        #rows with the tab delimiter
        for line in csv_reader: #we iterate through the reader's lines 
               csv_writer.writerow(line[1:]) #write those lines (except the first element that is the index) on the new tsv

with open('Airbnb_Texas_Rentals.tsv','r',encoding="utf8") as all_files: #now we want all the documents so we first open the big tsv
    for i, row in enumerate(all_files):  #iterating through the lanes and keeping track of the order of the operation
        if i > 0:   #skipping the first line (the header)
            if row != '\n': #the rows were separated by blank spaces in the form of '\n' so we skip them
                with open('doc\doc_' + str(int((i-(i/2)))) + '.tsv','w',encoding="utf8") as new_files: #opening a new tsv for every line, (the str(int((i-(i/2))) is to keep the number of the docs 0,1,2 ecc instead of 0, 2, 4...)
                    new_files.write(row) #writing

In [3]:
def save_dict_to_file(dic, file): #defining a function to save files right away
    f = open('{}.txt'.format(file), 'w',encoding="utf8") #open a new file in write mode (empty)
    f.write(str(dic)) #write in it what we need (it will always be a dictionary, hence 'dic')
    f.close() #closing it

In [5]:
docpaths = [f for f in listdir("doc") if isfile(join("doc", f))] #joining the paths for the tsv documents in the directory

documentlist = {} #this will be for keeping track of the words in the docs {doc_i : ['house, 'garden'.....]}
vocabulary = {} #this will be useful to keep track of the word and its id {'house':1, 'garden':2}
inverted_index = {} #this is the inv idx: {1: [doc_1, doc_2, ....], 2: [doc_2, doc_6, ....]}
indexOfWord = 0 #this is used to give the id to the words in the vocabulary
for doc in docpaths: #for every document in the list of documents...
    with open('doc\{}'.format(doc), 'r',encoding="utf8") as file: #open the doc in read mode
        lines = file.readlines() #read its lines
        if len(lines) > 0 : #leaving out all the empty lines
            document = lines[0] #its a list of only one element, so we take that as a string
            cols = document.replace('\\n',' ').split('\t') #cleaning
            
            to_tokenize = cols[4] + cols[7] #taking description and title to tokenize
            tokens = nltk.word_tokenize(to_tokenize) #tokenization
            filtered_words = [nltk.stem.PorterStemmer().stem(word) for word in tokens if word not in nltk.corpus.stopwords.words('english') and word not in ".,'()"]
            # removing stopwords, special characters, stemming
            documentlist[doc] = filtered_words #creating a document list: for every document we will have the tokenized words in title and description
            
            for w in filtered_words: 
                if w not in vocabulary: #creating the vocabulary
                    indexOfWord += 1    #assigning an index that ranges from 1 to whatever
                    vocabulary[w] = indexOfWord  #for every index we give the word
                
                if vocabulary[w] not in inverted_index: #creating the inverted index
                    temp = []  #creating a list so we can append the documents that have that word
                    temp.append(doc)  #appending the first doc
                    inverted_index[vocabulary[w]] = temp #creating the index key and giving them the value
                elif doc not in inverted_index[vocabulary[w]]: #if the key was already created
                    inverted_index[vocabulary[w]].append(doc)  

#saving the files
save_dict_to_file(inverted_index,"inverted_index")
save_dict_to_file(vocabulary,"vocabulary")
save_dict_to_file(documentlist,"documentlist")

100%|████████████████████████████████████████████████████████████████████████████| 18259/18259 [07:32<00:00, 40.39it/s]


In [7]:
len(docpaths)

18259

In [24]:
query = input() #taking the user query
tokens = nltk.word_tokenize(query) #tokenizing the query
query_tokens = [nltk.stem.PorterStemmer().stem(token) for token in tokens if token not in stopwords.words('english') if token not in ".,'()"]
#removing stopwords, special characters, stemming

vocabulary = open('vocabulary.txt', 'r', encoding = 'utf-8') #opening the vocabulary in read mode
vocabulary = eval(vocabulary.read()) #if we dont do eval the file will be considered a string. eval 'understands' what type of object it is

inverted_index = open('inverted_index.txt', 'r', encoding = 'utf-8')  #same here
inverted_index = eval(inverted_index.read())
        
term_ids = [vocabulary[token] for token in query_tokens if token in vocabulary] #taking the ids of the query tokenized words
search_results = [inverted_index[i] for i in term_ids if i in inverted_index] #taking the documents in the inv idx for that id

new_list = [set(list_) for list_ in search_results] #removing duplicates and preparing for intersection
intersect = set.intersection(*new_list) #gives us the documents that have all the words of the query

col_names = ["average_rate_per_night","bedrooms_count","city","date_of_listing","description","latitude","longitude","title","url"]
#making a list for the column names
df = pd.DataFrame(columns = col_names) #creating an empty df with the list cerated before

for j, doc in enumerate(intersect): #iterating through the intersection list and keeping track of the order
    with open(r'doc\{}'.format(doc), 'r',encoding="utf-8") as file: #opening the files in read mode
        csv_reader = csv.reader(file, delimiter = '\t') #csv.reader to iterate
        
        for line in csv_reader: #iterating through the lines. 1 line = the whole document
            for i, field in enumerate(line): #field = tab separated field
                df.loc[j, col_names[i]] = field #giving to [row j ,col i] the value of the field
        
pd.set_option('max_colwidth',500) #making the rows more readabale(bigger)
display(df[['title', 'description', 'city', 'url']].style.set_table_styles([ {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}]))
#displaying the df without the index

roof dog


Unnamed: 0,title,description,city,url
0,Cozy furnished home on the EastSide!,"Welcome to our home!\n\nAn eclectic combination of hip and modern all under one roof.\n\nCool Amenities like Retro Lounge, Super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations, yoga studio, and fitness center.\n\nThe Metro Train is connected to the apartment building and will take you directly to downtown in 8 minutes or less depending on your drop off location. So convenient, no need for a car or anything!",Austin,https://www.airbnb.com/rooms/18795604?location=Cedar%20Park%2C%20TX
1,Palm Oaks Ranch 1521RP,"Welcome to Palms Oaks Ranch! This gorgeous two-story 4 bedroom/3 bath Mediterranean style home on 18 acres over looks a private lake. This property is surrounded by live oak trees with horse corrals, riding arena, stables, tack room and an ac dog run. The exterior of the home is stucco with a spanish style roof. Flooring is satillo tile with marble in the bathrooms. The kitchen features a gas cook top, dishwasher, stainless teal appliances and granite counter tops. The open floor plan features cedar ceilings and a fabulous staircase. Several large arch windows allow for plenty of light. Three of the bedrooms are located upstairs, with the master having an ensuite, king bed and a balcony overlooking the lake. The second bedroom has a king and the third a queen. The fourth bedroom is located downstairs and also has a king size bed. This property offers everything you need for a relaxing weekend away from it all. Roam the 18 acres watching the wildlife or fish into the sunset in the lake. This home is located just a 3 minute drive to Paradise Key. Paradise Key offers a double boat dock, Valet boat barn and a dockside bar and grill. This is a great spot for Weddings, special occasion parties or a weekend away. Wedding venue fee is $500. For a virtual tour click here",Rockport,https://www.airbnb.com/rooms/12009441?location=Bayside%2C%20TX
2,Furnished EastSide Flat with a Spacious Balcony.,"Welcome to our home.\n\nLocated on the East Side of Austin just steps from Austin’s newest rail stop, Platform is an eclectic combination of hip and modern all under one roof. \n\nComplete with a Retro Game Lounge, super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations for sizzling social hours, yoga studio, fitness and conditioning center, and so much more. \n\nGet in your groove and get booking.",Austin,https://www.airbnb.com/rooms/18123352?location=Cedar%20Park%2C%20TX
3,Comfy Flat on the EastSide + a Balcony & by Metro!,"Welcome :)\n\nAn eclectic combination of hip and modern all under one roof.\n\nCool Amenities like Retro Lounge, Super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations, yoga studio, and fitness center.\n\nThe Metro Train is connected to the apartment building and will take you directly to downtown in 8 minutes or less depending on your drop off location. So convenient, no need for a car or anything!",Austin,https://www.airbnb.com/rooms/18795759?location=Cedar%20Park%2C%20TX


In [None]:
term_ids = [vocabulary[token] for token in query_tokens if token in vocabulary]

In [8]:
#TF = how may times the word is repeated within the i-th document divided by its length (term frequency)
def tf(word, document):
    return document.count(word) / len(document) 

#this is to have the number of times the word is repeated in one document
def document_frequency(word):
    if word in vocabulary:
        term_id = vocabulary[word]
    return len(inverted_index[term_id])

#IDF(word) = loge(Total Number Of Documents / Number Of Documents with term 'word' in it)
def idf(word):
    return math.log(len(docpaths) / document_frequency(word))

#TFIDF = TF * IDF
def tfidf(word, document):
    return tf(word, document) * idf(word)

In [10]:
vocabulary = open('vocabulary.txt', 'r', encoding = 'utf-8')
vocabulary = eval(vocabulary.read()) 

inverted_index = open('inverted_index.txt', 'r', encoding = 'utf-8') 
inverted_index = eval(inverted_index.read())

document_list = open('documentlist.txt', 'r', encoding = 'utf-8')
file = eval(document_list.read())

new_inverted_index = {}
for key,doc in tqdm(documentlist.items()): #taking the keys (doc_i) and the values 
    for w in doc:#w = word in the doc, doc = ['house', 'garden', ....]
        score = tfidf(w,doc) #tf idf of the word in the doc
        w_index = (key,score) #tuple of key (doc_i), and score (tf/idf)
        if vocabulary[w] not in new_inverted_index: #if the id of the word is not in the new inv_idx
            temp = [] #initializing empty list
            temp.append(w_index) #appending the tuple
            new_inverted_index[vocabulary[w]] = temp #giving to the newly created key (that is the id), the list just created [(doc_i, tf/idf), (doc_i+1, tf/idf)]
        elif doc not in new_inverted_index[vocabulary[w]]: #if the doc is not already in that list: 
            new_inverted_index[vocabulary[w]].append(w_index) #append to the list the new tuple

#removing duplicates
for key in new_inverted_index:
    new_inverted_index[key] = list(set(new_inverted_index[key]))
#saving
save_dict_to_file(new_inverted_index,"inverted_index_tfidf")

100%|███████████████████████████████████████████████████████████████████████████| 18259/18259 [00:38<00:00, 473.71it/s]


In [15]:
query = input() #input query
tokens = nltk.word_tokenize(query) #tokenization
query_tokens = [nltk.stem.PorterStemmer().stem(token) for token in tokens if token not in stopwords.words('english') if token not in ".,'()"]
#removing stopwords, special characters, stemming

tfidf_query_array = [] 
for w in query_tokens: #scores 
    score = 1/math.sqrt(len(query_tokens)) #score for the words the 1/math.sqrt(len(query_tokens)) is to normalize the scores
    tfidf_query_array.append(score) #appending to a list for new score
#print(tfidf_query_array)

roof dog


In [16]:
inverted_index_tfidf = open("inverted_index_tfidf.txt", 'r', encoding = 'utf-8') #opening the new invd_idx and evaluating 
inverted_index_tfidf = eval(inverted_index_tfidf.read())


term_ids = [vocabulary[token] for token in query_tokens if token in vocabulary] #terms ids of the words of the query
search_results = [inverted_index_tfidf[idx] for idx in term_ids if idx in inverted_index_tfidf] #list of tuples (doc_i, score) for that id

cos_arrays = {} #dictionary for the cosine sim values for the docs
for list_ in search_results: #for every list of tuple (doc_i, score) for every id
    for tuple_ in list_: #for every tuple:
        if tuple_[0] not in cos_arrays: #if the doc is not in that tuple
            temp = []
            temp.append(tuple_[1]) #append to the newly created list the tf/idf
            cos_arrays[tuple_[0]] = temp #create the key (doc_i) and give the values as the list of tf/idfs
        else:
            cos_arrays[tuple_[0]].append(tuple_[1]) #if the key is already there append the value to the list

final = [(key,cos_arrays[key]) for key in cos_arrays if len(cos_arrays[key]) == len(term_ids)] #take the documents that have all of the words
print(final)

[('doc_7428.tsv', [0.11172773927519879, 0.06463152033532973]), ('doc_8919.tsv', [0.09969552119940815, 0.05767120276075577]), ('doc_14141.tsv', [0.04050130548725956, 0.02342892612155703]), ('doc_6904.tsv', [0.10286045838034173, 0.05950203459443055])]


In [17]:
doc_sim = {} #dic for {doc_i : similarity_score, ...}
for tuple_ in final: #take the tuple in final (the values are the list of the tf/idf scores)
    sim = 1 - (scipy.spatial.distance.cosine(np.array(tfidf_query_array) , np.array(tuple_[1]))) #use this to compute the the cos similarity
    doc_sim[tuple_[0]] =  sim #key = doc_i, value = similarity
print(doc_sim)

{'doc_7428.tsv': 0.9661433851761906, 'doc_8919.tsv': 0.9661433851761907, 'doc_14141.tsv': 0.9661433851761906, 'doc_6904.tsv': 0.9661433851761907}


In [18]:
heap = [(-value, key) for key, value in doc_sim.items()] #list that has a tuple ordered by the smallest to the largest
largest = heapq.nsmallest(10, heap) #taking the smallest (they will actually be the largest)
largest = [(key, -value) for value, key in largest] #re-ordering keys and values
print(largest)

col_names = ["average_rate_per_night","bedrooms_count","city","date_of_listing","description","latitude","longitude","title","url", "similarity"]
df = pd.DataFrame(columns = col_names)#same as before

for j, doc in enumerate(largest): 
    with open(r'doc\{}'.format(doc[0]), 'r',encoding="utf-8") as file:
        
        csv_reader = csv.reader(file, delimiter = '\t')
        for line in csv_reader:
            for i, field in enumerate(line):
                df.loc[j, col_names[i]] = field
                df.loc[j, 'similarity'] = doc[1]
        
pd.set_option('max_colwidth',500)
display(df[['title', 'description', 'city', 'url', 'similarity']].style.set_table_styles([ {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}]))

[('doc_6904.tsv', 0.9661433851761907), ('doc_8919.tsv', 0.9661433851761907), ('doc_14141.tsv', 0.9661433851761906), ('doc_7428.tsv', 0.9661433851761906)]


Unnamed: 0,title,description,city,url,similarity
0,Cozy furnished home on the EastSide!,"Welcome to our home!\n\nAn eclectic combination of hip and modern all under one roof.\n\nCool Amenities like Retro Lounge, Super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations, yoga studio, and fitness center.\n\nThe Metro Train is connected to the apartment building and will take you directly to downtown in 8 minutes or less depending on your drop off location. So convenient, no need for a car or anything!",Austin,https://www.airbnb.com/rooms/18795604?location=Cedar%20Park%2C%20TX,0.966143
1,Comfy Flat on the EastSide + a Balcony & by Metro!,"Welcome :)\n\nAn eclectic combination of hip and modern all under one roof.\n\nCool Amenities like Retro Lounge, Super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations, yoga studio, and fitness center.\n\nThe Metro Train is connected to the apartment building and will take you directly to downtown in 8 minutes or less depending on your drop off location. So convenient, no need for a car or anything!",Austin,https://www.airbnb.com/rooms/18795759?location=Cedar%20Park%2C%20TX,0.966143
2,Palm Oaks Ranch 1521RP,"Welcome to Palms Oaks Ranch! This gorgeous two-story 4 bedroom/3 bath Mediterranean style home on 18 acres over looks a private lake. This property is surrounded by live oak trees with horse corrals, riding arena, stables, tack room and an ac dog run. The exterior of the home is stucco with a spanish style roof. Flooring is satillo tile with marble in the bathrooms. The kitchen features a gas cook top, dishwasher, stainless teal appliances and granite counter tops. The open floor plan features cedar ceilings and a fabulous staircase. Several large arch windows allow for plenty of light. Three of the bedrooms are located upstairs, with the master having an ensuite, king bed and a balcony overlooking the lake. The second bedroom has a king and the third a queen. The fourth bedroom is located downstairs and also has a king size bed. This property offers everything you need for a relaxing weekend away from it all. Roam the 18 acres watching the wildlife or fish into the sunset in the lake. This home is located just a 3 minute drive to Paradise Key. Paradise Key offers a double boat dock, Valet boat barn and a dockside bar and grill. This is a great spot for Weddings, special occasion parties or a weekend away. Wedding venue fee is $500. For a virtual tour click here",Rockport,https://www.airbnb.com/rooms/12009441?location=Bayside%2C%20TX,0.966143
3,Furnished EastSide Flat with a Spacious Balcony.,"Welcome to our home.\n\nLocated on the East Side of Austin just steps from Austin’s newest rail stop, Platform is an eclectic combination of hip and modern all under one roof. \n\nComplete with a Retro Game Lounge, super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations for sizzling social hours, yoga studio, fitness and conditioning center, and so much more. \n\nGet in your groove and get booking.",Austin,https://www.airbnb.com/rooms/18123352?location=Cedar%20Park%2C%20TX,0.966143


# step 4

In [19]:
df4 = pd.read_csv('Airbnb_Texas_Rentals.csv') #creating dataframe from the csv
city_lat_lon = df4.groupby('city').mean()[['latitude','longitude']] #grouping by the city and taking the mean.

In [21]:
user_query_city = input('Enter city: ') #asking the user for the city
user_query_price = input('Enter your preferred daily price: ') #asking the user for the price
user_query_bedrooms = input('Enter your preferred number of bedrooms: ') #asking the user for the bedrooms

Enter city: San Antonio
Enter your preferred daily price: 40
Enter your preferred number of bedrooms: 2


In [22]:
def score(doc_price ,doc_bedrooms, doc_latitude, doc_longitude): #defining the new score
    global user_query_city, user_query_price , user_query_bedrooms, city_lat_lon #calling the global variables inside the function
    
    score = 0 #score default
    
    city = user_query_city #defining the new variable city
    price = int(user_query_price .replace('$', '')) #if in the price the user uses a dollar, we take it out
    bedrooms = int(user_query_bedrooms) #just the number of bedrooms

    if city in city_lat_lon.index: #if the user's city is in the index of the groupby
        lat_lon = list(city_lat_lon.loc[city]) #take the long and lat of the row with the name
        lat_lon_diff = [float(lat_lon[0]) - float(doc_latitude), float(lat_lon[1]) - float(doc_longitude)] #diffrence between longitude and latitude

        if (doc_price[1:].isdigit() == True) and (doc_bedrooms.isdigit() == True): #if price and bedrooms are actual numbers:
            if (abs(lat_lon_diff[0]) < 1.5) and (abs(lat_lon_diff[1]) < 1.5): #if the difference is in a max range of 1.5 has full score
                score = (1/(1 + (abs(float(doc_price[1:]) - price))) * 0.6) + (1/(1 + (abs(int(doc_bedrooms) - bedrooms)))) * 0.4
            else: #if its outside the penalty is score*0.1
                score = ((1/(1 + (abs(float(doc_price[1:]) - price))) * 0.6) + (1/(1 + (abs(int(doc_bedrooms) - bedrooms)))) * 0.4) * 0.1
                #score is done using the 1/1+x, where x is the difference of prices or bedrooms. This means that if the difference is large the score will be lower
    return score
    

In [25]:
step4_idx = {} #dictionary that will map to every document the new score
for j, doc in enumerate(intersect): #taking the documents of the intersection in point 3.1
    with open(r'doc\{}'.format(doc), 'r',encoding="utf-8") as file: #opening the tsv files
        lines = file.readlines() #reading the files
        if len(lines) > 0 : 
            document = lines[0] #as before...
            cols = document.replace('\n',' ').split('\t') 
            doc_price, doc_bedrooms, doc_latitude, doc_longitude = cols[0], cols[1], cols[5], cols[6]
            new_score = score(doc_price ,doc_bedrooms, doc_latitude, doc_longitude)
            step4_idx[doc] = new_score #key = doc_i, value = new_score
            
save_dict_to_file(step4_idx,"step4_idx")
print(step4_idx)

$119 1
San Antonio
[29.501488198655036, -98.50031031977231]
[-0.780225875155363, -0.7881218921176156]
0.20750000000000002
$549 4
San Antonio
[29.501488198655036, -98.50031031977231]
[1.5082104470680342, -1.4029268944314168]
0.013450980392156864
$131 1
San Antonio
[29.501488198655036, -98.50031031977231]
[-0.7803068948945651, -0.7898950225510077]
0.20652173913043478
$118 1
San Antonio
[29.501488198655036, -98.50031031977231]
[-0.7801520025914641, -0.7885032559082106]
0.20759493670886078
{'doc_6904.tsv': 0.20750000000000002, 'doc_14141.tsv': 0.013450980392156864, 'doc_7428.tsv': 0.20652173913043478, 'doc_8919.tsv': 0.20759493670886078}


In [26]:
heap = [(-value, key) for key, value in step4_idx.items()] #same as before
largest = heapq.nsmallest(5, heap)
largest = [(key, -value) for value, key in largest]
print(largest)

col_names = ["average_rate_per_night","bedrooms_count","city","date_of_listing","description","latitude","longitude","title","url", "score"]
df = pd.DataFrame(columns = col_names)# index = False?

for j, doc in enumerate(largest):
    with open(r'doc\{}'.format(doc[0]), 'r',encoding="utf-8") as file:
        
        csv_reader = csv.reader(file, delimiter = '\t')
        for line in csv_reader:
            for i, field in enumerate(line):
                df.loc[j, col_names[i]] = field
                df.loc[j, 'score'] = doc[1]
        
pd.set_option('max_colwidth',500)
display(df[['title', 'description', 'city', 'url', 'score']].style.set_table_styles([ {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}]))

[('doc_8919.tsv', 0.20759493670886078), ('doc_6904.tsv', 0.20750000000000002), ('doc_7428.tsv', 0.20652173913043478), ('doc_14141.tsv', 0.013450980392156864)]


Unnamed: 0,title,description,city,url,score
0,Comfy Flat on the EastSide + a Balcony & by Metro!,"Welcome :)\n\nAn eclectic combination of hip and modern all under one roof.\n\nCool Amenities like Retro Lounge, Super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations, yoga studio, and fitness center.\n\nThe Metro Train is connected to the apartment building and will take you directly to downtown in 8 minutes or less depending on your drop off location. So convenient, no need for a car or anything!",Austin,https://www.airbnb.com/rooms/18795759?location=Cedar%20Park%2C%20TX,0.207595
1,Cozy furnished home on the EastSide!,"Welcome to our home!\n\nAn eclectic combination of hip and modern all under one roof.\n\nCool Amenities like Retro Lounge, Super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations, yoga studio, and fitness center.\n\nThe Metro Train is connected to the apartment building and will take you directly to downtown in 8 minutes or less depending on your drop off location. So convenient, no need for a car or anything!",Austin,https://www.airbnb.com/rooms/18795604?location=Cedar%20Park%2C%20TX,0.2075
2,Furnished EastSide Flat with a Spacious Balcony.,"Welcome to our home.\n\nLocated on the East Side of Austin just steps from Austin’s newest rail stop, Platform is an eclectic combination of hip and modern all under one roof. \n\nComplete with a Retro Game Lounge, super “fly” poolscape consisting of three pools and intriguing Aquatic Wall water feature, Dog Park, outdoor grilling stations for sizzling social hours, yoga studio, fitness and conditioning center, and so much more. \n\nGet in your groove and get booking.",Austin,https://www.airbnb.com/rooms/18123352?location=Cedar%20Park%2C%20TX,0.206522
3,Palm Oaks Ranch 1521RP,"Welcome to Palms Oaks Ranch! This gorgeous two-story 4 bedroom/3 bath Mediterranean style home on 18 acres over looks a private lake. This property is surrounded by live oak trees with horse corrals, riding arena, stables, tack room and an ac dog run. The exterior of the home is stucco with a spanish style roof. Flooring is satillo tile with marble in the bathrooms. The kitchen features a gas cook top, dishwasher, stainless teal appliances and granite counter tops. The open floor plan features cedar ceilings and a fabulous staircase. Several large arch windows allow for plenty of light. Three of the bedrooms are located upstairs, with the master having an ensuite, king bed and a balcony overlooking the lake. The second bedroom has a king and the third a queen. The fourth bedroom is located downstairs and also has a king size bed. This property offers everything you need for a relaxing weekend away from it all. Roam the 18 acres watching the wildlife or fish into the sunset in the lake. This home is located just a 3 minute drive to Paradise Key. Paradise Key offers a double boat dock, Valet boat barn and a dockside bar and grill. This is a great spot for Weddings, special occasion parties or a weekend away. Wedding venue fee is $500. For a virtual tour click here",Rockport,https://www.airbnb.com/rooms/12009441?location=Bayside%2C%20TX,0.013451


# junk

In [6]:
with open('Airbnb_Texas_Rentals_new.csv','r',encoding="utf8") as csv_file:
    csv_reader=csv.reader(csv_file)
    
    with open('Parametric_search.tsv','w',encoding="utf8") as new_file: 
        csv_writer=csv.writer(new_file,delimiter='\t')
    
        for line in csv_reader:
               csv_writer.writerow(line[2:5]) 

with open('Parametric_search.tsv','r',encoding="utf8") as all_files:
    for i, row in enumerate(all_files):
        if i > 0 and i < 40:
            if row != '\n':
                with open('doc_par_search\doc_' + str(int((i-(i/2)))) + '.tsv','w',encoding="utf8") as new_files:
                    new_files.write(row) 

In [15]:
docpaths = [f for f in listdir("doc_par_search") if isfile(join("doc_par_search", f))]

user_query = input('Enter city, price per night, number of bedrooms: ').split(' ')
city, price, bedrooms = user_query[0], user_query[1], user_query[2]



par_search_doc_list = {}
for doc in docpaths:
    with open('doc_par_search\{}'.format(doc), 'r',encoding="utf8") as file:
        lines = file.readlines()
        if len(lines) > 0 :
            document = lines[0]
            cols = document.replace('\n', '').split('\t')
            #doc_price, doc_bedrooms, doc_city = cols[1], cols[2], cols[3]
            #print(doc_price, doc_bedrooms, doc_city)
            par_search_doc_list[doc] = list(cols)
            
save_dict_to_file(par_search_doc_list,"par_search_doc_list")

Enter city, price per night, number of bedrooms: a b c


In [None]:
#par_search_inv_idx = {}
#with open('par_search_doc_list.txt', 'r', encoding="utf-8") as file:
    data = eval(file.read())
    for key,doc in documentlist.items():
        for w in doc:
            score = tfidf(w,doc,documentlist)
            w_index = (key,score)
            if vocabulary[w] not in new_inverted_index:
                temp = []
                temp.append(w_index)
                new_inverted_index[vocabulary[w]] = temp
            elif doc not in new_inverted_index[vocabulary[w]]:
                new_inverted_index[vocabulary[w]].append(w_index)    