**Purpose of this notebook:** Recommender system.

In [3]:
#Mount my Google Drive.
#Colab-only step: the google.colab module is imported here and is needed for drive.mount().
from google.colab import drive
drive.mount("/content/drive")
import os
#Project folder on the mounted drive. Later cells open their files by relative
#name (final_data.csv, glove.6B.*d.txt), so the working directory is switched here.
directory = '/content/drive/My Drive/DSC 672'
os.chdir(directory)

#Print the current working directory to confirm the chdir above took effect.
!pwd

Mounted at /content/drive
/content/drive/My Drive/DSC 672


In [1]:
#Install the third-party packages used below (-q keeps pip quiet).
!pip install -q spacy
!pip install -q scikit-learn
!pip install -q folium
!pip install -q geopandas
import pandas as pd
import numpy as np
import time
import ast
import re
from collections import Counter
import spacy
#Module-level spaCy pipeline; processText() tokenizes user text with it.
#NOTE(review): presumably the en_core_web_sm model is already present in the runtime - confirm.
nlp = spacy.load("en_core_web_sm")
from sklearn.metrics.pairwise import cosine_similarity

#NOTE(review): geopandas is already installed a few lines above; this repeat looks redundant.
!pip install -q geopandas
import folium
from folium.plugins import MarkerCluster
import geopandas as gpd

#For warnings.
#The "ignore" filter silences every warning raised after this point.
import warnings
warnings.filterwarnings("ignore")

# **Load data.**

In [None]:
#Read the prepared listings table from the working directory.
data = pd.read_csv("final_data.csv")

#Amenities are stored as a stringified list: parse each one and lower-case its entries.
data['amenities'] = data['amenities'].apply(
    lambda raw: [amenity.lower() for amenity in ast.literal_eval(raw)])

#Expose the neighbourhood under a shorter column name, in lower case.
data = data.rename(columns={'neighbourhood_cleansed': 'neighbourhood'})
data['neighbourhood'] = data['neighbourhood'].str.lower()

#Host flags arrive as 't'/'f'; map them to real booleans.
data['host_is_superhost'] = data['host_is_superhost'].map({'t': True, 'f': False})
data['host_identity_verified'] = data['host_identity_verified'].map({'t': True, 'f': False})

#Embeddings are stored as a space-separated string inside brackets; turn them into float arrays.
#Non-string cells are left untouched.
data['embedding'] = data['embedding'].apply(
    lambda raw: np.fromstring(raw.strip('[]'), sep=' ') if isinstance(raw, str) else raw)

#Train stations are stored as a stringified Python literal; parse the string cells only.
data['train_station'] = data['train_station'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw)

print(f"Final; dataset shape: {data.shape}")
data.head(2)

Final; dataset shape: (7952, 19)


Unnamed: 0,id,host_is_superhost,host_identity_verified,neighbourhood,latitude,longitude,room_type,accommodates,bathrooms,bedrooms,beds,amenities,price,name_description,avgWeighted_VADER,tokens,embedding,cluster_name,train_station
0,16757056,True,True,lower west side,41.84656,-87.68345,Entire home/apt,5,1.0,2.0,3.0,"[shampoo, dedicated workspace, wifi, private e...",103.0,Heart of Chicago 2BD + Parking Comfortably fit...,0.794457,"['heart', 'parking', 'comfortably', 'fit', 'pr...","[0.00701811, 0.28103456, -0.28579983, -0.10812...",Accessible with Public Transportation,[]
1,48112045,False,False,west town,41.89479,-87.69336,Entire home/apt,6,1.0,2.0,2.0,"[shampoo, dining table, window ac unit, wifi, ...",169.0,West Town Chicago Apt near Metra and United Ce...,0.726389,"['west', 'town', 'apt', 'near', 'metra', 'unit...","[0.11143691, 0.33038473, -0.13804461, 0.018835...",Accessible with Public Transportation,[]


# **Tokenize and Create Embedding for User's Query.**

In [None]:
def processText(text):
  '''Clean a piece of text and return its meaningful tokens.

  The text is lower-cased and stripped of every character that is neither a
  word character nor whitespace, then run through the module-level spaCy
  pipeline. A token is kept when it is alphabetic, is not one of spaCy's
  default stop words, and is longer than two characters.

  Returns a list of token strings.'''
  #Lower-case first, then drop punctuation.
  cleanedText = re.sub(r'[^\w\s]', '', text.lower())

  keptTokens = []
  for token in nlp(cleanedText):
    #Skip anything that is not a plain, informative word.
    if not token.is_alpha or token.is_stop or len(token) <= 2:
      continue
    keptTokens.append(token.text)
  return keptTokens

In [None]:
def loadGlove(vectorLen=50):
  '''Load GloVe 6B word embeddings of the requested dimensionality.

  Reads the file "glove.6B.{vectorLen}d.txt" from the current working
  directory. Each non-blank line is split on whitespace: the first field is
  the word, the remaining fields are its vector.

  Args:
    vectorLen: Dimensionality used to build the file name. The 6B release
      is expected to ship 50/100/200/300-dimensional files.

  Returns:
    dict mapping each word (str) to its vector (numpy float32 array).

  Raises:
    FileNotFoundError: if the embeddings file is not in the working directory.'''
  GloVe = f"glove.6B.{vectorLen}d.txt"
  #Dictionary stores embeddings of all words.
  embeddingsDict = {}
  with open(GloVe, 'r', encoding="utf-8") as f:
    for line in f:
      values = line.split()
      #A blank line (e.g. a stray newline at the end of the file) has no
      #word field; skip it instead of failing on values[0].
      if not values:
        continue
      embeddingsDict[values[0]] = np.asarray(values[1:], "float32")
  return embeddingsDict
#Load the 50-dimensional vectors once when this cell runs; meanEmbedding() looks words up in this module-level dictionary.
gloveEmbeddings = loadGlove(50)

In [None]:
def meanEmbedding(tokens, embeddingDim=50):
  '''Average the GloVe vectors of the given tokens.

  Tokens without an entry in the module-level gloveEmbeddings dictionary are
  ignored. When no token has a vector, a zero vector of length embeddingDim
  is returned instead.'''
  knownVectors = []
  for word in tokens:
    if word in gloveEmbeddings:
      knownVectors.append(gloveEmbeddings[word])

  #Nothing to average: fall back to the zero vector.
  if not knownVectors:
    return np.zeros(embeddingDim)
  return np.mean(knownVectors, axis=0)

# **Get User's Basic Requirements.**

*   Output stored as dictionary.

In [None]:
def get_userPreferences():
  '''Interactively ask the user for their basic listing requirements.

  Every question is read with input(). The guest count is mandatory; for all
  other questions, pressing enter means "no preference".

  Returns:
    dict with the keys
      "accommodates": int >= 1 (required),
      "price", "bathrooms", "bedrooms", "beds": int >= 1, or None when skipped,
      "verified_host", "super_host": True for Y/YES, False for N/NO,
        None for any other answer,
      "neighborhood", "amenities": list of lower-cased, stripped strings;
        empty entries (e.g. from a trailing comma) are dropped, so the list
        is empty when the user gave nothing usable.'''

  def intInput(prompt, required=False):
    '''Keep asking until the user enters an integer >= 1.
    A blank answer returns None unless the field is required.'''
    while True:
      value = input(prompt)
      if not value:
        if not required:
          return None
        print("This field is required. Please enter a number greater than or equal to 1.")
        continue
      try:
        value = int(value)
      except ValueError:
        print("Invalid input. Please enter a valid number.")
        continue
      if value >= 1:
        return value
      if required:
        print("Please enter a number greater than or equal to 1.")
      else:
        print("Please enter a number greater than 0.")

  def yes_noInput(prompt):
    '''Map Y/YES to True, N/NO to False (case-insensitive), anything else to None.'''
    answer = input(prompt).strip().upper()
    if answer in ("Y", "YES"):
      return True
    if answer in ("N", "NO"):
      return False
    return None

  def listInput(prompt):
    '''Split a comma-separated answer into lower-cased, stripped items.'''
    #Empty items are dropped: a blank keyword would match every amenity by
    #substring, and a blank neighbourhood would match no listing at all.
    return [item.strip().lower() for item in input(prompt).split(",") if item.strip()]

  accommodates = intInput("Minimum number of guest(s) (must be 1 or more): ", required=True)
  price = intInput("Maximum price: ")
  bathrooms = intInput("Minimum number of bathroom(s): ")
  bedrooms = intInput("Minimum number of bedroom(s): ")
  beds = intInput("Minimum number of beds: ")

  #Boolean input for host_identity_verified and host_is_superhost.
  verifiedHost = yes_noInput("Only verified host (Y/N): ")
  superHost = yes_noInput("Only super host (Y/N): ")

  #List input for neighborhood and amenities.
  neighborhood = listInput("Preferred neighborhood(s), separated by commas: ")
  amenities = listInput("Preferred amenities, separated by commas: ")

  #Store inputs in a dictionary.
  return {"accommodates": accommodates,
          "price": price,
          "bathrooms": bathrooms,
          "bedrooms": bedrooms,
          "beds": beds,
          "verified_host": verifiedHost,
          "super_host": superHost,
          "neighborhood": neighborhood,
          "amenities": amenities}

# **Get User's Comment Preferences.**

*   Output is a mean embedding.

In [None]:
def get_user_meanEmbedding():
  '''Ask the user for a short free-text description and embed it.

  Returns None when the answer is blank (or only whitespace); otherwise the
  answer is tokenized with processText() and the result of meanEmbedding()
  on those tokens is returned.'''
  userInput = input("Please tell us what you like: ")

  #No text given: signal "no comment preference" to the caller.
  if userInput.strip() == "":
    return None

  #Tokenize, filter, then average the token vectors.
  return meanEmbedding(processText(userInput))

# **Filter Data Tailored to User's Input and Comment.**

In [None]:
def filterListings(userPreferences, userEmbedding):
  '''Filter the module-level `data` listings by the user's preferences and
  score the survivors against the user's comment embedding.

  Args:
    userPreferences: dict with the keys 'accommodates', 'price', 'bathrooms',
      'bedrooms', 'beds', 'verified_host', 'super_host', 'neighborhood' and
      'amenities' (the shape returned by get_userPreferences()). A numeric
      value of None disables that filter; the host flags only filter when
      they are True; empty lists disable the list filters.
    userEmbedding: 1-D vector for the user's comment, or None when the user
      gave no comment.

  Returns:
    A copy of the matching rows with an added 'similarity' column (cosine
    similarity between userEmbedding and the row's 'embedding'; NaN when
    userEmbedding is None or the row has no usable embedding), sorted by
    similarity in descending order with NaN rows last.'''

  #Step 1: Apply numeric and boolean filters.
  #Start from an all-True mask so the result is always a boolean Series,
  #even when every preference is None (indexing data with a bare True fails).
  keep = pd.Series(True, index=data.index)

  #NOTE(review): the prompt in get_userPreferences() asks for a *minimum*
  #guest count, yet this is an exact match - confirm which one is intended.
  if userPreferences['accommodates'] is not None:
    keep = keep & (data['accommodates'] == userPreferences['accommodates'])
  if userPreferences['price'] is not None:
    keep = keep & (data['price'] <= userPreferences['price'])
  for column in ('bathrooms', 'bedrooms', 'beds'):
    if userPreferences[column] is not None:
      keep = keep & (data[column] >= userPreferences[column])

  #"Only verified/super host": an answer of False or None applies no filter.
  if userPreferences['verified_host'] == True:
    keep = keep & (data['host_identity_verified'] == True)
  if userPreferences['super_host'] == True:
    keep = keep & (data['host_is_superhost'] == True)

  tailoredListings = data[keep]

  #Step 2: Neighborhood filter (blank names are ignored).
  neighborhoods = [name for name in userPreferences['neighborhood'] if name]
  if neighborhoods:
    tailoredListings = tailoredListings[tailoredListings['neighbourhood'].isin(neighborhoods)]

  #Step 3: Amenities filter - keep a listing when any requested keyword is a
  #substring of any of its amenities. Blank keywords are ignored because an
  #empty string is a substring of everything.
  keywords = [keyword for keyword in userPreferences['amenities'] if keyword]
  if keywords:
    #Built as an explicit bool Series so it is still a valid mask when no rows are left.
    hasAmenity = pd.Series(
        [any(keyword in amen for keyword in keywords for amen in am)
         for am in tailoredListings['amenities']],
        index=tailoredListings.index, dtype=bool)
    tailoredListings = tailoredListings[hasAmenity]

  #Step 4: Embedding similarity.
  #Work on a copy to avoid SettingWithCopyWarning when adding the column.
  tailoredListings = tailoredListings.copy()
  similarity = pd.Series(np.nan, index=tailoredListings.index, dtype=float)
  if userEmbedding is not None:
    #A usable embedding is any 1-D sequence; None and scalar NaN cells are not.
    hasVector = pd.Series(
        [np.ndim(emb) == 1 for emb in tailoredListings['embedding']],
        index=tailoredListings.index, dtype=bool)
    if hasVector.any():
      #One batched call instead of one cosine_similarity call per row.
      listingMatrix = np.vstack(tailoredListings.loc[hasVector, 'embedding'].to_list())
      similarity.loc[hasVector] = cosine_similarity([userEmbedding], listingMatrix)[0]
  tailoredListings['similarity'] = similarity

  #Sort tailoredListings by similarity score.
  return tailoredListings.sort_values(by='similarity', ascending=False, na_position='last')

In [None]:
def enhanceRanking(filteredData, vaderWeight=0.4, similarityWeight=0.6):
  '''Rank listings by combining sentiment score and cosine similarity.

  Args:
    filteredData: DataFrame that has already been filtered by the user's
      requirements. Must contain the columns 'avgWeighted_VADER' and
      'similarity'. It is not modified.
    vaderWeight: weight of the normalized sentiment score (default 0.4).
    similarityWeight: weight of the normalized similarity (default 0.6, so
      the user's stated preference counts more than sentiment).

  Returns:
    A new DataFrame with a 'normalized_vader' column. When every similarity
    is NaN the rows are sorted by 'normalized_vader' alone; otherwise
    'normalized_similarity' and 'combined_score' are added as well and the
    rows are sorted by 'combined_score'. Sorting is descending and stable,
    so tied rows keep their incoming order.'''

  def minMaxScale(column):
    '''Scale a numeric column to the 0-1 range.'''
    low = column.min()
    spread = column.max() - low
    #A single row or a constant column has no spread; dividing by it would
    #turn every score into NaN. Such values all tie, so score them 0.0
    #(missing values stay missing).
    if pd.isna(spread) or spread == 0:
      return column.where(column.isna(), 0.0)
    return (column - low) / spread

  #Copy so the caller's frame does not silently gain columns.
  scoredData = filteredData.copy()
  scoredData['normalized_vader'] = minMaxScale(scoredData['avgWeighted_VADER'])

  #No similarity information at all: rank solely by sentiment.
  if scoredData['similarity'].isna().all():
    return scoredData.sort_values(by='normalized_vader', ascending=False, kind='stable')

  #Normalize similarity to the same 0-1 range for comparability, then blend.
  scoredData['normalized_similarity'] = minMaxScale(scoredData['similarity'])
  scoredData['combined_score'] = (vaderWeight * scoredData['normalized_vader']) + \
                                 (similarityWeight * scoredData['normalized_similarity'])
  return scoredData.sort_values(by='combined_score', ascending=False, kind='stable')

# **Get Top-k Listings.**

In [None]:
def topK_listings(tailoredListings):
  '''Ask how many listings to recommend and which cluster characteristics
  the user wants, then pick that many listings.

  Args:
    tailoredListings: ranked DataFrame (best first) with a 'cluster_name'
      column. Rows are tracked by index label, so the index is assumed to be
      unique.

  Returns:
    - tailoredListings itself when it is empty, or when the user leaves the
      count blank or enters 0;
    - the first k rows when no cluster is selected;
    - otherwise k distinct rows split as evenly as possible over the selected
      clusters (which clusters receive the remainder is random). When a
      cluster has too few listings, the shortfall is drawn at random from
      the listings not picked yet. The index of this result is reset.'''

  def yes_noInput(prompt):
    '''True when the user answers Y or YES (case-insensitive), else False.'''
    return input(prompt).strip().upper() in ("Y", "YES")

  totalListings = len(tailoredListings)
  print(f"There are {totalListings} listings that match your requirements.")

  #Nothing to choose from: with zero listings every positive count would be
  #rejected by the validation below, so return right away.
  if totalListings == 0:
    return tailoredListings

  #Prompt user for k and validate input.
  while True:
    k = input("How many listings do you want us to recommend? (Enter a number or leave blank for all): ").strip()

    #Return all listings if input is blank.
    if not k:
      print("Here are all listings that satisfy your request.")
      return tailoredListings

    try:
      k = int(k)
    except ValueError:
      print("Invalid input. Please enter a valid number.")
      continue

    if k == 0:
      print("Here are all listings that satisfy your request.")
      return tailoredListings
    if 1 <= k <= totalListings:
      break
    print(f"Please enter a number between 1 and {totalListings}.")

  #Ask for characteristics preferences, one question per cluster.
  clusterPrompts = [
      ("Do you want listings in 'Cultural and Scenic Attractions'? (Y/N): ",
       "Cultural and Scenic Attractions"),
      ("Do you want listings in 'Vibrant Social Spaces'? (Y/N): ",
       "Vibrant Social Spaces"),
      ("Do you want listings 'Accessible with Public Transportation'? (Y/N): ",
       "Accessible with Public Transportation")]
  selectedClusters = [cluster for prompt, cluster in clusterPrompts if yes_noInput(prompt)]

  numSelected = len(selectedClusters)
  if numSelected == 0:
    print(f"No clusters selected. Here are top {k} listings that satisfy your request:")
    return tailoredListings.head(k)

  #Shuffle clusters randomly to add variability in the remainder distribution.
  np.random.shuffle(selectedClusters)

  #Even share per cluster; the first `remainder` clusters get one extra.
  listings_perCluster, remainder = divmod(k, numSelected)

  final_topK_listings = []
  pickedIDs = set()
  for i, cluster in enumerate(selectedClusters):
    count = listings_perCluster + (1 if i < remainder else 0)

    #Only consider rows that were not handed out already; a row drawn as a
    #random top-up for an earlier cluster must not be recommended twice.
    available = tailoredListings[~tailoredListings.index.isin(pickedIDs)]
    cluster_topListings = available[available['cluster_name'] == cluster].head(count)
    pickedIDs.update(cluster_topListings.index)

    #Not enough listings in this cluster: top up at random from the rest.
    additionalNeeded = count - len(cluster_topListings)
    if additionalNeeded > 0:
      remainingListings = tailoredListings[~tailoredListings.index.isin(pickedIDs)]
      additionalListings = remainingListings.sample(additionalNeeded)
      pickedIDs.update(additionalListings.index)
      cluster_topListings = pd.concat([cluster_topListings, additionalListings])

    final_topK_listings.append(cluster_topListings)

  #Concatenate the results for each cluster.
  return pd.concat(final_topK_listings).reset_index(drop=True)

# **Geo Map for Recommendation.**

In [None]:
def extract_trainStations(top_k_listings):
  '''Collect the train stations listed in the 'train_station' column.

  Each station entry is unpacked as a (name, coordinates) pair. It is kept
  only when the coordinates are a non-empty tuple of exactly two items, which
  are converted to floats.

  Returns a DataFrame with one row per kept station and the columns
  'latitude', 'longitude', 'type' (always 'Train Station') and 'popup'.
  The frame has no columns when nothing was kept.'''
  stationRecords = []

  for _, listing in top_k_listings.iterrows():
    stations = listing['train_station']
    #Listings without nearby stations contribute nothing.
    if not stations:
      continue

    for trainName, stationCoords in stations:
      #Ignore entries whose coordinates are missing or malformed.
      if not (stationCoords and isinstance(stationCoords, tuple) and len(stationCoords) == 2):
        continue
      latitude, longitude = stationCoords
      stationRecords.append({'latitude': float(latitude),
                             'longitude': float(longitude),
                             'type': 'Train Station',
                             'popup': f"Station: {trainName}"})

  return pd.DataFrame(stationRecords)

In [None]:
def geoMap(top_k_listings):
  '''Plot listings and their train stations on a single Folium map.

  Args:
    top_k_listings: DataFrame with the columns 'latitude', 'longitude',
      'price', 'neighbourhood' and 'train_station'. It is not modified; an
      empty frame yields a map without markers.

  Returns:
    folium.Map centered on Chicago, with one red "home" marker per listing
    and one blue "train" marker per station returned by
    extract_trainStations(), all grouped in a MarkerCluster.'''

  #Extract train stations (already carries 'type' and 'popup' columns).
  trainData = extract_trainStations(top_k_listings)

  #Work on a copy so the caller's frame does not gain 'type'/'popup' columns.
  listings = top_k_listings.copy()
  listings['type'] = 'Listing'

  def generatePopup(row):
    '''Show price, neighborhood, and its nearby train stations for each listing.'''
    popupContent = f"<strong>Price:</strong> ${row['price']}<br><strong>Neighborhood:</strong> {row['neighbourhood']}"
    if row['train_station']:
      stationInfo = "<br><strong>Nearby Train Stations:</strong>"
      for station in row['train_station']:
        trainName, stationCoords = station
        stationInfo += f"<br> - {trainName}"
      popupContent += stationInfo
    return popupContent

  #Built with a comprehension rather than DataFrame.apply(axis=1): on an empty
  #frame apply() returns a DataFrame, which cannot be assigned to one column.
  listings['popup'] = [generatePopup(row) for _, row in listings.iterrows()]

  #Combine listings and train stations data.
  combinedData = pd.concat([listings[['latitude', 'longitude', 'type', 'popup']], trainData],
                           ignore_index=True)

  #Create a folium map centered around Chicago.
  m = folium.Map(location=[41.8781, -87.6298], zoom_start=11)
  #Use MarkerCluster for better visualization.
  markerCluster = MarkerCluster().add_to(m)

  #Add markers based on type.
  for _, row in combinedData.iterrows():
    #Every row is either a train station or a listing; building the icon in
    #both branches means a marker can never reuse the previous row's icon.
    if row['type'] == 'Train Station':
      icon = folium.Icon(color='blue', icon='train', prefix='fa')
    else:
      icon = folium.Icon(color='red', icon='home', prefix='fa')

    #Add marker to the cluster.
    folium.Marker(location=[row['latitude'], row['longitude']],
                  popup=row['popup'],
                  icon=icon).add_to(markerCluster)
  return m

# **Main Function for Recommender System.**

In [None]:
#user_topK_listings = None

def main():
  '''Run one interactive recommendation session.

  Asks for the user's preferences and free-text comment, filters and ranks
  the listings, lets the user choose how many to see, and builds the map.

  Returns a tuple (table, map): the table holds the display columns of the
  chosen listings (host columns renamed to 'superhost' / 'verified_host'),
  and the map is the Folium map returned by geoMap().'''

  print("Welcome to the Airbnb Recommender System!")

  #Steps 1-2: structured preferences, then the free-text comment embedding.
  userPreferences = get_userPreferences()
  userEmbedding = get_user_meanEmbedding()

  #Steps 3-4: filter by the requirements, then rank with the sentiment score.
  rankedListings = enhanceRanking(filterListings(userPreferences, userEmbedding))

  #Step 5: let the user pick how many listings (and which clusters) to see.
  user_topK_listings = topK_listings(rankedListings)

  #Columns shown to the user, and friendlier names for the host flags.
  displayColumns = ['host_is_superhost', 'host_identity_verified', 'neighbourhood', 'room_type',
                    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'amenities', 'price',
                    'name_description', 'cluster_name', 'train_station']
  displayNames = {'host_is_superhost': 'superhost', 'host_identity_verified': 'verified_host'}
  final_user_topK_listings = (user_topK_listings[displayColumns]
                              .reset_index(drop=True)
                              .rename(columns=displayNames))

  #Step 6: geospatial map of the recommendations.
  recommendationMap = geoMap(user_topK_listings)
  return final_user_topK_listings, recommendationMap

if __name__ == "__main__":
  #Run the interactive session first, so the heading below is printed right
  #above the results instead of before the questionnaire.
  #The map is bound to recommendationMap (not `map`) so the builtin map()
  #stays usable in later cells.
  final_user_topK_listings, recommendationMap = main()
  print("Here are your recommendations")
  display(final_user_topK_listings)
  display(recommendationMap)

Here are your recommendations
Welcome to the Airbnb Recommender System!
Minimum number of guest(s) (must be 1 or more): 2
Maximum price: 400
Minimum number of bathroom(s): 1
Minimum number of bedroom(s): 1
Minimum number of beds: 1
Only verified host (Y/N): no
Only super host (Y/N): no
Preferred neighborhood(s), separated by commas: loop
Preferred amenities, separated by commas: ac
Please tell us what you like: somewhere, fun, nice, and sunny
There are 128 listings that match your requirements.
How many listings do you want us to recommend? (Enter a number or leave blank for all): 10
Do you want listings in 'Cultural and Scenic Attractions'? (Y/N): yes
Do you want listings in 'Vibrant Social Spaces'? (Y/N): yes
Do you want listings 'Accessible with Public Transportation'? (Y/N): yes


Unnamed: 0,superhost,verified_host,neighbourhood,room_type,accommodates,bathrooms,bedrooms,beds,amenities,price,name_description,cluster_name,train_station
0,False,True,loop,Entire home/apt,2,1.5,1.0,1.0,"[shampoo, dryer – in unit, washer – in unit, d...",210.0,"Blueground | South Loop, gym & lounge, nr park...",Cultural and Scenic Attractions,"[(('Quincy/Wells', 'Brown, Orange, Purple & Pi..."
1,False,True,loop,Entire home/apt,2,1.5,1.0,1.0,"[shampoo, dryer – in unit, washer – in unit, d...",174.0,"Blueground | South Loop, gym & lounge, nr park...",Cultural and Scenic Attractions,"[(('Quincy/Wells', 'Brown, Orange, Purple & Pi..."
2,False,True,loop,Entire home/apt,2,1.0,1.0,1.0,"[shampoo, dedicated workspace, wifi, private e...",154.0,"Blueground | South Loop, nr L & grant park Fee...",Cultural and Scenic Attractions,"[(('Roosevelt', 'Red, Orange & Green lines'), ..."
3,False,True,loop,Entire home/apt,2,1.0,1.0,1.0,"[shampoo, dryer – in unit, washer – in unit, d...",185.0,"Blueground | South Loop, gym & lounge, nr park...",Cultural and Scenic Attractions,"[(('Quincy/Wells', 'Brown, Orange, Purple & Pi..."
4,False,True,loop,Entire home/apt,2,1.0,1.0,1.0,"[shampoo, paid parking on premises, wifi, hot ...",170.0,★MODERN SLEEK 1BD FANTASTIC LAKE/CITY VIEWS★ ...,Vibrant Social Spaces,"[(('Washington/Wabash', 'Brown, Green, Orange,..."
5,False,True,loop,Entire home/apt,2,1.0,1.0,1.0,"[shampoo, paid parking on premises, tv with st...",160.0,GREAT LOCATION SPECTACULAR VIEWS + BALCONY One...,Vibrant Social Spaces,[]
6,False,False,loop,Entire home/apt,2,1.0,1.0,1.0,"[dining table, dedicated workspace, wifi, extr...",165.0,Beautiful Urban Bungalow | Grant Park Located ...,Vibrant Social Spaces,"[(('Roosevelt', 'Red, Orange & Green lines'), ..."
7,False,True,loop,Private room,2,1.0,1.0,2.0,"[shampoo, dining table, dedicated workspace, w...",226.0,Guest Bedroom Downtown In The Loop / River Nor...,Accessible with Public Transportation,"[(('Clark/Lake', 'Blue, Brown, Green, Orange, ..."
8,False,True,loop,Entire home/apt,2,1.0,1.0,1.0,"[shampoo, ethernet connection, dining table, c...",124.0,"Luxe Condo In DT Chicago w/ Fast Wi-fi, Pool, ...",Accessible with Public Transportation,"[(('Roosevelt', 'Red, Orange & Green lines'), ..."
9,False,True,loop,Entire home/apt,2,1.0,1.0,1.0,"[shampoo, dryer – in unit, washer – in unit, d...",170.0,"Blueground | Loop, roof & lounge, same block a...",Accessible with Public Transportation,"[(('Quincy/Wells', 'Brown, Orange, Purple & Pi..."


In [None]:
#Manual cross-check that filterListings() runs correctly: the same kind of
#filters applied directly to the data with fixed values.
#To compare against a real session, the listings produced inside main()
#would have to be exposed as a global.
res = data[data['accommodates'].eq(2)
           & data['price'].le(300)
           & data['bathrooms'].ge(1)
           & data['bedrooms'].ge(1)
           & data['beds'].ge(1)
           & data['host_identity_verified'].eq(True)]

In [4]:
#Export this notebook to PDF: install pandoc and the TeX packages, then run nbconvert.
#Every command sends stdout and stderr to /dev/null, so a failure here is silent.
!apt-get -qq install -y pandoc > /dev/null 2>&1
#NOTE(review): this install has no -y and runs before apt-get update; the same packages are installed again two lines below.
!apt-get install texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install -y texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
#NOTE(review): this path uses "MyDrive" while the first cell changes into "My Drive" - confirm both resolve to the same folder.
!jupyter nbconvert --to pdf "/content/drive/MyDrive/DSC 672/DSC672 - Recommender System.ipynb" > /dev/null 2>&1