In [181]:
# Dependencies
import pandas as pd
import json
import re
import ast

In [182]:
# Relative path to the sample New York listings CSV
listings = "data/sample_data/sample_ny_listings.csv"

In [183]:
# Load the listings CSV into a DataFrame, decoding the file as ISO-8859-1 (Latin-1)
listings_df = pd.read_csv(
    listings,
    encoding="ISO-8859-1",
)

In [184]:
# Number of non-null values in each column
listings_df.notna().sum()

id                                              100
listing_url                                     100
scrape_id                                       100
last_scraped                                    100
source                                          100
                                               ... 
calculated_host_listings_count                  100
calculated_host_listings_count_entire_homes     100
calculated_host_listings_count_private_rooms    100
calculated_host_listings_count_shared_rooms     100
reviews_per_month                                95
Length: 75, dtype: int64

In [185]:
# Show the labels of every column in the listings DataFrame
listings_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [186]:
# Keep only the listing id and its amenities, then preview the first rows
listings_amenities = listings_df.loc[:, ['id', 'amenities']]
listings_amenities.head()

Unnamed: 0,id,amenities
0,2595,"[""Dedicated workspace"", ""Baking sheet"", ""Extra..."
1,5121,"[""Kitchen"", ""Heating"", ""Air conditioning"", ""Wi..."
2,14991,"[""Air conditioning"", ""Refrigerator"", ""Elevator..."
3,5136,"[""Private patio or balcony"", ""Air conditioning..."
4,15341,"[""Dedicated workspace"", ""Conditioner"", ""Baking..."


In [187]:
# Count the listings whose raw amenities string mentions a given amenity.
# This is a substring test on the whole string, so the search term also
# matches longer amenity names that merely contain it.
specific_amenity = 'Shampoo'

# regex=False treats the search term as literal text rather than a pattern,
# and na=False counts rows with a missing amenities value as non-matches
# instead of putting NaN into the boolean mask.
contains_amenity = listings_amenities['amenities'].str.contains(
    specific_amenity, regex=False, na=False
)
specific_amenity_count = int(contains_amenity.sum())
print(f"Count: {specific_amenity_count}")

Count: 52


In [188]:
# Canonical amenity names. Any amenity whose text contains one of these names
# (case-insensitively) is collapsed to that name.
amenities_to_replace = ["Wifi", "TV", "Oven", "Stove", "Soap", "Shampoo", "Conditioner", "Sound system", "Refrigerator", "Backyard", "Patio", "BBQ grill", 
                        "Free parking", "Paid parking", "Free street parking", "Paid street parking"]


def replace_matching_amenities(amenities_json, canonical_names):
    """Collapse amenities that contain a canonical name to that name.

    Parameters
    ----------
    amenities_json : str
        JSON-encoded list of amenity strings for one listing.
    canonical_names : list of str
        Names to collapse to. Matching is a case-insensitive substring test
        against the original amenity text.

    Returns
    -------
    str
        JSON-encoded list of the same length with matching entries replaced.
        An amenity that matches several names ends up as the last matching
        name in ``canonical_names``.
    """
    # Lower-case the names once instead of once per amenity.
    lowered_names = [(name.lower(), name) for name in canonical_names]

    # The column holds JSON, and the result is written back with json.dumps,
    # so json.loads is the matching parser.
    amenities_list = json.loads(amenities_json)

    for i, amenity in enumerate(amenities_list):
        amenity_lower = amenity.lower()
        for needle, name in lowered_names:
            if needle in amenity_lower:
                # No break: a later match overrides an earlier one.
                amenities_list[i] = name

    return json.dumps(amenities_list)


# Build a new frame so the original listings_amenities is left untouched
listings_amenities_replaced = listings_amenities.assign(
    amenities=listings_amenities['amenities'].apply(
        replace_matching_amenities, args=(amenities_to_replace,)
    )
)

# Display the modified DataFrame
listings_amenities_replaced.head()

Unnamed: 0,id,amenities
0,2595,"[""Dedicated workspace"", ""Baking sheet"", ""Extra..."
1,5121,"[""Kitchen"", ""Heating"", ""Air conditioning"", ""Wi..."
2,14991,"[""Air conditioning"", ""Refrigerator"", ""Elevator..."
3,5136,"[""Patio"", ""Air conditioning"", ""Children\u2019s..."
4,15341,"[""Dedicated workspace"", ""Conditioner"", ""Baking..."


In [189]:
# Define the string to find and the string to replace it with
string_to_find = "AC "
replacement_string = "Air conditioning"


def replace_amenities_containing(amenities_json, needle, replacement):
    """Replace every amenity that contains ``needle`` with ``replacement``.

    Parameters
    ----------
    amenities_json : str
        JSON-encoded list of amenity strings for one listing.
    needle : str
        Text to look for; the test is a case-insensitive substring match.
    replacement : str
        Value that replaces each matching amenity in full.

    Returns
    -------
    str
        JSON-encoded list of the same length with matching entries replaced.
    """
    # Lower-case the search text once rather than once per amenity.
    needle_lower = needle.lower()

    # The column holds JSON, and the result is written back with json.dumps,
    # so json.loads is the matching parser.
    return json.dumps([
        replacement if needle_lower in amenity.lower() else amenity
        for amenity in json.loads(amenities_json)
    ])


# Rebind the name to a new frame instead of writing into the existing one
# cell by cell; re-running this cell gives the same result.
listings_amenities_replaced = listings_amenities_replaced.assign(
    amenities=listings_amenities_replaced['amenities'].apply(
        replace_amenities_containing,
        args=(string_to_find, replacement_string),
    )
)

# Display the modified DataFrame
listings_amenities_replaced.head()

Unnamed: 0,id,amenities
0,2595,"[""Dedicated workspace"", ""Baking sheet"", ""Extra..."
1,5121,"[""Kitchen"", ""Heating"", ""Air conditioning"", ""Wi..."
2,14991,"[""Air conditioning"", ""Refrigerator"", ""Elevator..."
3,5136,"[""Patio"", ""Air conditioning"", ""Children\u2019s..."
4,15341,"[""Dedicated workspace"", ""Conditioner"", ""Baking..."


In [190]:
def strip_amenity_details(amenities_json):
    """Drop everything from the first colon onward in each amenity.

    Parameters
    ----------
    amenities_json : str
        JSON-encoded list of amenity strings for one listing.

    Returns
    -------
    str
        JSON-encoded list of the same length where each amenity is the text
        before its first colon, with surrounding whitespace removed.
        Amenities without a colon are only stripped of whitespace.
    """
    # The column holds JSON, and the result is written back with json.dumps,
    # so json.loads is the matching parser.
    return json.dumps([
        # Only the part before the first colon is kept, so one split is enough.
        amenity.split(':', 1)[0].strip()
        for amenity in json.loads(amenities_json)
    ])


# Build a new frame so listings_amenities_replaced is left untouched
listings_amenities_cleaned = listings_amenities_replaced.assign(
    amenities=listings_amenities_replaced['amenities'].apply(strip_amenity_details)
)

# Display the modified DataFrame
listings_amenities_cleaned.head()

Unnamed: 0,id,amenities
0,2595,"[""Dedicated workspace"", ""Baking sheet"", ""Extra..."
1,5121,"[""Kitchen"", ""Heating"", ""Air conditioning"", ""Wi..."
2,14991,"[""Air conditioning"", ""Refrigerator"", ""Elevator..."
3,5136,"[""Patio"", ""Air conditioning"", ""Children\u2019s..."
4,15341,"[""Dedicated workspace"", ""Conditioner"", ""Baking..."


In [191]:
# Tally how many times each cleaned amenity name appears across all listings.
# NOTE(review): a name that occurs more than once in a single listing's list
# is counted each time, so a count is not necessarily the number of listings
# that have the amenity — confirm which of the two is intended.
amenity_counts_cleaned = {}

for amenities_json in listings_amenities_cleaned['amenities']:
    # The column holds JSON produced by json.dumps, so json.loads is the
    # matching parser.
    for amenity in json.loads(amenities_json):
        amenity_counts_cleaned[amenity] = amenity_counts_cleaned.get(amenity, 0) + 1

# Turn the tally into a two-column table ordered from most to least frequent,
# renumbering the rows from 0 after the sort.
amenity_counts_df = (
    pd.DataFrame(list(amenity_counts_cleaned.items()), columns=['Amenity', 'Count'])
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)
amenity_counts_df.head(20)

Unnamed: 0,Amenity,Count
0,Wifi,99
1,Heating,90
2,Air conditioning,89
3,Smoke alarm,88
4,Kitchen,88
5,Essentials,80
6,Carbon monoxide alarm,79
7,Refrigerator,71
8,Hot water,71
9,TV,69


In [193]:
# Write the amenity counts table to a CSV file, leaving out the row index
amenity_counts_df.to_csv(path_or_buf='sample_amenity_counts.csv', index=False)