In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import ast
from datetime import date, datetime

# Patterns used by clean_text; compiled once instead of once per cell.
_HTML_TAG_RE = re.compile(r'<.*?>')
_DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-Z0-9\s.,;:!?\'"()&/-]+')


def _clean_cell(value):
    """Return the cleaned form of a single cell taken from an object column."""
    # date/datetime objects are passed through untouched.
    if isinstance(value, (date, datetime)):
        return value
    # Lists are flattened to one space-separated string and not scrubbed
    # further; str() keeps the join from failing on non-string items.
    if isinstance(value, list):
        return " ".join(map(str, value))
    if isinstance(value, str):
        text = value
    else:
        # Missing values (None / NaN) become an empty string.
        text = str(value) if pd.notna(value) else ''
    without_tags = _HTML_TAG_RE.sub('', text)
    return _DISALLOWED_CHARS_RE.sub('', without_tags)


def clean_text(def_temp):
    """Strip HTML tags and unsupported characters from every object column.

    Only columns whose dtype is ``object`` are touched. Within those columns:

    * ``date`` / ``datetime`` values are left as they are,
    * lists are joined into a single space-separated string,
    * everything else is converted to ``str`` (missing values become ``''``),
      has ``<...>`` tags removed, and is reduced to letters, digits,
      whitespace and the punctuation ``. , ; : ! ? ' " ( ) & / -``.

    Args:
        def_temp: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in def_temp.columns:
        if def_temp[column].dtype != "object":
            continue
        cleaned = [_clean_cell(value) for value in def_temp[column]]
        # Write back against the frame's own index. The previous version read
        # cells by position but wrote them by label, which corrupted (or
        # appended rows to) any frame that did not have a default 0..n-1 index.
        def_temp[column] = pd.Series(cleaned, index=def_temp.index, dtype=object)
    return def_temp



def add_State(df):
    """Add a per-row ``City`` column derived from ``neighbourhood_cleansed``.

    Each neighbourhood is looked up in the module-level tables ``ol_n``,
    ``sd_n``, ``sf_n`` and ``la_n`` (each must expose a ``neighbourhood``
    column) and labelled "Oakland", "San Diego", "San Francisco" or
    "Los Angeles" respectively. When a name occurs in more than one table the
    first of those, in that order, wins.

    The previous implementation assigned the whole column on every matching
    row, so the entire frame ended up with the city of the last matching row.

    Rows whose neighbourhood is found in no table keep their existing ``City``
    value if the column already exists, and are left missing (NaN) otherwise.

    Args:
        df: DataFrame with a ``neighbourhood_cleansed`` column. Modified in
            place.

    Returns:
        The same DataFrame object, with the ``City`` column set.
    """
    # Ordered lowest priority first: later dict writes overwrite earlier ones,
    # which reproduces the Oakland > San Diego > San Francisco > Los Angeles
    # precedence of the original if/elif chain.
    lookup_tables = (
        ("Los Angeles", la_n),
        ("San Francisco", sf_n),
        ("San Diego", sd_n),
        ("Oakland", ol_n),
    )
    city_by_neighbourhood = {}
    for city, table in lookup_tables:
        # dropna() so a missing neighbourhood can never act as a lookup key.
        for neighbourhood in table['neighbourhood'].dropna():
            city_by_neighbourhood[neighbourhood] = city

    cities = df['neighbourhood_cleansed'].map(city_by_neighbourhood)
    if "City" in df.columns:
        cities = cities.fillna(df["City"])
    df["City"] = cities
    return df



def safe_literal_eval(value):
    """Parse a Python literal from ``value``, falling back to an empty list.

    Args:
        value: Text (or AST node) to hand to ``ast.literal_eval``.

    Returns:
        The evaluated literal, or ``[]`` when the input cannot be evaluated.
        A message naming the error and the skipped value is printed in that
        case.
    """
    try:
        return ast.literal_eval(value)
    # Besides ValueError/SyntaxError, literal_eval raises TypeError for inputs
    # such as "{[1]: 2}" (unhashable key) and RecursionError for very deeply
    # nested input; one bad cell should not abort a whole column conversion.
    except (ValueError, SyntaxError, TypeError, RecursionError) as e:
        # Return an empty list as the default value
        print(f"Error in eval: {e} | Skipping value: {value}")
        return []

    

# Columns removed from the raw listings frame before any cleaning.
_DROP_COLUMNS = [
    'scrape_id', 'last_scraped', 'source', 'host_has_profile_pic', 'host_identity_verified',
    'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'bathrooms_text',
    'neighbourhood', 'neighbourhood_group_cleansed', 'host_since', 'host_location', 'host_about', 'host_response_time',
    'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
    'host_neighbourhood', 'host_total_listings_count', 'host_verifications',
    'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 'availability_90',
    'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'number_of_reviews_ltm',
    'number_of_reviews_l30d', 'first_review', 'last_review', 'maximum_nights_avg_ntm', 'review_scores_checkin', 'license',
]

# Columns rounded to two decimals at the end of pre-processing.
_ROUNDED_COLUMNS = [
    "review_scores_rating", "review_scores_accuracy",
    "review_scores_cleanliness", "review_scores_communication",
    "review_scores_location", "review_scores_value", "reviews_per_month",
]

# Numeric columns whose gaps are filled with the column mean.
_MEAN_IMPUTED_COLUMNS = _ROUNDED_COLUMNS + ["bathrooms"]


def pre_process(df):
    """Clean a raw listings DataFrame and tag each row with its city.

    Steps, in order:

    1. Drop the columns listed in ``_DROP_COLUMNS`` (raises ``KeyError`` if
       any of them is absent, as before).
    2. Fill gaps: ``0`` for the count columns, ``''`` for the text columns,
       ``"-1"`` for ``price``, the column mean for the review-score columns,
       ``reviews_per_month`` and ``bathrooms``, then forward-fill whatever is
       still missing.
    3. Parse ``amenities`` with ``safe_literal_eval``.
    4. Cast the count columns to ``int``, map ``instant_bookable`` from
       ``"t"``/``"f"`` to ``True``/``False`` and round the score columns to
       two decimals.
    5. Run ``clean_text`` and ``add_State`` on the result.

    Args:
        df: Raw listings DataFrame. It is modified in place.

    Returns:
        The processed DataFrame.
    """
    df.drop(_DROP_COLUMNS, axis='columns', inplace=True)

    count_columns = ['host_listings_count', 'bedrooms', 'beds']
    text_columns = ['name', 'description', 'neighborhood_overview', 'host_name', 'amenities']

    df[count_columns] = df[count_columns].fillna(0)
    df[text_columns] = df[text_columns].fillna('')
    df["price"] = df["price"].fillna("-1")

    # Mean imputation has to run BEFORE the forward fill: previously the
    # forward fill came first, so these columns were already filled from the
    # preceding row and the mean only ever reached leading NaNs.
    # Assigning the result back (instead of `df[col].fillna(..., inplace=True)`)
    # also keeps this working under pandas Copy-on-Write, where an in-place
    # fill on a selected column no longer updates the parent frame.
    df[_MEAN_IMPUTED_COLUMNS] = df[_MEAN_IMPUTED_COLUMNS].fillna(
        df[_MEAN_IMPUTED_COLUMNS].mean()
    )

    # `fillna(method='ffill')` is deprecated; `ffill()` is the replacement.
    df.ffill(inplace=True)

    df["amenities"] = df["amenities"].apply(safe_literal_eval)

    df[count_columns] = df[count_columns].astype(int)
    df["instant_bookable"] = df["instant_bookable"].map({"t": True, "f": False})
    df[_ROUNDED_COLUMNS] = df[_ROUNDED_COLUMNS].round(2)

    # Both helpers modify the frame and return it; use their return value
    # rather than binding it to an unused local.
    return add_State(clean_text(df))


In [None]:
# # LA = pd.read_csv("LA_listings.csv")
# # OL = pd.read_csv("OL_listings.csv")
# # SD = pd.read_csv("SD_listings.csv")
# # SF = pd.read_csv("SF_listings.csv")
# # ----------------------------------------------------------

# initial =  pd.read_csv("Cleaned_Listings_Generated.csv")

# ol_n = pd.read_csv("OL_Neighbourhoods.csv")
# sd_n = pd.read_csv("SD_Neighbourhoods.csv")
# sf_n = pd.read_csv("SF_Neighbourhoods.csv")
# la_n = pd.read_csv("LA_Neighbourhoods.csv")

# # # LA = pre_process(LA)
# # # OL = pre_process(OL)
# # # SD = pre_process(SD)
# # # SF = pre_process(SF)

# x = pd.read_csv("")          # TODO: set to the path of the user-supplied file or newly arrived listings data
# x = pre_process(x)

# del ol_n
# del sd_n
# del sf_n
# del la_n

# df = pd.concat([initial, x]).sort_values(by="id")

# # # del LA
# # # del OL
# # # del SD
# # # del SF
# # # del x

# # del initial
# # df.to_csv("Cleaned_Listings_Generated.csv", index=False, encoding='utf-8', errors='ignore')

# df