# In [1]:
import pandas as pd
import re
import numpy as np

# In [2]:
def _parse_bracketed_list(raw):
    """Split a string such as "['a', 'b']" into its comma-separated items.

    Items are returned exactly as written between the brackets (any
    surrounding quotes are kept). Non-string input (e.g. NaN) and an empty
    "[]" both yield an empty list.
    """
    if not isinstance(raw, str):
        return []
    inner = raw.strip('][')
    # ''.split(', ') returns [''], which would be miscounted as one item.
    return inner.split(', ') if inner else []


def clean(df):
    """Return a cleaned copy of a listings DataFrame.

    Steps, in order:
      * drop the columns listed in ``to_drop`` (those absent are ignored);
      * turn literal "N/A" strings into NaN;
      * blank out missing text in ``neighborhood_overview``, ``host_about``
        and ``description``; strip HTML tags from ``description`` and
        ``host_about`` (plus ``\\r\\n`` from ``host_about``);
      * parse the date columns with ``pd.to_datetime``;
      * split ``host_verifications`` / ``amenities`` into lists and add
        ``num_verifications`` / ``num_amenities``;
      * convert ``price`` to float and add ``min_price`` / ``max_price``;
      * convert the "t"/"f" host flags to booleans (anything but "t" is False);
      * fill missing review scores and ``reviews_per_month`` with the column mean;
      * add ``<column>_wordcount`` for the three text columns;
      * encode ``host_response_time`` as 0-3 and fill gaps with the mode;
      * convert the percentage rate columns to fractions and fill gaps with
        the column mean;
      * drop every row that still contains a missing value.

    Args:
        df: DataFrame holding the columns processed above. It is not modified.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: if a column that is processed (not merely dropped) is missing.
    """
    to_drop = ['listing_url', 'scrape_id', 'last_scraped', 'source', 'host_id', 'host_url','host_name','host_thumbnail_url','host_picture_url','neighbourhood_group_cleansed','bathrooms','minimum_minimum_nights','maximum_minimum_nights', 'minimum_maximum_nights','maximum_maximum_nights','minimum_nights_avg_ntm','maximum_nights_avg_ntm','calendar_updated','host_neighbourhood','neighbourhood','license']
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.drop(columns=to_drop, errors='ignore')

    # The result must be assigned (replace is not in-place). Doing it first
    # lets every later numeric conversion see NaN instead of the text "N/A".
    df = df.replace("N/A", np.nan)

    # Fill missing text before using .str so an all-NaN column still works.
    text_columns = ['neighborhood_overview', 'host_about', 'description']
    for col in text_columns:
        df[col] = df[col].fillna('')

    html_tag = r'<[^<>]*>'
    df['description'] = df['description'].str.replace(html_tag, '', regex=True)
    df['host_about'] = (
        df['host_about']
        .str.replace('\r\n', '', regex=False)
        .str.replace(html_tag, '', regex=True)
    )

    for col in ['host_since', 'calendar_last_scraped', 'first_review', 'last_review']:
        df[col] = pd.to_datetime(df[col])

    list_columns = (
        ('host_verifications', 'num_verifications'),
        ('amenities', 'num_amenities'),
    )
    for col, count_col in list_columns:
        df[col] = df[col].apply(_parse_bracketed_list)
        df[count_col] = df[col].apply(len)

    df['price'] = df['price'].replace(r'[$,]', '', regex=True).astype(float)
    df['min_price'] = df['price'] * df['minimum_nights']
    df['max_price'] = df['price'] * df['maximum_nights']

    for col in ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified']:
        df[col] = df[col] == "t"

    score_columns = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                     'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                     'review_scores_value', 'reviews_per_month']
    for col in score_columns:
        # Assign the result back: in-place fillna on a selected column is a
        # chained assignment and does not update df under Copy-on-Write.
        df[col] = df[col].fillna(df[col].mean())

    for col in text_columns:
        # split() without a separator gives 0 words for an empty string and
        # ignores repeated whitespace.
        df[col + "_wordcount"] = df[col].str.split().str.len()

    # Encode only this column; unknown labels and NaN pass through unchanged.
    response_codes = {
        'within an hour': 0,
        'within a few hours': 1,
        'within a day': 2,
        'a few days or more': 3,
    }
    response_time = df['host_response_time'].map(lambda label: response_codes.get(label, label))
    most_common = response_time.mode()
    if not most_common.empty:  # empty when the whole column is missing
        response_time = response_time.fillna(most_common.iloc[0])
    df['host_response_time'] = response_time

    for col in ['host_response_rate', 'host_acceptance_rate']:
        rate = df[col].replace(r'[%,]', '', regex=True).astype(float) / 100
        df[col] = rate.fillna(rate.mean())

    # NOTE: this removes any row with a remaining NaN/NaT in any column,
    # including the date columns parsed above.
    df = df.dropna()

    return df