In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import luxury_calculator

# reload luxury_calculator to be correctly updated 
importlib.reload(luxury_calculator)

# import the updated luxury_calculator
from luxury_calculator import SimpleLuxuryCalculator


In [96]:
# Load the data
# get path to files
path = '/Users/simenguttormsen/Dropbox/SimenDuke/CompSci671/KaggleProject/cs-671-fall-2024-final-project/'

df_train = pd.read_csv(path + 'train.csv', parse_dates=['host_since', 'first_review', 'last_review'])
y_train = df_train["price"]


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(df_train, y_train, test_size=0.2, random_state=1)
X_train = X_train.drop(columns=["price"])
X_test = X_test.drop(columns=["price"])

X_competition = pd.read_csv(path + 'test.csv', parse_dates=['host_since', 'first_review', 'last_review'])

In [16]:
# get index of duplicates and remove from y_train
# duplicates_index = X_train[X_train.duplicated()].index
# duplicates_index

# y_train = y_train.drop(duplicates_index)

In [70]:
def calculate_luxury_metrics(amenities_list):
    calculator = SimpleLuxuryCalculator()
    if isinstance(amenities_list, list):
        score = calculator.calculate_score(amenities_list)
        level = calculator.get_luxury_level(score)
        return {'luxury_score': score, 'luxury_level': level}
    return {'luxury_score': 0, 'luxury_level': 'Standard'}

In [71]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def calculate_sentiment_score(review_text):
    sentiment_scores = []
    for review in review_text:
        try: 
            sid = SentimentIntensityAnalyzer()
            sentiment = sid.polarity_scores(review)
            if sentiment['compound'] != 0:
                sentiment_scores.append(sentiment['compound'])
        except:
            continue
    # some sentiment scores might be empty
    if len(sentiment_scores) == 0:
        return np.nan
    return np.median(sentiment_scores)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/simenguttormsen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [72]:
def process_data_train(df_train, y_train):
    # Remove duplicate rows from train (but not from test data)
    duplicates = df_train.duplicated(keep='first')
    df_train = df_train.loc[~duplicates]
    y_train = y_train.loc[df_train.index]

    # fix amenities
    df_train['amenities'] = df_train['amenities'].apply(lambda x: x.replace('[', '').replace(']', '').replace('"', '').split(', '))

    # Apply the calculator to DataFrame
    luxury_results = df_train['amenities'].apply(calculate_luxury_metrics)
    # Create new columns for luxury score and level
    df_train['luxury_score'] = luxury_results.apply(lambda x: x['luxury_score']) 

    # add number of amenities as a feature
    df_train["amenities_length"] = df_train["amenities"].apply(len)

    # extract number in bathrooms_text
    df_train["num_bathrooms"] = df_train['bathrooms_text'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    df_train['private_bathroom'] = (df_train['bathrooms_text'].str.contains('shared') == False)*1

    # host_response_time should not be one-hot encoded, but should be converted to a number of days
    df_train["host_response_time"] = df_train["host_response_time"].map({"within an hour": 1, "within a few hours": 2, "within a day": 3, "a few days or more": 4})

    # one-hot encode categorical variables except for property_type. We will deal with it seperately
    df_train = pd.get_dummies(df_train, columns=['neighbourhood_group_cleansed', "host_is_superhost", "room_type", "private_bathroom", "has_availability"])

    # create column if review is missing or not 
    df_train["review_missing"] = df_train["review_scores_rating"].isnull()*1

    # one hot encode property_type
    # first, make an other category
    df_train["property_type"] = df_train["property_type"].apply(lambda x: "other" if df_train["property_type"].value_counts()[x] < 125 else x) # approx 1% of the training data
    df_train = pd.get_dummies(df_train, columns=['property_type'])

    df_train["neighbourhood_cleansed"] = df_train["neighbourhood_cleansed"].apply(lambda x: "other" if df_train["neighbourhood_cleansed"].value_counts()[x] < 250 else x) # approx 2% of the training data
    df_train = pd.get_dummies(df_train, columns=['neighbourhood_cleansed'])

    # convert host_since to number of days hosted
    df_train['host_since'] = (pd.Timestamp.now() - df_train['host_since']).dt.days
    df_train['first_review'] = (pd.Timestamp.now() - df_train['first_review']).dt.days
    df_train['last_review'] = (pd.Timestamp.now() - df_train['last_review']).dt.days          

    # Split reviews into list of reviews
    review_split = df_train["reviews"].fillna("nan").apply(lambda x: x.split("\n---------------------------------\n") if isinstance(x, str) else ["nan"])
    df_train['sentiment_score'] = review_split.apply(lambda x: calculate_sentiment_score(x))
    
    # remove neighborhood cleansed, host verifications, bathrooms_text, amenities, name, description, reviews
    df_train = df_train.drop(columns=["host_verifications", "bathrooms_text", "amenities", "name", "description", "reviews"])

    return df_train, y_train


In [81]:
def process_data_competition(df_train, df_train_processed, df_competition):
    id = False
    # if there exist an id column:
    if "id" in df_competition.columns:
        # save id
        df_competition_id = df_competition["id"]
        df_competition = df_competition.drop(columns=["id"])
        id = True

    # fix amenities
    df_competition['amenities'] = df_competition['amenities'].apply(lambda x: x.replace('[', '').replace(']', '').replace('"', '').split(', '))

    # Apply the calculator to DataFrame
    luxury_results = df_competition['amenities'].apply(calculate_luxury_metrics)
    # Create new columns for luxury score and level
    df_competition['luxury_score'] = luxury_results.apply(lambda x: x['luxury_score']) 

    df_competition["amenities_length"] = df_competition["amenities"].apply(len)

    # extract number in bathrooms_text
    df_competition["num_bathrooms"] = df_competition['bathrooms_text'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    df_competition['private_bathroom'] = (df_competition['bathrooms_text'].str.contains('shared') == False)*1

    # host_response_time should not be one-hot encoded, but should be converted to a number of days
    df_competition["host_response_time"] = df_competition["host_response_time"].map({"within an hour": 1, "within a few hours": 2, "within a day": 3, "a few days or more": 4})

    # one-hot encode categorical variables except for property_type. We will deal with it seperately
    df_competition = pd.get_dummies(df_competition, columns=['neighbourhood_group_cleansed', "host_is_superhost", "room_type", "private_bathroom", "has_availability"])

     # create column if review is missing or not 
    df_competition["review_missing"] = df_competition["review_scores_rating"].isnull()*1

    # one hot encode property_type
    # first, make an other category
    # make others category based on the columns in df_train that have fewer than 125 instances 
    # first, if there exist a category in df_competition that is not in df_train, group it as "other"
    # get list of property types that have less than 125 instances
    properties_category_other = df_train["property_type"].value_counts()[df_train["property_type"].value_counts() < 125].index.tolist()
    all_property_type_train = set(df_train["property_type"])
    # if there exist a category in df_competition that is not in df_train, group it as "other"
    df_competition["property_type"] = df_competition["property_type"].apply(lambda x: "other" if x not in all_property_type_train else x)
    # also group all categories in properties_category_other in properties_category_other as "other"
    df_competition["property_type"] = df_competition["property_type"].apply(lambda x: "other" if x in properties_category_other else x)  



    neighborhood_category_other = df_train["neighbourhood_cleansed"].value_counts()[df_train["neighbourhood_cleansed"].value_counts() < 250].index.tolist()
    all_neighborhood_train = set(df_train["neighbourhood_cleansed"])
    # if there exist a category in df_competition that is not in df_train, group it as "other"
    df_competition["neighbourhood_cleansed"] = df_competition["neighbourhood_cleansed"].apply(lambda x: "other" if x not in all_neighborhood_train else x)
    # also group all categories in properties_category_other in properties_category_other as "other"
    df_competition["neighbourhood_cleansed"] = df_competition["neighbourhood_cleansed"].apply(lambda x: "other" if x in neighborhood_category_other else x)  
    
    
    # One hot encode neighborhood cleansed and property type
    df_competition = pd.get_dummies(df_competition, columns=['property_type', "neighbourhood_cleansed"])

    
    # convert host_since to number of days hosted
    df_competition['host_since'] = (pd.Timestamp.now() - df_competition['host_since']).dt.days
    df_competition['first_review'] = (pd.Timestamp.now() - df_competition['first_review']).dt.days
    df_competition['last_review'] = (pd.Timestamp.now() - df_competition['last_review']).dt.days          

    # Split reviews into list of reviews
    review_split = df_competition["reviews"].fillna("nan").apply(lambda x: x.split("\n---------------------------------\n") if isinstance(x, str) else ["nan"])
    df_competition['sentiment_score'] = review_split.apply(lambda x: calculate_sentiment_score(x))
    
    # remove neighborhood cleansed, host verifications, bathrooms_text, amenities, name, description, reviews
    df_competition = df_competition.drop(columns=["host_verifications", "bathrooms_text", "amenities", "name", "description", "reviews"])
    
    # this will ensure we have the same columns in both training and testing data
    # make column with column set that exist in df_train_processed and not in df_competition, and fill with 0s
    columns_not_in_competition = set(df_train_processed.columns) - set(df_competition.columns)
    for column in columns_not_in_competition:
        df_competition[column] = False

    if id == True:
        df_competition["id"] = df_competition_id

        
    return df_competition


In [74]:
X_train_processed, y_train_processed = process_data_train(X_train, y_train)
print("finished processing train data")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['amenities'] = df_train['amenities'].apply(lambda x: x.replace('[', '').replace(']', '').replace('"', '').split(', '))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['luxury_score'] = luxury_results.apply(lambda x: x['luxury_score'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

finished processing train data


In [84]:
X_test_processed = process_data_competition(X_train, X_train_processed, X_test)
print("finished processing test data")

finished processing test data


In [85]:
X_competition_processed = process_data_competition(X_train, X_train_processed, X_competition)
print("finished processing competition data")

finished processing competition data


In [95]:
X_train_processed.to_csv(path + 'X_train_processed.csv', index=False)
X_test_processed.to_csv(path + 'X_test_processed.csv', index=False)
X_competition_processed.to_csv(path + 'X_competition_processed.csv', index=False)
y_train_processed.to_csv(path + 'y_train_processed.csv', index=False)
y_test.to_csv(path + 'y_test.csv', index=False)


In [28]:
# load X and X_competition from X_competition with SentimentIntensityAnalyzer
# X = pd.read_csv(path + 'X_train_with_sentiment.csv')
# X_competition = pd.read_csv(path + 'X_competition_with_sentiment.csv')


Unnamed: 0,latitude,longitude,host_since,host_response_rate,host_acceptance_rate,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,calculated_host_listings_count,...,property_type_Room in aparthotel,property_type_Room in boutique hotel,property_type_Room in hotel,property_type_Shared room in home,property_type_Shared room in rental unit,property_type_other,sentiment_score,property_type_Private room,property_type_Private room in resort,id
0,40.744620,-73.904520,4210,,,1.0,12.0,True,True,1,...,False,False,False,False,False,False,0.92360,False,False,3917
1,40.753407,-73.934995,4479,99.0,23.0,727.0,1336.0,True,True,719,...,False,False,False,False,False,False,,False,False,1885
2,40.677090,-73.943810,4568,,,1.0,1.0,True,True,1,...,False,False,False,False,False,False,0.93820,False,False,1305
3,40.795760,-73.971570,1320,70.0,37.0,36.0,79.0,True,True,36,...,False,False,False,False,False,False,0.93265,False,False,19328
4,40.713590,-73.955400,3692,100.0,75.0,1.0,1.0,True,True,1,...,False,False,False,False,False,False,0.93195,False,False,16511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6722,40.637960,-73.951360,3352,100.0,100.0,2.0,2.0,True,True,2,...,False,False,False,False,False,False,0.89320,False,False,7205
6723,40.823720,-73.945460,3165,0.0,33.0,7.0,8.0,True,False,7,...,False,False,False,False,False,False,0.92130,False,False,3954
6724,40.755094,-73.937260,4479,99.0,23.0,727.0,1336.0,True,True,719,...,False,False,False,False,False,False,,False,False,1358
6725,40.781580,-73.984780,4691,100.0,100.0,1.0,3.0,True,True,1,...,False,False,False,False,False,False,0.88420,False,False,2793


In [47]:
# load X and X_competition from X_competition with SentimentIntensityAnalyzer
X_sentiment = pd.read_csv(path + 'X_train_with_sentiment.csv')
X_competition_sentiment = pd.read_csv(path + 'X_competition_with_sentiment.csv')

sentiment_scores = X_sentiment["sentiment_score"]
sentiment_scores_competition = X_competition_sentiment["sentiment_score"]

