In [1]:
# data managing and display libs
import pandas as pd
import numpy as np
import os
import io

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline 

# sagemaker libraries
import boto3

In [20]:
# Load the cleaned Airbnb listings from the working directory
df_airbnb = pd.read_csv(filepath_or_buffer='airbnb_clean.csv')

In [21]:
# Columns to discard: every object-dtype column plus the id / coordinate columns
drop_cols = [
    *df_airbnb.select_dtypes(include=['O']).columns,
    'id',
    'latitude',
    'longitude',
]
drop_cols

['name',
 'summary',
 'description',
 'host_verifications',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'amenities',
 'id',
 'latitude',
 'longitude']

In [22]:
# Remove the columns listed in drop_cols by reassignment rather than inplace=True.
# errors='ignore' keeps this cell safe to re-run: columns already removed by an
# earlier execution are skipped instead of raising KeyError.
df_airbnb = df_airbnb.drop(columns=drop_cols, errors='ignore')

In [23]:
# Cast every remaining column to 64-bit floats
df_airbnb = df_airbnb.astype(np.float64)

In [24]:
# Print the per-column summary (non-null count and dtype) for every column;
# verbose=True requests the full column listing.
df_airbnb.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45605 entries, 0 to 45604
Data columns (total 57 columns):
host_listings_count                                45605 non-null float64
host_total_listings_count                          45605 non-null float64
accommodates                                       45605 non-null float64
bathrooms                                          45605 non-null float64
bedrooms                                           45605 non-null float64
beds                                               45605 non-null float64
price                                              45605 non-null float64
guests_included                                    45605 non-null float64
extra_people                                       45605 non-null float64
minimum_nights                                     45605 non-null float64
maximum_nights                                     45605 non-null float64
availability_30                                    45605 non-null float64
ava

In [25]:
# Preview the first rows of the all-float listings frame
df_airbnb.head()

Unnamed: 0,host_listings_count,host_total_listings_count,accommodates,bathrooms,bedrooms,beds,price,guests_included,extra_people,minimum_nights,...,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,require_guest_profile_picture_f,require_guest_profile_picture_t,require_guest_phone_verification_f,require_guest_phone_verification_t
0,5.0,5.0,2.0,1.0,0.0,1.0,225.0,2.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.0,1.0,2.0,1.0,1.0,1.0,150.0,2.0,20.0,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,80.0,1.0,20.0,10.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,1.0,1.0,2.0,1.0,1.0,1.0,200.0,2.0,100.0,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.0,1.0,2.0,1.0,1.0,1.0,60.0,1.0,30.0,45.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


## Get the merged_df

In [26]:
# Load the previously saved merged_df feature table from the working directory
merged_df = pd.read_csv(filepath_or_buffer='merged_df.csv')

In [27]:
# Preview the first rows of merged_df
merged_df.head()

Unnamed: 0,amenities_tv,amenities_wifi,amenities_air_conditioning,amenities_kitchen,amenities_paid_parking_off_premises,amenities_free_street_parking,amenities_indoor_fireplace,amenities_heating,amenities_family/kid_friendly,amenities_smoke_detector,...,description_contains_yankee,description_contains_yard,description_contains_year,description_contains_yellow,description_contains_yoga,description_contains_york,description_contains_young,description_contains_yummy,description_contains_zero,description_contains_zone
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.137645,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.074083,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.18384,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Concatenate both

In [28]:
# Column-wise concat aligns rows on the index. If the two frames' indexes
# differ, pandas silently adds NaN-filled rows, so fail loudly here instead of
# passing misaligned data on to scaling and upload.
assert df_airbnb.index.equals(merged_df.index), (
    "df_airbnb and merged_df have different row indexes; "
    "concatenating them column-wise would misalign rows"
)
final_df = pd.concat([df_airbnb, merged_df], axis='columns')

In [29]:
# (rows, columns) of the concatenated frame
final_df.shape

(45605, 2208)

In [30]:
# Preview the first rows of the concatenated frame
final_df.head()

Unnamed: 0,host_listings_count,host_total_listings_count,accommodates,bathrooms,bedrooms,beds,price,guests_included,extra_people,minimum_nights,...,description_contains_yankee,description_contains_yard,description_contains_year,description_contains_yellow,description_contains_yoga,description_contains_york,description_contains_young,description_contains_yummy,description_contains_zero,description_contains_zone
0,5.0,5.0,2.0,1.0,0.0,1.0,225.0,2.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,2.0,1.0,1.0,1.0,150.0,2.0,20.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,80.0,1.0,20.0,10.0,...,0.0,0.0,0.0,0.0,0.137645,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,2.0,1.0,1.0,1.0,200.0,2.0,100.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.074083,0.0,0.0,0.0,0.0
4,1.0,1.0,2.0,1.0,1.0,1.0,60.0,1.0,30.0,45.0,...,0.0,0.0,0.18384,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Use RobustScaler
Since the features have different units of measurement, we need to scale them. The MinMaxScaler is susceptible to outliers, and we have seen that price has many outliers, so for that reason we will go with RobustScaler.

In [31]:
from sklearn.preprocessing import RobustScaler

In [32]:
# RobustScaler constructed with its default settings (no arguments passed)
scaler = RobustScaler()

In [33]:
# Fit the scaler on the full feature frame, then scale that same frame
scaled_final_df = scaler.fit(final_df).transform(final_df)

In [36]:
# Re-wrap the scaled values in a DataFrame that carries final_df's column names
scaled_final_df = pd.DataFrame(
    data=scaled_final_df,
    columns=final_df.columns,
)

In [37]:
# Preview the first rows of the scaled frame
scaled_final_df.head()

Unnamed: 0,host_listings_count,host_total_listings_count,accommodates,bathrooms,bedrooms,beds,price,guests_included,extra_people,minimum_nights,...,description_contains_yankee,description_contains_yard,description_contains_year,description_contains_yellow,description_contains_yoga,description_contains_york,description_contains_young,description_contains_yummy,description_contains_zero,description_contains_zone
0,4.0,4.0,0.0,0.0,-1.0,0.0,1.157407,1.0,0.0,-0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.462963,1.0,0.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,-0.5,0.0,0.0,0.0,-0.185185,0.0,0.8,1.75,...,0.0,0.0,0.0,0.0,0.137645,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.925926,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.074083,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,-0.37037,0.0,1.2,10.5,...,0.0,0.0,0.18384,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# (rows, columns) of the scaled frame
scaled_final_df.shape

(45605, 2208)

In [39]:
# Write the scaled features to a local CSV, leaving out the row index
scaled_final_df.to_csv(path_or_buf='scaled_final_df.csv', index=False)

In [40]:
# Target bucket and the boto3 S3 client used to talk to it
bucket_name = 'skuchkula-sagemaker-airbnb'
s3_client = boto3.client(service_name='s3')

In [41]:
# Push the local CSV to the bucket under the feature_eng/ prefix
s3_client.upload_file(
    Filename='scaled_final_df.csv',
    Bucket=bucket_name,
    Key='feature_eng/scaled_final_df.csv',
)