# Tutorial 1 - AIRBNB - CORE STEPS

**Our unit of analysis is an AIRBNB LISTING**

We will see how we can transform the input variables. We won't do any predictions in this notebook!

# Setup

In [None]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random number generator so that steps relying on it
# give the same results every time the notebook is run from top to bottom.
np.random.seed(42)


# Get the data

In [None]:
# Load the Airbnb data (our unit of analysis is one listing).
# This notebook only prepares the input variables; the candidate target columns
# (price, price_gte_150, price_category) are set aside further below.

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

In [None]:
# Dimensions of the data set as (number of rows, number of columns)

airbnb.shape

In [None]:
# Count the missing values in each column

airbnb.isna().sum()

### Should we remove the rows with missing values or not?

In [None]:
# If we want to remove them, use the following code
# (it drops every row that has at least one missing value):

# airbnb = airbnb.dropna(axis=0)

# Split data (train/test)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. random_state is pinned so that
# re-running this cell on its own always produces the same split, instead of
# depending on how far the global NumPy random state has advanced.
train, test = train_test_split(airbnb, test_size=0.3, random_state=42)

# Sanity check: the two sets together account for every row.
assert len(train) + len(test) == len(airbnb)

In [None]:
# (number of rows, number of columns) of the training set
train.shape

In [None]:
# (number of rows, number of columns) of the test set
test.shape

In [None]:
# First rows of the training set
train.head()

In [None]:
# First rows of the test set
test.head()

# Prepare the data

In [None]:
# Descriptive statistics of the numerical variables in the training set

train.describe()

In [None]:
# Total missing values in each column of the training set

train.isna().sum()

## Separate the POTENTIAL target columns. Separate numerical and categorical inputs

In [None]:
# Names of the columns in each group
target_names = ['price', 'price_gte_150', 'price_category']

numeric_names = [
    'latitude', 'longitude', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'Number of amenities',
    'guests_included', 'price_per_extra_person', 'minimum_nights',
    'number_of_reviews', 'number_days_btw_first_last_review',
    'review_scores_rating',
]

binary_names = ['host_is_superhost', 'host_identity_verified']

categorical_names = [
    'neighbourhood_cleansed', 'property_type',
    'room_type', 'bed_type', 'cancellation_policy',
]

# Split the training set into the potential targets and the three kinds of inputs
train_targets = train[target_names]
train_numeric_columns = train[numeric_names]
train_binary_columns = train[binary_names]
train_categorical_columns = train[categorical_names]

In [None]:
# First rows of the numerical inputs
train_numeric_columns.head()

In [None]:
# First rows of the binary inputs
train_binary_columns.head()

In [None]:
# First rows of the categorical inputs
train_categorical_columns.head()

## Process the numerical variables

### Imputation 

In [None]:
from sklearn.impute import SimpleImputer

# Missing numerical values will be replaced by the median of their column
imputer = SimpleImputer(strategy="median")

In [None]:
# Fit the imputer on the training data and fill in its missing values
train_numeric_columns_imputed = imputer.fit_transform(train_numeric_columns)

In [None]:
# Inspect the imputed values (converted back to a DataFrame further below)
train_numeric_columns_imputed

### Standardize the values


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the imputed training data ...
scaler = StandardScaler()
scaler.fit(train_numeric_columns_imputed)

# ... then standardize the training data with the fitted scaler
train_numeric_columns_std = scaler.transform(train_numeric_columns_imputed)

train_numeric_columns_std

### Convert back to Pandas

In [None]:
# Wrap the standardized values in a DataFrame, restoring the original column names
train_numeric_columns_std_df = pd.DataFrame(
    data=train_numeric_columns_std,
    columns=train_numeric_columns.columns,
)
train_numeric_columns_std_df = train_numeric_columns_std_df.reset_index(drop=True)

train_numeric_columns_std_df.head()

In [None]:
# Count the missing values per column after imputation and scaling
train_numeric_columns_std_df.isna().sum()

## Process the categorical variables

In [None]:
# Find the total number of missing values in each categorical column
train_categorical_columns.isna().sum()

In [None]:
# Number of training rows for each property type
train_categorical_columns['property_type'].value_counts()

In [None]:
# Show the rows that have a missing value in at least one categorical column
rows_with_missing = train_categorical_columns.isnull().any(axis=1)
train_categorical_columns[rows_with_missing]

In [None]:
# Replace missing categorical (text) values with the constant label 'UNKNOWN'

categorical_imputer = SimpleImputer(strategy="constant", fill_value='UNKNOWN')

# Fit the imputer on the training data and fill in its missing values
train_categorical_columns_imputed = categorical_imputer.fit_transform(train_categorical_columns)

### Convert back to Pandas

In [None]:
# Wrap the imputed values in a DataFrame, restoring the original column names
train_categorical_columns_imputed_df = pd.DataFrame(
    data=train_categorical_columns_imputed,
    columns=train_categorical_columns.columns,
)
train_categorical_columns_imputed_df = train_categorical_columns_imputed_df.reset_index(drop=True)

train_categorical_columns_imputed_df.head()

In [None]:
# Number of training rows for each property type after imputation
train_categorical_columns_imputed_df['property_type'].value_counts()

### One-hot-encoding
Now let's preprocess the categorical variables using one-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': a category that never appears in the training data
# (for example a rare value that only landed in the test split) is encoded as
# all zeros at transform time instead of raising an error.
cat_encoder = OneHotEncoder(handle_unknown='ignore')

# Fit the encoder on the imputed training data and encode it
train_categorical_columns_1hot = cat_encoder.fit_transform(train_categorical_columns_imputed_df)

train_categorical_columns_1hot

By default, the `OneHotEncoder` class returns a sparse matrix, but we can convert it to a dense array if needed by calling the `toarray()` method:

In [None]:
# Dense version of the one-hot encoded training data
train_categorical_columns_1hot.toarray()

In [None]:
# The categories the encoder found for each categorical variable (one array per variable)
cat_encoder.categories_

In [None]:
# Build one name per one-hot column, in the same order as the encoder's output.
# Each name has the form "<variable>_<category>", so a category label that occurs
# in more than one variable (such as the imputed 'UNKNOWN') cannot produce
# duplicate or ambiguous column names.

onehot_column_names = [
    "{}_{}".format(column, category)
    for column, categories in zip(train_categorical_columns_imputed_df.columns,
                                  cat_encoder.categories_)
    for category in categories
]

onehot_column_names

### Convert back to Pandas

In [None]:
# Densify the encoder output and label its columns
train_onehot_dense = train_categorical_columns_1hot.toarray()
train_categorical_columns_1hot_df = pd.DataFrame(train_onehot_dense, columns=onehot_column_names)
train_categorical_columns_1hot_df = train_categorical_columns_1hot_df.reset_index(drop=True)

train_categorical_columns_1hot_df.head()

## Do not process the binary variables

## Concatenate all variables

In [None]:
# Concatenate the processed pieces column-wise into the prepared training set.
# Every piece gets a fresh 0..n-1 index first: concat aligns rows on the index,
# and without reset_index(drop=True) it adds NaN rows.

train_pieces = [
    train_numeric_columns_std_df,
    train_categorical_columns_1hot_df,
    train_binary_columns,
]
train_prepared = pd.concat([piece.reset_index(drop=True) for piece in train_pieces], axis=1)

# if you want to create a separate column for missing values, use dummy_na=True:
# pd.get_dummies(df,dummy_na=True)

train_prepared.shape

In [None]:
# First rows of the prepared training set
train_prepared.head()

# Process the Test data using "Transform" only

In [None]:
# Select the same column groups as for the training data. The column names are
# taken from the training frames rather than typed out again, so the test
# columns always match the training columns in name and order, as the fitted
# imputers, scaler and encoder expect.
test_targets = test[list(train_targets.columns)]

test_numeric_columns = test[list(train_numeric_columns.columns)]

test_binary_columns = test[list(train_binary_columns.columns)]

test_categorical_columns = test[list(train_categorical_columns.columns)]

## Process numerical variables - test

### Imputation 

In [None]:
# Transform only: reuse the imputer fitted on the training data (no refit on test)

test_numeric_columns_imputed = imputer.transform(test_numeric_columns)

In [None]:
# Inspect the imputed test values
test_numeric_columns_imputed

### Standardize the values


In [None]:
# Transform only: reuse the scaler fitted on the training data (no refit on test)
test_numeric_columns_std = scaler.transform(test_numeric_columns_imputed)

test_numeric_columns_std

### Convert back to Pandas

In [None]:
# Wrap the standardized test values in a DataFrame, restoring the column names
test_numeric_columns_std_df = pd.DataFrame(
    data=test_numeric_columns_std,
    columns=test_numeric_columns.columns,
)
test_numeric_columns_std_df = test_numeric_columns_std_df.reset_index(drop=True)

test_numeric_columns_std_df.head()

In [None]:
# Count the missing values per column after imputation and scaling
test_numeric_columns_std_df.isna().sum()

## Process the categorical variables - test

In [None]:
# Find the total number of missing values in each categorical column of the test set
test_categorical_columns.isna().sum()

In [None]:
# Transform only: fill missing categorical (text) values using the imputer
# fitted on the training data (constant label 'UNKNOWN')

test_categorical_columns_imputed = categorical_imputer.transform(test_categorical_columns)

### Convert back to Pandas

In [None]:
# Wrap the imputed test values in a DataFrame, restoring the column names
test_categorical_columns_imputed_df = pd.DataFrame(
    data=test_categorical_columns_imputed,
    columns=test_categorical_columns.columns,
)
test_categorical_columns_imputed_df = test_categorical_columns_imputed_df.reset_index(drop=True)

test_categorical_columns_imputed_df.head()

In [None]:
# Number of test rows for each property type after imputation
test_categorical_columns_imputed_df['property_type'].value_counts()

### One-hot-encoding
Now let's preprocess the categorical variables using one-hot encoding

In [None]:
# Transform only: reuse the encoder fitted on the training data (no refit on test)
test_categorical_columns_1hot = cat_encoder.transform(test_categorical_columns_imputed_df)

test_categorical_columns_1hot

By default, the `OneHotEncoder` class returns a sparse matrix, but we can convert it to a dense array if needed by calling the `toarray()` method:

In [None]:
# Dense version of the one-hot encoded test data
test_categorical_columns_1hot.toarray()

In [None]:
# The one-hot column names are unchanged: they were built from the encoder
# fitted on the training data, and the same encoder is used for the test data

onehot_column_names

### Convert back to Pandas

In [None]:
# Densify the encoder output for the test data and label its columns
test_onehot_dense = test_categorical_columns_1hot.toarray()
test_categorical_columns_1hot_df = pd.DataFrame(test_onehot_dense, columns=onehot_column_names)
test_categorical_columns_1hot_df = test_categorical_columns_1hot_df.reset_index(drop=True)

test_categorical_columns_1hot_df.head()

## Do not transform the binary variables - test

## Concatenate all variables - test

In [None]:
# Concatenate the processed pieces column-wise into the prepared test set.
# Every piece gets a fresh 0..n-1 index first: concat aligns rows on the index,
# and without reset_index(drop=True) it adds NaN rows.

test_pieces = [
    test_numeric_columns_std_df,
    test_categorical_columns_1hot_df,
    test_binary_columns,
]
test_prepared = pd.concat([piece.reset_index(drop=True) for piece in test_pieces], axis=1)

# if you want to create a separate column for missing values, use dummy_na=True:
# pd.get_dummies(df,dummy_na=True)

test_prepared.shape

In [None]:
# First rows of the prepared test set
test_prepared.head()

## What we didn't do:

Visualization<br>
Feature engineering<br>
Modeling