## Turn text columns into numeric features
**Feature Engineering**: In this notebook, we will create new features from the cleaned dataset. The following are the goals of this step:
- For the amenities column, create a bag-of-words representation.
- For the host verifications column, create a bag-of-words representation.
- For the description column, create a TF-IDF representation.
- Merge this into one dataframe

In [2]:
# data managing and display libs
import pandas as pd
import numpy as np
import os
import io

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline 

# sagemaker libraries
import boto3
import sagemaker

In [3]:
# boto3 client to get S3 data
s3_client = boto3.client('s3')
bucket_name='skuchkula-sagemaker-airbnb'

In [29]:
# list the bucket objects
response = s3_client.list_objects(Bucket=bucket_name)

# get list of objects inside the bucket
files = [file['Key'] for file in response['Contents']]
files

['clean/airbnb_clean.csv', 'detailed_listings.csv', 'summary_listings.csv']

In [4]:
airbnb_file = files[0]

In [6]:
# download the file from s3
def get_data_frame(bucket_name, file_name):
    '''
    Takes the location of the dataset on S3 and returns a dataframe.
    arguments:
            bucket_name: the name of the bucket
            file_name: the key inside the bucket
    returns:
            dataframe
    '''
    # get an S3 object by passing in the bucket and file name
    data_object = s3_client.get_object(Bucket=bucket_name, Key=file_name)
    
    # information is in the "Body" of the object
    data_body = data_object["Body"].read()
    
    # read in bytes data
    data_stream = io.BytesIO(data_body)
    
    # create a dataframe
    df = pd.read_csv(data_stream, header=0, delimiter=",", low_memory=False, keep_default_na=False)
    
    return df

In [6]:
df_airbnb = get_data_frame(bucket_name, airbnb_file)

In [7]:
df_airbnb.shape

(45605, 67)

### Create features from amenities values

In [8]:
df_airbnb.amenities.head()

0    {TV,Wifi,"Air conditioning",Kitchen,"Paid park...
1    {"Cable TV",Internet,Wifi,"Air conditioning",K...
2    {Internet,Wifi,"Air conditioning",Kitchen,Elev...
3    {TV,"Cable TV",Internet,Wifi,Kitchen,"Buzzer/w...
4    {Wifi,"Air conditioning",Kitchen,"Pets live on...
Name: amenities, dtype: object

In [9]:
# remove the curly brackets
df_airbnb['amenities'] =  df_airbnb['amenities'].apply(lambda x: x[1:-1])

In [10]:
df_airbnb.amenities.head()

0    TV,Wifi,"Air conditioning",Kitchen,"Paid parki...
1    "Cable TV",Internet,Wifi,"Air conditioning",Ki...
2    Internet,Wifi,"Air conditioning",Kitchen,Eleva...
3    TV,"Cable TV",Internet,Wifi,Kitchen,"Buzzer/wi...
4    Wifi,"Air conditioning",Kitchen,"Pets live on ...
Name: amenities, dtype: object

In [11]:
amenities = df_airbnb.amenities

In [12]:
# create a dictionary of terms
amenities_idx ={}
idx = 0
corpus = []
for i in range(len(amenities)):
    items = amenities[i]
    items_lower = items.lower()
    tokens = items_lower.split(',')
    corpus.append(tokens)
    for token in tokens:
        if token not in amenities_idx:
            amenities_idx[token] = idx
            idx += 1

In [13]:
print("Total number of terms in the corpus: ", len(amenities_idx))
print("Total number of documents in the corpus: ", len(corpus))

Total number of terms in the corpus:  131
Total number of documents in the corpus:  45605


In [14]:
# Get the number of items and tokens 
M = len(amenities)
N = len(amenities_idx)

# Initialize a matrix of zeros
A = np.zeros((M, N))

In [15]:
# Define the amenity_encoder function
def amenity_encoder(tokens):
    x = np.zeros(N)
    for token in tokens:
        # Get the index for each amenity
        idx = amenities_idx[token]
        # Put 1 at the corresponding indices
        x[idx] = 1
    return x

In [16]:
# Make a document-term matrix
i = 0
for tokens in corpus:
    A[i, :] = amenity_encoder(tokens)
    i = i + 1

In [17]:
A.shape

(45605, 131)

In [18]:
type(A)

numpy.ndarray

In [65]:
amenities_features = pd.DataFrame(A, columns=list(amenities_idx.keys()))
amenities_features.head()

Unnamed: 0,tv,wifi,"""air conditioning""",kitchen,"""paid parking off premises""","""free street parking""","""indoor fireplace""",heating,"""family/kid friendly""","""smoke detector""",...,"""lake access""","""pool with pool hoist""","""full kitchen""","""electric profiling bed""","""ground floor access""","""air purifier""","""mobile hoist""",kitchenette,"""fixed grab bars for shower","""ceiling hoist"""
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
# create a csv file and store it in S3
amenities_features.to_csv('amenities_features.csv', index=False)

In [74]:
# upload it to S3
s3_client.upload_file(Bucket=bucket_name, 
                      Filename='amenities_features.csv', 
                      Key='feature_eng/amenities_features.csv')

## Create features from host_verifications

In [20]:
df_airbnb.host_verifications[:10]

0    ['email', 'phone', 'reviews', 'kba', 'work_ema...
1    ['email', 'phone', 'google', 'reviews', 'jumio...
2     ['email', 'phone', 'facebook', 'reviews', 'kba']
3    ['email', 'phone', 'reviews', 'jumio', 'govern...
4    ['email', 'phone', 'facebook', 'reviews', 'off...
5            ['email', 'phone', 'facebook', 'reviews']
6    ['email', 'phone', 'facebook', 'google', 'revi...
7                 ['email', 'phone', 'reviews', 'kba']
8    ['email', 'phone', 'manual_online', 'reviews',...
9    ['email', 'phone', 'reviews', 'jumio', 'govern...
Name: host_verifications, dtype: object

In [21]:
import re
re.findall(r'\w+', df_airbnb.host_verifications[0])

['email', 'phone', 'reviews', 'kba', 'work_email']

In [22]:
df_airbnb.loc[:, 'host_verifications'] = df_airbnb.host_verifications.apply(lambda x: re.findall(r'\w+', x))

In [23]:
verifications = df_airbnb.host_verifications

In [24]:
verification_idx = {}
idx = 0
corpus = []
for i in range(len(verifications)):
    items = verifications[i]
    corpus.append(items)
    for item in items:
        if item not in verification_idx:
            verification_idx[item] = idx
            idx += 1

In [25]:
verification_idx

{'email': 0,
 'phone': 1,
 'reviews': 2,
 'kba': 3,
 'work_email': 4,
 'google': 5,
 'jumio': 6,
 'government_id': 7,
 'facebook': 8,
 'offline_government_id': 9,
 'selfie': 10,
 'identity_manual': 11,
 'manual_online': 12,
 'sent_id': 13,
 'manual_offline': 14,
 'None': 15,
 'weibo': 16,
 'sesame': 17,
 'sesame_offline': 18,
 'zhima_selfie': 19}

In [26]:
print("Total number of terms in the corpus: ", len(verification_idx))
print("Total number of documents in the corpus: ", len(corpus))

Total number of terms in the corpus:  20
Total number of documents in the corpus:  45605


In [27]:
# Get the number of items and tokens 
M = len(verifications)
N = len(verification_idx)

# Initialize a matrix of zeros
B = np.zeros((M, N))

In [28]:
# Define the verification_encoder function
def verification_encoder(tokens):
    x = np.zeros(N)
    for token in tokens:
        # Get the index for each verification
        idx = verification_idx[token]
        # Put 1 at the corresponding indices
        x[idx] = 1
    return x

In [29]:
# Make a document-term matrix
i = 0
for tokens in corpus:
    B[i, :] = verification_encoder(tokens)
    i = i + 1

In [30]:
B.shape

(45605, 20)

In [59]:
type(B)

numpy.ndarray

In [61]:
list(verification_idx.keys())

['email',
 'phone',
 'reviews',
 'kba',
 'work_email',
 'google',
 'jumio',
 'government_id',
 'facebook',
 'offline_government_id',
 'selfie',
 'identity_manual',
 'manual_online',
 'sent_id',
 'manual_offline',
 'None',
 'weibo',
 'sesame',
 'sesame_offline',
 'zhima_selfie']

In [62]:
verification_features = pd.DataFrame(B, columns=list(verification_idx.keys()))

In [None]:
verification_features.columns = verification_features.add_prefix('host_verification_by_').columns

In [63]:
verification_features.shape

(45605, 20)

In [72]:
verification_features.head()

Unnamed: 0,host_verification_by_email,host_verification_by_phone,host_verification_by_reviews,host_verification_by_kba,host_verification_by_work_email,host_verification_by_google,host_verification_by_jumio,host_verification_by_government_id,host_verification_by_facebook,host_verification_by_offline_government_id,host_verification_by_selfie,host_verification_by_identity_manual,host_verification_by_manual_online,host_verification_by_sent_id,host_verification_by_manual_offline,host_verification_by_None,host_verification_by_weibo,host_verification_by_sesame,host_verification_by_sesame_offline,host_verification_by_zhima_selfie
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# create a csv file and store it in S3
verification_features.to_csv('host_verification_features.csv', index=False)

In [76]:
# upload it to S3
s3_client.upload_file(Bucket=bucket_name, 
                      Filename='host_verification_features.csv', 
                      Key='feature_eng/host_verification_features.csv')

## Use a CountVectorizer for all the text columns

In [8]:
# Set the display properties so that we can inspect the data
pd.set_option("display.max_colwidth", 1000)

In [9]:
TEXT_COLUMNS = ['description', 'summary']
df_airbnb[TEXT_COLUMNS].head()

Unnamed: 0,description,summary
0,"Find your romantic getaway to this beautiful, spacious skylit studio in the heart of Midtown, Manhattan. STUNNING SKYLIT STUDIO / 1 BED + SINGLE / FULL BATH / FULL KITCHEN / FIREPLACE / CENTRALLY LOCATED / WiFi + APPLE TV / SHEETS + TOWELS - Spacious (500+ft²), immaculate and nicely furnished & designed studio. - Tuck yourself into the ultra comfortable bed under the skylight. Fall in love with a myriad of bright lights in the city night sky. - Single-sized bed/convertible floor mattress with luxury bedding (available upon request). - Gorgeous pyramid skylight with amazing diffused natural light, stunning architectural details, soaring high vaulted ceilings, exposed brick, wood burning fireplace, floor seating area with natural zafu cushions, modern style mixed with eclectic art & antique treasures, large full bath, newly renovated kitchen, air conditioning/heat, high speed WiFi Internet, and Apple TV. - Centrally located in the heart of Midtown Manhattan just a few blocks from a...","Find your romantic getaway to this beautiful, spacious skylit studio in the heart of Midtown, Manhattan. STUNNING SKYLIT STUDIO / 1 BED + SINGLE / FULL BATH / FULL KITCHEN / FIREPLACE / CENTRALLY LOCATED / WiFi + APPLE TV / SHEETS + TOWELS"
1,WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY This Spacious 1 bedroom is with Plenty of Windows with a View....... Sleeps.....Four Adults.....two in the Livingrm. with (2) Sofa-beds. (Website hidden by Airbnb) two in the Bedrm.on a very Comfortable Queen Size Bed... A Complete Bathrm.....With Shower and Bathtub....... Fully Equipped with Linens & Towels........ Spacious Living Room......Flat ScreenTelevision.....DVD Player with Movies available for your viewing during your stay............................................................................. Dining Area.....for Morning Coffee or Tea..................................................... The Kitchen Area is Modern with Granite Counter Top... includes the use of a Coffee Maker...Microwave to Heat up a Carry Out/In Meal.... Not suited for a Gourmet Cook...or Top Chef......Sorry!!!! . This Flat is located in HISTORIC HARLEM.... near the Appollo Theater and The Museum Mile...on Fifth Avenue. Sylvia's World Famous Resturant......,
2,"Loft apartment with high ceiling and wood flooring located 10 minutes away from Central Park in Harlem - 1 block away from 6 train and 3 blocks from 2 & 3 line. This is in a recently renovated building which includes elevator, trash shoot. marble entrance and laundromat in the basement. The apartment is a spacious loft studio. The seating area and sleeping area is divided by a bookcase. There is a long hallway entrance where the bathroom and closet for your clothes is situated. The apartment is in mint condition, the walls have been freshly painted a few months ago. Supermarket, and 24 hour convenience store less than 1 block away. 1 block away from Hot Yoga Studio and NY Sports club facility. Perfect for anyone wanting to stay in Manhattan but get more space. 10 minutes away from midtown and 15 minutes away from downtown. The neighborhood is lively and diverse. You will need to travel at least 10 blocks to find cafe's, restaurants etc.. There are a few restaurants on 100 stree...",
3,"My large 1 bedroom apartment is true New York City living. The apt is in midtown on the east side and centrally located, just a 10-minute walk from Grand Central Station, Empire State Building, Times Square. The kitchen and living room are large and bright with Apple TV. I have a new Queen Bed that sleeps 2 people, and a Queen Aero Bed that can sleep 2 people in the living room. The apartment is located on the 5th floor of a walk up - no elevator (lift). I have a large 1 bedroom apartment centrally located in Midtown East. A 10 minute walk from Grand Central Station, Times Square, Empire State Building and all major subway and bus lines. The apartment is located on the 5th floor of a pre-war walk up building-no elevator/lift. The apartment is bright with has high ceilings and flow through rooms. A spacious, cozy living room with Netflix and Apple TV. A large bright kitchen to sit and enjoy coffee or tea. The bedroom is spacious with a comfortable queen size bed that sleeps 2. ...","My large 1 bedroom apartment is true New York City living. The apt is in midtown on the east side and centrally located, just a 10-minute walk from Grand Central Station, Empire State Building, Times Square. The kitchen and living room are large and bright with Apple TV. I have a new Queen Bed that sleeps 2 people, and a Queen Aero Bed that can sleep 2 people in the living room. The apartment is located on the 5th floor of a walk up - no elevator (lift)."
4,"HELLO EVERYONE AND THANKS FOR VISITING BLISS ART SPACE! Thank you all for your support. I've traveled a lot in the last year few years, to the U.K. Germany, Italy and France! Loved Paris, Berlin and Calabria! Highly recommend all these places. One room available for rent in a 2 bedroom apt in Bklyn. We share a common space with kitchen. I am an artist(painter, filmmaker) and curator who is working in the film industry while I'm building my art event production businesses. Price above is nightly for one person. Monthly rates available. Price is $900 per month for one person. Utilities not included, they are about 50 bucks, payable when the bill arrives mid month. Couples rates are slightly more for monthly and 90$ per night short term. If you are a couple please Iet me know and I’ll give you the monthly rate for that. Room rental is on a temporary basis, perfect from 2- 6 months - no long term requests please! At the moment I AM ONLY TAKING BOOKINGS OF AT LEAST ONE OR ONE AND ...",


In [12]:
# use descriptions column
descriptions = list(df_airbnb.description)

# send this list of descriptions through my nlp pipeline
clean_descriptions = nlp_pipeline(descriptions)

In [15]:
clean_descriptions = None
df_airbnb = None

In [13]:
# rejoin the tokens to form strings which will be used to vectorize
clean_descriptions_text = [' '.join(item) for item in clean_descriptions]

clean_descriptions_text[:10]

['find romantic getaway beautiful spacious studio heart manhattan stun studio single full bath full kitchen fireplace centrally locate wifi apple sheet towel spacious immaculate nicely furnish design studio tuck ultra comfortable skylight fall love myriad bright light city night single size floor mattress luxury bedding available request gorgeous pyramid skylight amaze diffuse natural light stun architectural detail soaring high vault ceiling expose brick wood burning fireplace floor seating area natural cushion modern style eclectic antique treasure large full bath newly renovate kitchen high speed wifi internet apple centrally locate heart manhattan block',
 'welcome international urban community spacious plenty window view sofa website hide comfortable queen size complete shower bathtub fully equip linen towel spacious living player movie available viewing stay dining morning coffee kitchen area modern granite counter include coffee heat carry meal suit gourmet flat locate historic 

## Vectorize the corpus

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=0.95, max_features=2000,
                                   ngram_range=(1,1), stop_words='english')

tfidf_feature_matrix = tfidf_vectorizer.fit_transform(clean_descriptions_text)

tfidf_feature_matrix.shape

(45605, 2000)

In [17]:
type(tfidf_feature_matrix)

scipy.sparse.csr.csr_matrix

In [18]:
display(tfidf_vectorizer.get_feature_names()[:10])
display(tfidf_vectorizer.get_feature_names()[-10:])

['able',
 'abode',
 'abound',
 'absolute',
 'absolutely',
 'abundance',
 'abundant',
 'academy',
 'accent',
 'accept']

['yankee',
 'yard',
 'year',
 'yellow',
 'yoga',
 'york',
 'young',
 'yummy',
 'zero',
 'zone']

In [21]:
# create a dataframe from feature matrix
feature_matrix_df = pd.DataFrame(tfidf_feature_matrix.toarray(), 
                                 columns=tfidf_vectorizer.get_feature_names())

feature_matrix_df.head()

Unnamed: 0,able,abode,abound,absolute,absolutely,abundance,abundant,academy,accent,accept,...,yankee,yard,year,yellow,yoga,york,young,yummy,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137645,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.074083,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.18384,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
feature_matrix_df.shape

(45605, 2000)

In [24]:
# create a csv file and store it in S3
feature_matrix_df.to_csv('description_features.csv', index=False)

In [25]:
# upload it to S3
s3_client.upload_file(Bucket=bucket_name, 
                      Filename='description_features.csv', 
                      Key='feature_eng/description_features.csv')

## Merge all the dataframes

In [4]:
# list the bucket objects
response = s3_client.list_objects(Bucket=bucket_name)

# get list of objects inside the bucket
files = [file['Key'] for file in response['Contents']]
files

['clean/airbnb_clean.csv',
 'detailed_listings.csv',
 'feature_eng/amenities_features.csv',
 'feature_eng/description_features.csv',
 'feature_eng/host_verification_features.csv',
 'summary_listings.csv']

In [None]:
amenities_df = get_data_frame(bucket_name, 'feature_eng/amenities_features.csv')
host_verification_df = get_data_frame(bucket_name, 'feature_eng/host_verification_features.csv')
description_df = get_data_frame(bucket_name, 'feature_eng/description_features.csv')

In [None]:
merged_df = pd.concat([amenities_df, host_verification_df, description_df], axis='columns')

## Appendix

In [119]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [11]:
import re
import pandas as pd
import numpy as np
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

# define some common lingo
custom_stopwords = ['bedroom', 'bathroom', 'apartment']

def remove_hypens(book_text):
    return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', book_text)

# tokenize text
def tokenize_text(book_text):
    TOKEN_PATTERN = r'\s+'
    regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=True)
    word_tokens = regex_wt.tokenize(book_text)
    return word_tokens

def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation))) 
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens]) 
    return filtered_tokens

def convert_to_lowercase(tokens):
    return [token.lower() for token in tokens if token.isalpha()]

def remove_stopwords(tokens, custom_stopwords):
    stopword_list = nltk.corpus.stopwords.words('english')
    stopword_list += custom_stopwords
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens

def get_lemma(tokens):
    lemmas = []
    for word in tokens:
        lemma = wn.morphy(word)
        if lemma is None:
            lemmas.append(word)
        else:
            lemmas.append(lemma)
    return lemmas

def remove_short_tokens(tokens):
    return [token for token in tokens if len(token) > 3]

def keep_only_words_in_wordnet(tokens):
    return [token for token in tokens if wn.synsets(token)]

def apply_lemmatize(tokens, wnl=WordNetLemmatizer()):
    return [wnl.lemmatize(token) for token in tokens]

# I like to think of each row of text as a book
# input to this function is a list of books
def nlp_pipeline(book_texts):
    clean_books = []
    for book in book_texts:
        book = remove_hypens(book)
        book_i = tokenize_text(book)
        book_i = remove_characters_after_tokenization(book_i)
        book_i = convert_to_lowercase(book_i)
        book_i = remove_stopwords(book_i, custom_stopwords)
        book_i = get_lemma(book_i)
        book_i = remove_short_tokens(book_i)
        book_i = keep_only_words_in_wordnet(book_i)
        book_i = apply_lemmatize(book_i)
        clean_books.append(book_i)
    return clean_books