## Data Preparation and Feature Engineering

### 1. Importing the libraries

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
import os
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import spacy
from scipy import sparse

### 2. Loading the data

In [3]:
dataset= pd.read_json('train.json')
dataset.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,low
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,high
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,low
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,low


In [4]:
dataset['interest_level'].value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

In [5]:
###Checking the percentages of points belonging to the three classes present in the dataset
dataset['interest_level'].value_counts()/ len(dataset['interest_level'])

low       0.694683
medium    0.227529
high      0.077788
Name: interest_level, dtype: float64

### 3. Splitting the data into train and test

In [6]:
train, test= train_test_split(dataset, test_size=.3)

In [7]:
###Checking the no of points belonging to each of the classes in the train dataset
train['interest_level'].value_counts()

low       24035
medium     7879
high       2632
Name: interest_level, dtype: int64

In [8]:
###Checking the percentage of points given to each of the classes for the train dataset
train['interest_level'].value_counts()/len(train)

low       0.695739
medium    0.228073
high      0.076188
Name: interest_level, dtype: float64

In [9]:
###Checking the no of points belonging to each of the classes in the test dataset
test['interest_level'].value_counts()

low       10249
medium     3350
high       1207
Name: interest_level, dtype: int64

In [10]:
###Checking the percentage of points given to each of the classes in the test dataset
test['interest_level'].value_counts()/len(test)

low       0.692219
medium    0.226260
high      0.081521
Name: interest_level, dtype: float64

### 3. Data preparation

#### Engineered features for created date

In [11]:
##Utility function
def get_days_months_year(dt):
    try:
        return pd.Series([datetime.strptime(dt, '%Y-%m-%d %H:%M:%S').day,datetime.strptime(dt, '%Y-%m-%d %H:%M:%S').month,\
                          datetime.strptime(dt, '%Y-%m-%d %H:%M:%S').year])
    except ValueError:
        return pd.Series([0,0,0])

Extracting month, day and year from the date feature

In [12]:
##Day, Month and year feature for the train dataset
train[['day', 'month', 'year']] = train['created'].apply(get_days_months_year)

In [13]:
##Day, Month and year feature for the test dataset
test[['day', 'month', 'year']] = test['created'].apply(get_days_months_year)

#### Utility function for response coding

In [14]:
def create_response_dict(dataframe, feature, class_):
    data= dataframe[[feature, class_]]
    dict_= {}
    for i in data[feature].unique():
        vals= []
        for j in data[class_].unique():
            vals.append((len(data[(data[feature] == i) & (data[class_] ==j )]))/ len(data[data[feature] == i]))
        dict_[i] = [k * len(data[data[feature] == i]) for k in vals]
    return dict_

In [15]:
def transform_response_coding(feature, dict_):
    if feature in dict_.keys():
        return pd.Series(dict_[feature])
    else:
        return pd.Series([1/3, 1/3,1/3])

#### Performing response coding for building id

In [16]:
start= datetime.now()
dict_building_id= create_response_dict(dataframe=train, feature= 'building_id', class_='interest_level')
print('Time taken {}'.format(datetime.now() - start))

Time taken 0:03:26.345983


In [17]:
##Response coding for the train dataset
train[['building_id_0','building_id_1','building_id_2']] = train['building_id'].apply(transform_response_coding, dict_= dict_building_id)

In [18]:
##Response coding for the test dataset
test[['building_id_0','building_id_1','building_id_2']] = test['building_id'].apply(transform_response_coding, dict_= dict_building_id)

#### Performing response coding for manager id

In [19]:
start= datetime.now()
dict_manager_id= create_response_dict(dataframe=train, feature= 'manager_id', class_='interest_level')
print('Time taken {}'.format(datetime.now() - start))

Time taken 0:01:43.400994


In [20]:
train[['manager_id_0','manager_id_1','manager_id_2']] = train['manager_id'].apply(transform_response_coding, dict_= dict_manager_id)

In [21]:
test[['manager_id_0','manager_id_1','manager_id_2']] = test['manager_id'].apply(transform_response_coding, dict_= dict_manager_id)

#### Utility function for text cleaning

In [22]:
stop_eng= stopwords.words('english')
stemmr= SnowballStemmer('english')
def clean_data(text, stop_and_stem=True, just_alpha= False, is_list= False,):
    ## joining a list
    if is_list:
        text= ' '.join(text)
    
    ## Removing the html tags
    soup = BeautifulSoup(text)
    text= soup.get_text()
    
    ##Removing the special characters
    if just_alpha:
        text = re.sub('[^a-zA-Z ]',' ', text )
    else:
        text = re.sub('[^a-zA-Z0-9 ]',' ', text )
    
    ##Lowercasing, stemming and removing the stopwords
    text= word_tokenize(text)
    
    
    if stop_and_stem == True:
        text= ' '.join([stemmr.stem(i.lower()) for i in text if i not in stop_eng])
    
    else:
        text= ' '.join(list(map(lambda x:x.lower(),text)))
        
    return text

#### One hot encoding for features variable

In [23]:
count_vect_feat= CountVectorizer(min_df=3)
##Train
count_vect_features_train= count_vect_feat.fit_transform(train['features'].\
                                                   apply(clean_data, stop_and_stem= False, just_alpha=True, is_list=True))

In [24]:
##Test
count_vect_features_test = count_vect_feat.transform(test['features'].\
                                                   apply(clean_data, stop_and_stem= False, just_alpha=True, is_list=True))

In [25]:
len(count_vect_feat.vocabulary_)

417

#### Adding a new featue as - Length of the feature column

In [26]:
##Train
train['len_features']= train['features'].apply(lambda x: len(x))

In [27]:
##Test
test['len_features']= test['features'].apply(lambda x: len(x))

#### One hot encoding the Display address

In [28]:
train['display_address'][:10].apply(clean_data, stop_and_stem= False, just_alpha=False, is_list=False)

16285                         w 151 street
8514                          worth street
43025                     south 4th street
33847    east 29th street lexington avenue
64052                       central park w
64394                            w 70th st
31868                     east 75th street
97560                            gates ave
92520               avenue of the americas
96530                              2nd ave
Name: display_address, dtype: object

In [29]:
count_vect_displ= CountVectorizer(min_df=5)
##Train
count_vect_display_address_train= count_vect_displ.fit_transform(train['display_address'].apply\
                                                   (clean_data, stop_and_stem= False, just_alpha=False, is_list=False))

In [30]:
##Test
count_vect_display_address_test= count_vect_displ.transform(test['display_address'].apply\
                                                   (clean_data, stop_and_stem= False, just_alpha=False, is_list=False))

In [31]:
len(count_vect_displ.vocabulary_)

879

#### One hot encoding the Street address

In [32]:
train['street_address'][:10].apply(clean_data, stop_and_stem= False, just_alpha=False, is_list=False)

16285              609 w 151 street
8514               111 worth street
43025          120 south 4th street
33847          145 east 29th street
64052            241 central park w
64394                 200 w 70th st
31868          300 east 75th street
97560                1670 gates ave
92520    777 avenue of the americas
96530                   530 2nd ave
Name: street_address, dtype: object

In [33]:
count_vect_strt= CountVectorizer(min_df=5)
count_vect_strt_address_train= count_vect_strt.fit_transform(train['street_address'].apply\
                                                   (clean_data, stop_and_stem= False, just_alpha=False, is_list=False))

In [34]:
count_vect_strt_address_test= count_vect_strt.transform(test['street_address'].apply\
                                                   (clean_data, stop_and_stem= False, just_alpha=False, is_list=False))

In [35]:
len(count_vect_strt.vocabulary_)

1239

#### Cleaning the description feature

In [36]:
train['cleaned_description']= train['description'].apply(clean_data, stop_and_stem= True, just_alpha=False, is_list=False)

In [37]:
test['cleaned_description']= test['description'].apply(clean_data, stop_and_stem= True, just_alpha=False, is_list=False)

#### Bag of Words vectorization of the description feature

In [38]:
count_vect_desc= CountVectorizer(min_df=5)
count_vect_description_train= count_vect_desc.fit_transform(train['cleaned_description'])

In [39]:
count_vect_description_test= count_vect_desc.transform(test['cleaned_description'])

In [40]:
count_vect_description_train.shape

(34546, 8771)

#### Tfidf vectorization of the description feature

In [41]:
tfidf_desc= TfidfVectorizer(min_df=5)
tfidf_description_train= tfidf_desc.fit_transform(train['cleaned_description'])

In [42]:
tfidf_description_test= tfidf_desc.transform(test['cleaned_description'])

In [43]:
tfidf_description_train.shape

(34546, 8771)

#### Avg W2V vectorization of the description feature

In [44]:
###Creating a list of sentences
i=0
list_of_sent=[]
for sent in train['cleaned_description'].values:
    list_of_sent.append(sent.split())

In [45]:
w2v_model = Word2Vec(list_of_sent, size=60,workers=8)
w2v_words= list(w2v_model.wv.vocab)

In [46]:
sent_to_vect=[]
for sublist in tqdm(list_of_sent):
    vector= np.zeros(60)
    cnt= 0
    for word in sublist:
        if word in w2v_words:
            vector += w2v_model.wv[word]
            cnt += 1
    if cnt >0:
        sent_to_vect.append(vector/cnt) 
    else:
        sent_to_vect.append(vector)

100%|██████████| 34546/34546 [00:35<00:00, 974.21it/s] 


In [47]:
sent_to_vec_avgw2v_train= np.asarray(sent_to_vect)

In [48]:
###Creating a list of sentences for the test data
i=0
list_of_sent_test=[]
for sent in test['cleaned_description'].values:
    list_of_sent_test.append(sent.split())

In [49]:
sent_to_vect_test=[]
for sublist in tqdm(list_of_sent_test):
    vector= np.zeros(60)
    cnt= 0
    for word in sublist:
        if word in w2v_words:
            vector += w2v_model.wv[word]
            cnt += 1
    if cnt >0:
        sent_to_vect_test.append(vector/cnt)
    else:
        sent_to_vect_test.append(vector)

100%|██████████| 14806/14806 [00:15<00:00, 981.98it/s] 


In [50]:
sent_to_vec_avgw2v_test= np.asarray(sent_to_vect_test)

#### Tfidf W2V vectorization of the description feature

In [51]:
tfidf= TfidfVectorizer(tokenizer=lambda x:x.split())
tfidf_vect= tfidf.fit_transform(train['cleaned_description'])
dictionary = dict(zip(tfidf.get_feature_names(), list(tfidf.idf_)))

In [52]:
sent_to_vect=[]
for sublist in tqdm(list_of_sent):
    vector= np.zeros(60)
    tf_idf= 0
    for word in sublist:
        if word in w2v_words:
            tf_idf_word= sublist.count(word) * dictionary[word]
            vector += (w2v_model.wv[word] * tf_idf_word)
            tf_idf += tf_idf_word
    if tf_idf >0:
        sent_to_vect.append(vector/tf_idf)
    else:
        sent_to_vect.append(vector)

100%|██████████| 34546/34546 [00:50<00:00, 684.16it/s]


In [53]:
sent_to_vec_tfidfw2v= np.asarray(sent_to_vect)

In [54]:
sent_to_vect_test=[]
for sublist in tqdm(list_of_sent_test):
    vector= np.zeros(60)
    tf_idf= 0
    for word in sublist:
        if word in w2v_words:
            tf_idf_word= sublist.count(word) * dictionary[word]
            vector += (w2v_model.wv[word] * tf_idf_word)
            tf_idf += tf_idf_word
    if tf_idf >0:
        sent_to_vect_test.append(vector/tf_idf)
    else:
        sent_to_vect_test.append(vector)

100%|██████████| 14806/14806 [00:21<00:00, 676.38it/s]


In [55]:
sent_to_vec_tfidfw2v_test= np.asarray(sent_to_vect_test)

#### Using Spacy's Word2Vec to featurize the description feature

In [55]:
nlp = spacy.load("en_core_web_sm")

In [103]:
spacy_vectors=[]
for document in tqdm(train['cleaned_description'][:]):
    token= nlp(document)
    spacy_vectors.append(token.vector)

100%|██████████| 34546/34546 [05:32<00:00, 103.74it/s]


In [104]:
sent_to_vec_spacy = np.asarray(spacy_vectors)

In [68]:
spacy_vectors=[]
for document in tqdm(test['cleaned_description']):
    token= nlp(document)
    spacy_vectors.append(token.vector.tolist())

100%|██████████| 14806/14806 [02:25<00:00, 101.92it/s]


In [69]:
sent_to_vec_spacy_test = np.asarray(spacy_vectors) 

### 4. Preparation of the datasets for modelling

In [56]:
dataset.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,low


In [57]:
train.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'interest_level',
       'day', 'month', 'year', 'building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2',
       'len_features', 'cleaned_description'],
      dtype='object')

### 1. With description as BOW

In [58]:
train_bow= sparse.hstack((train[['bathrooms', 'bedrooms', 'latitude', 'longitude','price','day', 'month', 'year','building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2' ,'len_features']],count_vect_features_train,\
               count_vect_display_address_train, count_vect_strt_address_train, count_vect_description_train
               ))

In [59]:
train_bow.shape

(34546, 11321)

In [60]:
test_bow = sparse.hstack((test[['bathrooms', 'bedrooms', 'latitude', 'longitude','price','day', 'month', 'year','building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2' ,'len_features']],count_vect_features_test,\
               count_vect_display_address_test, count_vect_strt_address_test, count_vect_description_test
               ))

In [61]:
test_bow.shape

(14806, 11321)

### 2.  With description as TFIDF

In [62]:
train_tfidf= sparse.hstack((train[['bathrooms', 'bedrooms', 'latitude', 'longitude','price','day', 'month', 'year','building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2' ,'len_features']],count_vect_features_train,\
               count_vect_display_address_train, count_vect_strt_address_train, tfidf_description_train
               ))

In [63]:
train_tfidf.shape

(34546, 11321)

In [64]:
test_tfidf = sparse.hstack((test[['bathrooms', 'bedrooms', 'latitude', 'longitude','price','day', 'month', 'year','building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2' ,'len_features']],count_vect_features_test,\
               count_vect_display_address_test, count_vect_strt_address_test, tfidf_description_test
               ))

In [65]:
test_tfidf.shape

(14806, 11321)

### 3.  With description as AvgW2V

In [66]:
train_avgw2v= sparse.hstack((train[['bathrooms', 'bedrooms', 'latitude', 'longitude','price','day', 'month', 'year','building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2' ,'len_features']],count_vect_features_train,\
               count_vect_display_address_train, count_vect_strt_address_train, sent_to_vec_avgw2v_train
               ))

In [67]:
train_avgw2v.shape

(34546, 2610)

In [68]:
test_avgw2v= sparse.hstack((test[['bathrooms', 'bedrooms', 'latitude', 'longitude','price','day', 'month', 'year','building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2' ,'len_features']],count_vect_features_test,\
               count_vect_display_address_test, count_vect_strt_address_test, sent_to_vec_avgw2v_test
               ))

In [69]:
test_avgw2v.shape

(14806, 2610)

### 4.   With description as W2V using TFIDFW2V

In [70]:
train_tfidfw2v = sparse.hstack((train[['bathrooms', 'bedrooms', 'latitude', 'longitude','price','day', 'month', 'year','building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2' ,'len_features']],count_vect_features_train,\
               count_vect_display_address_train, count_vect_strt_address_train, sent_to_vec_tfidfw2v
               ))

In [71]:
train_tfidfw2v.shape

(34546, 2610)

In [72]:
test_tfidfw2v= sparse.hstack((test[['bathrooms', 'bedrooms', 'latitude', 'longitude','price','day', 'month', 'year','building_id_0', 'building_id_1',
       'building_id_2', 'manager_id_0', 'manager_id_1', 'manager_id_2' ,'len_features']],count_vect_features_test,\
               count_vect_display_address_test, count_vect_strt_address_test, sent_to_vec_tfidfw2v_test
               ))

In [73]:
test_tfidfw2v.shape

(14806, 2610)

### 5. Pickling the datsets

In [79]:
##Pickling the train data
import pickle

with open('./pickle/train_bow.pkl', 'wb') as f:
    pickle.dump(train_bow,f)

with open('./pickle/train_tfidf.pkl', 'wb') as f:
    pickle.dump(train_tfidf,f)
    
with open('./pickle/train_avgw2v.pkl', 'wb') as f:
    pickle.dump(train_avgw2v,f)
    
with open('./pickle/train_tfidfw2v.pkl', 'wb') as f:
    pickle.dump(train_tfidfw2v,f)
    
with open('./pickle/y_train.pkl', 'wb') as f:
    pickle.dump(train['interest_level'],f)

In [78]:
##Pickling the test data

with open('./pickle/test_bow.pkl', 'wb') as f:
    pickle.dump(test_bow,f)

with open('./pickle/test_tfidf.pkl', 'wb') as f:
    pickle.dump(test_tfidf,f)
    
with open('./pickle/test_avgw2v.pkl', 'wb') as f:
    pickle.dump(test_avgw2v,f)
    
with open('./pickle/test_tfidfw2v.pkl', 'wb') as f:
    pickle.dump(test_tfidfw2v,f)
    
with open('./pickle/y_test.pkl', 'wb') as f:
    pickle.dump(test['interest_level'],f)