In [244]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Baseline SK Learn Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

from scipy.stats import skew

# Import OS module
import os

In [245]:
# view current directory
originalDirectory = os.getcwd()
#rootDirectory = originalDirectory[:-9]
rootDirectory = "C:/Users/ydupis/Documents/Data-Science/Competition/AirBnbPrediction/"

#Change Directory to data/raw
os.chdir("{}data".format(rootDirectory))
print(os.getcwd())

C:\Users\ydupis\Documents\Data-Science\Competition\AirBnbPrediction\data


In [246]:
# Load dataset for exloratory analysis
#Define file name
train_filename = 'raw/train.csv'
test_filename = 'raw/test.csv'
sample_filename = 'raw/sample_submission.csv'
zillow_filename = 'processed/zillow_ready.csv'

#Read CSV file
train_detail =  pd.read_csv(train_filename)
test_detail =  pd.read_csv(test_filename)
sample_detail=  pd.read_csv(sample_filename)
zillow=  pd.read_csv(zillow_filename)

In [247]:
print("There are {} rows and {} columns in train dataset".format(train_detail.shape[0],train_detail.shape[1]))
print("There are {} rows and {} columns in test dataset".format(test_detail.shape[0],test_detail.shape[1]))

There are 74111 rows and 29 columns in train dataset
There are 25458 rows and 28 columns in test dataset


### Missing Value

In [248]:
## Missing Value analysis
def missing_value_analysis(df):
    list_null_num = len(df.index) - df.count()
    list_null_perc = ((len(df.index) - df.count()) / df.shape[0]) * 100
    list_null_num = list_null_num.sort_values(ascending=False)
    list_null_perc = list_null_perc.sort_values(ascending=False)

    missing_df = pd.DataFrame({'Missing Count':list_null_num,
                             'Missing Percentage(%)':list_null_perc})
    missing_df.index.name = 'Feature Names'
    missing_df.reset_index(inplace=True)
    missing_df_final = missing_df[missing_df['Missing Count'] > 0]
    return missing_df_final

In [249]:
missing_value_analysis(train_detail)

Unnamed: 0,Feature Names,Missing Count,Missing Percentage(%)
0,host_response_rate,18299,24.691341
1,review_scores_rating,16722,22.563452
2,first_review,15864,21.405729
3,last_review,15827,21.355804
4,thumbnail_url,8216,11.086074
5,neighbourhood,6872,9.272578
6,zipcode,966,1.30345
7,bathrooms,200,0.269865
8,host_identity_verified,188,0.253674
9,host_since,188,0.253674


### Missing Value Imputation

In [250]:

def feature_engineering(data):
    
    #numeric_feats = data.dtypes[data.dtypes != "object"].index
    #numeric_to_exlude = ['id','latitude','longitude','log_price']
    #numeric_feats_final = [el for el in numeric_feats if el not in numeric_to_exlude]
    #numeric_feats_final = list(set(numeric_feats) - set(numeric_to_exlude))
    #print(numeric_feats_final)
    #print(numeric_feats_final)

    #skewed_feats = data[numeric_feats_final].apply(lambda x: skew(x.dropna())) #compute skewness
    #skewed_feats = skewed_feats[skewed_feats > 0.75]
    #skewed_feats = skewed_feats.index

    #data[skewed_feats] = np.log1p(data[skewed_feats])

    ## Initializing the imputer 
    imp_mean = Imputer(missing_values= 'NaN', strategy= 'mean', axis= 0)
    imp_median = Imputer(missing_values= 'NaN', strategy= 'median', axis= 0)
    imp_most_frequent = Imputer(missing_values= 'NaN', strategy= 'most_frequent', axis= 0)

    ## Imputing Host_response_rate
    # convert formatting for host_response_rate, removing % signs
    data['host_response_rate'] = (data['host_response_rate'].str.replace(r'[^-+\d.]', '').astype(float))
    data['host_response_rate'] = imp_median.fit_transform(data[['host_response_rate']])

    ## Imputing review_scores_rating
    data['review_scores_rating'] = imp_mean.fit_transform(data[['review_scores_rating']]).astype(int)

    ## replace NaN by 'NO Date' and treat the missing values correctly
    data['neighbourhood'] = data['neighbourhood'].replace(np.NaN,'not provided')
    data['zipcode'] = data['zipcode'].replace(np.NaN,'not provided')

    ## Imputing Bathrooms and bedrooms variables
    data['bathrooms'] = imp_median.fit_transform(data[['bathrooms']]).astype(int)
    data['bedrooms'] = imp_median.fit_transform(data[['bedrooms']]).astype(int)
    data['beds'] = imp_median.fit_transform(data[['beds']]).astype(int)

    ## Most frequent 
    data['host_identity_verified'] = data['host_identity_verified'].replace(['t','f'],[1,0])
    host_identity_verified_median = data["host_identity_verified"].median()
    data['host_identity_verified'] = data['host_identity_verified'].replace([np.NaN],[host_identity_verified_median]).astype(int)

    data['host_has_profile_pic'] = data['host_has_profile_pic'].replace(['t','f'],[1,0])
    host_has_profile_pic_median = data["host_has_profile_pic"].median()
    data['host_has_profile_pic'] = data['host_has_profile_pic'].replace([np.NaN],[host_has_profile_pic_median]).astype(int)
    
    # Count the number of aminities 
    data['amenities_count'] = data['amenities'].apply(lambda x: len(x.split()))
    
    data['name'] = data['name'].replace([np.NaN],["Not Provided"])
    
    # Count the number of words in the description
    data['description'] = data['description'].replace([np.NaN],["Not Provided"])
    data['description_length'] = data['description'].apply(lambda x: len(x.split()))
    
    # Replace True and False with 1 and 0
    data['cleaning_fee'] = data['cleaning_fee'].apply(lambda x: 1 if x==True else 0)
    
    # Replcase existing thumbnail by 1 and missig thumbnail by 0
    data['thumbnail_url'].loc[data['thumbnail_url'].notnull()] = 1
    data['thumbnail_url'].loc[data['thumbnail_url'].isnull()] = 0
    
    # Extract year from first_review field. Missing first_review date replaces by 9999
    # For this variable we could create a dummy variable
    data['first_review'].loc[data['first_review'].isnull()] = "9999"
    #data['first_review'] = data['first_review'].apply(lambda x: x[:4])
    
    data['last_review'].loc[data['last_review'].isnull()] = "9999"
    #data['last_review'] = data['last_review'].apply(lambda x: x[:4])
    
    data['host_since'].loc[data['host_since'].isnull()] = "9999"
    #data['host_since'] = data['host_since'].apply(lambda x: x[:4])

    return data

In [251]:
train_detail = feature_engineering(train_detail)
test_detail = feature_engineering(test_detail)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


### Merge Zillow data

In [252]:
# Merge Zillow external data
zillow['zipcode']= zillow['zipcode'].astype('str')

train_detail = train_detail.merge(zillow,how='left', on = ['zipcode','city'])
# Fill missing with mean by city
train_detail["zhvi"] = train_detail[['city','zhvi']].groupby("city").transform(lambda x: x.fillna(x.mean()))

test_detail = test_detail.merge(zillow,how='left', on = ['zipcode','city'])
# Fill missing with mean by city
test_detail["zhvi"] = test_detail[['city','zhvi']].groupby("city").transform(lambda x: x.fillna(x.mean()))

In [253]:
test.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds,amenities_count,description_length,zhvi
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",1.386294,0,Real Bed,strict,1,...,Brooklyn Heights,1.098612,100,1,11201.0,0,0,7,31,14.032958
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",2.079442,0,Real Bed,strict,1,...,Hell's Kitchen,1.94591,93,1,10019.0,1,1,9,168,14.062527
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",1.791759,0,Real Bed,moderate,1,...,Harlem,2.397895,92,1,10027.0,0,1,15,167,13.814285
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",1.609438,0,Real Bed,flexible,1,...,Lower Haight,0.0,94,1,94117.0,1,1,11,78,14.142922
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.098612,0,Real Bed,moderate,1,...,Columbia Heights,1.609438,40,0,20009.0,0,0,9,115,13.20744


### Create one column by amenities

In [254]:
amenities_list = []
for i in range(len(train_detail)):
    amenities_list += train_detail.amenities[i][1:-1].split(',')

In [255]:
amenities_list = list(set(amenities_list))

In [256]:
amenities_list_dict_train = {}
amenities_list_dict_test = {}

for a in amenities_list:
    amenities_list_dict_train[a] = []
    amenities_list_dict_test[a] = []

In [257]:
#amenities_list_dict

In [258]:
for i in range(len(train_detail)):
    l = train_detail.amenities[i][1:-1].split(',')
    for a in amenities_list_dict_train.keys():
        if a in l:
            amenities_list_dict_train[a].append(1)
        else:
            amenities_list_dict_train[a].append(0)

In [259]:
amenities_table_train = pd.DataFrame(amenities_list_dict_train)
amenities_table_train = amenities_table_train.drop([''],axis=1)

In [260]:
for i in range(len(test_detail)):
    l = test_detail.amenities[i][1:-1].split(',')
    for a in amenities_list_dict.keys():
        if a in l:
            amenities_list_dict_test[a].append(1)
        else:
            amenities_list_dict_test[a].append(0)

In [261]:
amenities_table_test = pd.DataFrame(amenities_list_dict_test)
amenities_table_test = amenities_table_test.drop([''],axis=1)

In [262]:
amenities_table_test.head()

Unnamed: 0,"smooth pathway to front door""","""24-hour check-in""","""Accessible-height bed""","""Accessible-height toilet""","""Air conditioning""","""Air purifier""","""BBQ grill""","""Baby bath""","""Baby monitor""","""Babysitter recommendations""",...,Other,Oven,Pool,Refrigerator,Shampoo,Smartlock,Stove,TV,Washer,Waterfront
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [263]:
train_detail = pd.concat([train_detail, amenities_table_train], axis=1)

In [264]:
test_detail = pd.concat([test_detail, amenities_table_test], axis=1)

In [265]:
missing_value_analysis(train_detail)

Unnamed: 0,Feature Names,Missing Count,Missing Percentage(%)
0,zhvi,3468,4.679467


In [266]:
## Missing Value analysis
missing_value_analysis(test_detail)

Unnamed: 0,Feature Names,Missing Count,Missing Percentage(%)
0,zhvi,1175,4.615445


In [267]:
print("There are {} rows and {} columns in cleansed train dataset".format(train_detail.shape[0],train_detail.shape[1]))

There are 74111 rows and 162 columns in cleansed train dataset


In [268]:
print("There are {} rows and {} columns in cleansed test dataset".format(test_detail.shape[0],test_detail.shape[1]))

There are 25458 rows and 161 columns in cleansed test dataset


In [269]:
## Saving Cleansed data to Interim folder
#Change Directory to data/raw
os.chdir("{}data/interim".format(rootDirectory))
print(os.getcwd())

to_write_train = 'train_preprocessed_interim.csv'
to_write_test = 'test_preprocessed_interim.csv'

train_detail.to_csv(to_write_train, encoding='utf-8')
test_detail.to_csv(to_write_test, encoding='utf-8')

C:\Users\ydupis\Documents\Data-Science\Competition\AirBnbPrediction\data\interim
