In [162]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Neural Network Model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras import optimizers
from keras.wrappers.scikit_learn import KerasRegressor

# Baseline SK Learn Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Import OS module
import os


In [163]:
# view current directory
originalDirectory = os.getcwd()
rootDirectory = originalDirectory[:-8]

#Change Directory to data/raw
os.chdir("{}data/raw".format(rootDirectory))
print(os.getcwd())

# Change working directory back to orignal location
# os.chdir(originalDirectory)
# print(os.getcwd())


C:\Users\ydupis\Documents\Data-Science\Competition\AirBnbPrediction\data\raw


In [164]:
# Load dataset for exloratory analysis

#Define file name
train_filename = 'train.csv'
test_filename = 'test.csv'
sample_filename = 'sample_submission.csv'


#Read CSV file
train_detail =  pd.read_csv(train_filename)
test_detail =  pd.read_csv(test_filename)
sample_detail=  pd.read_csv(sample_filename)

In [165]:
train_detail.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


In [166]:
def feature_engineering(data):
    
    # Count the number of aminities 
    data['amenities_count'] = data['amenities'].apply(lambda x: len(x.split()))
    
    # Count the number of words in the description
    data['description_length'] = data['description'].apply(lambda x: len(x.split()))
    
    # Replace True and False with 1 and 0
    data['cleaning_fee'] = data['cleaning_fee'].apply(lambda x: 1 if x==True else 0)
    
    # Replcase existing thumbnail by 1 and missig thumbnail by 0
    data['thumbnail_url'].loc[data['thumbnail_url'].notnull()] = 1
    data['thumbnail_url'].loc[data['thumbnail_url'].isnull()] = 0
    
    # Extract year from first_review field. Missing first_review date replaces by 9999
    # For this variable we could create a dummy variable
    data['first_review'].loc[data['first_review'].isnull()] = "9999"
    data['first_review'] = data['first_review'].apply(lambda x: x[:4])
    
    return data
    

In [167]:
train_processed = feature_engineering(train_detail)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [168]:
train_processed.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds,amenities_count,description_length
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,1,...,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,1,11201.0,1.0,1.0,7,31
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,1,...,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,1,10019.0,3.0,3.0,9,168
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,1,...,The Garden Oasis,Harlem,10,92.0,1,10027.0,1.0,3.0,15,167
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,1,...,Beautiful Flat in the Heart of SF!,Lower Haight,0,,1,94117.0,2.0,2.0,11,78
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,1,...,Great studio in midtown DC,Columbia Heights,4,40.0,0,20009.0,0.0,1.0,9,115
