In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
pd.set_option('display.max_columns', None)
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
#Importing the data
final_df=pd.DataFrame()
a_df=pd.DataFrame()
b_df=pd.DataFrame()
c_df=pd.DataFrame()

for file_n in os.listdir('listings'):
    if file_n[0]=='.':
        continue
    else:
        add_df = pd.read_csv(f'listings/{file_n}')
        if len(add_df.columns)==106:
            final_df = pd.concat([final_df, add_df], axis=0)
        elif len(add_df.columns)==96:
            a_df = pd.concat([a_df, add_df], axis=0)
        elif len(add_df.columns)==95:
            b_df = pd.concat([b_df, add_df], axis=0)
        else:
            c_df = pd.concat([c_df, add_df], axis=0)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [5]:
#These are the columns not present in 2018 reports. They will be dropped.
not_in_2018 = ['minimum_minimum_nights','maximum_minimum_nights','minimum_maximum_nights','maximum_maximum_nights',
       'minimum_nights_avg_ntm','maximum_nights_avg_ntm', 'number_of_reviews_ltm', 
       'calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms', 
       'calculated_host_listings_count_shared_rooms']
sf_df = final_df.drop(columns=not_in_2018)
sf_df = pd.concat([sf_df, a_df], axis=0)

#drop one column not in 2017
sf_df = sf_df.drop(columns=['is_business_travel_ready'])
sf_df = pd.concat([sf_df, b_df], axis=0)

#drop 3 columns not in 2016 & Prior
sf_df = sf_df.drop(columns=['access', 'interaction', 'house_rules'])
sf_df = pd.concat([sf_df, c_df], axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


In [6]:
pd.set_option('display.max_rows', 20)
#Converting Existing Data Columns to Dates
sf_df['last_scraped'] = pd.to_datetime(sf_df['last_scraped'])
sf_df['host_since'] = pd.to_datetime(sf_df['host_since'])
#Adding new date features
sf_df['year'] = sf_df['last_scraped'].transform(lambda x: x.year)
sf_df['month-year'] = sf_df['last_scraped'].transform(lambda x: f'{x.month} {x.year}')
sf_df['month'] = sf_df['last_scraped'].transform(lambda x: x.month)
sf_df['day_of_week'] = sf_df['last_scraped'].transform(lambda x: x.dt.dayofweek)
sf_df['day'] = sf_df['last_scraped'].transform(lambda x: x.day)

#Converting All Price Related Columns from Objects to Floats
sf_df['price'] = sf_df['price'].transform(lambda x: float(x.replace(',', '').replace('$', '')))
sf_df['extra_people'] = sf_df['extra_people'].transform(lambda x: float(x.replace(',', '').replace('$', '')))

#Fill NaNs in fee columns with 0's, because no additional fee, then convert
sf_df[['security_deposit','cleaning_fee']] = sf_df[['security_deposit','cleaning_fee']].fillna(int(0))
sf_df['security_deposit'] = sf_df['security_deposit'].transform(lambda x: 0 if x==int(0) else float(x.replace(',', '').replace('$', '')))
sf_df['cleaning_fee'] = sf_df['cleaning_fee'].transform(lambda x: 0 if x==int(0) else float(x.replace(',', '').replace('$', '')))

#Dropping columns that have over 75% null
over_70_null = sf_df.columns[sf_df.isnull().sum()/len(sf_df) > 0.70]
sf_df = sf_df.drop(columns=over_70_null)

#Removing Outliers (0 & over 2000 daily rate)
outlier_thresh=2000
sf_df = sf_df.loc[~((sf_df['price'] == 0) | (sf_df['price'] > outlier_thresh))]

#Filling the NaNs in beds, bathrooms, bedrooms
#Assume that if the tenant has the full apartment, they have a bathroom/bedroom
cond1= (sf_df['room_type']=='Entire home/apt')
sf_df.loc[cond1 & (sf_df['beds']==0), sf_df.columns=='beds'] = 1
sf_df.loc[cond1 & (sf_df['bathrooms']==0), sf_df.columns=='bathrooms'] = 1
#For the rest, just fill na
sf_df[['beds','bathrooms', 'bedrooms']] = sf_df[['beds','bathrooms', 'bedrooms']].fillna(0)

#For review scores, let's just fillin with the average
review_lst = ['review_scores_accuracy', 'review_scores_checkin',
       'review_scores_cleanliness', 'review_scores_communication',
       'review_scores_location', 'review_scores_rating', 'review_scores_value']
for review in review_lst:
    sf_df[review] = sf_df[review].fillna(sf_df[review].mean())

In [7]:
train_df2, test = train_test_split(sf_df, test_size = 0.3, random_state=0)

In [8]:
#Adding in all feature engineering for best performing model with Random Forest
train_df2['amenities'] = train_df2['amenities'].fillna('0')
train_df2['num_amenities'] = train_df2['amenities'].transform(lambda x: len(x.split(',')))
train_df2[['space','summary','description','name']] = train_df2[['space','summary','description','name']].fillna(0)
train_df2['len_space'] = train_df2['space'].transform(lambda x: 0 if x==0 else len(x))
train_df2['len_summary'] = train_df2['summary'].transform(lambda x: 0 if x==0 else len(x))
train_df2['len_description'] = train_df2['description'].transform(lambda x: 0 if x==0 else len(x))
train_df2['len_name'] = train_df2['name'].transform(lambda x: 0 if x==0 else len(x))
train_df2['House'] = train_df2['property_type'].transform(lambda x: 1 if x=='Apartment' else 0)
train_df2['Apartment'] = train_df2['property_type'].transform(lambda x: 1 if x=='House' else 0)
train_df2['Entire_Space'] = train_df2['room_type'].transform(lambda x: 1 if x=='Entire home/apt' else 0)
train_df2['Shared_room'] = train_df2['room_type'].transform(lambda x: 1 if x=='Shared room' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

In [9]:
features=['accommodates','bathrooms', 'bed_type','bedrooms', 'beds','cleaning_fee',
          'extra_people', 'num_amenities',
          'review_scores_cleanliness','len_space', 'len_summary', 'len_description',
          'len_name', 'review_scores_rating', 'security_deposit', 'month', 
          'year', 'House', 'Apartment', 'Entire_Space','Shared_room']
X9 = train_df2[features]
y9 = train_df2['price'].apply(np.log)
X9 = pd.get_dummies(X9, columns=['bed_type','month'])
X_train, X_val, y_train, y_val = train_test_split(X9, y9, test_size=0.3)

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import TensorBoard
import time

In [16]:
X_train = tf.keras.utils.normalize(X_train, axis=1).values()
y_train = np.array(y_train).values()

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [14]:
model = Sequential()

#Input layer
model.add(Dense(1))
model.add(Activation("relu"))

#Hidden layer
model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))

#Output layer
model.add(Dense(1))
model.add(Activation('relu'))

model.compile(loss='mean_squared_error',
             optimizer='adam',
             metrics=['MeanSquaredError'])
model.fit(X_train,y_train, batch_size=32, epochs=10, validation_split=0.1)

ValueError: Please provide as model inputs either a single array or a list of arrays. You passed: inputs=      accommodates  bathrooms  bedrooms      beds  cleaning_fee  extra_people  \
4363      0.001852   0.000463  0.000926  0.000926      0.018523      0.023154   
7552      0.000808   0.000606  0.000404  0.000404      0.026260      0.022220   
7074      0.001602   0.000401  0.000401  0.000801      0.000000      0.006009   
4687      0.000869   0.000434  0.000434  0.000434      0.021714      0.000000   
886       0.001611   0.000604  0.000805  0.001208      0.064422      0.020132   
...            ...        ...       ...       ...           ...           ...   
539       0.000967   0.000484  0.000484  0.000484      0.024184      0.000000   
1630      0.000441   0.000441  0.000441  0.000441      0.017646      0.000000   
5065      0.000490   0.000490  0.000490  0.000490      0.049021      0.049021   
6159      0.002537   0.000634  0.000846  0.001691      0.014798      0.000000   
7323      0.002196   0.001255  0.000941  0.000941      0.056458      0.000000   

      num_amenities  review_scores_cleanliness  len_space  len_summary  \
4363       0.012966                   0.004631   0.000000     0.153277   
7552       0.009696                   0.004040   0.400359     0.100595   
7074       0.008814                   0.004006   0.400614     0.133805   
4687       0.015200                   0.004343   0.056456     0.159379   
886        0.006040                   0.004026   0.361972     0.117168   
...             ...                        ...        ...          ...   
539        0.006772                   0.004837   0.130595     0.120438   
1630       0.008823                   0.004412   0.021617     0.134995   
5065       0.011765                   0.004902   0.000000     0.071571   
6159       0.008033                   0.004228   0.265519     0.101895   
7323       0.005646                   0.002823   0.313658     0.072141   

      len_description  len_name  review_scores_rating  security_deposit  \
4363         0.317205  0.016671              0.046307          0.000000   
7552         0.403995  0.010908              0.040399          0.000000   
7074         0.400614  0.012419              0.038459          0.080123   
4687         0.434275  0.019977              0.043428          0.108569   
886          0.402639  0.011677              0.039861          0.161056   
...               ...       ...                   ...               ...   
539          0.120438  0.009674              0.048369          0.000000   
1630         0.404986  0.013235              0.043234          0.154407   
5065         0.087748  0.014706              0.049021          0.000000   
6159         0.422801  0.011838              0.038475          0.105700   
7323         0.313658  0.010037              0.030425          0.627316   

          year     House  Apartment  Entire_Space  Shared_room  \
4363  0.934018  0.000463   0.000000      0.000463          0.0   
7552  0.814454  0.000000   0.000404      0.000000          0.0   
7074  0.808038  0.000401   0.000000      0.000401          0.0   
4687  0.876368  0.000434   0.000000      0.000434          0.0   
886   0.812928  0.000000   0.000403      0.000403          0.0   
...        ...       ...        ...           ...          ...   
539   0.975113  0.000000   0.000484      0.000000          0.0   
1630  0.889381  0.000000   0.000441      0.000000          0.0   
5065  0.989741  0.000000   0.000490      0.000000          0.0   
6159  0.852790  0.000000   0.000000      0.000423          0.0   
7323  0.632649  0.000314   0.000000      0.000314          0.0   

      bed_type_Airbed  bed_type_Couch  bed_type_Futon  bed_type_Pull-out Sofa  \
4363              0.0             0.0             0.0                     0.0   
7552              0.0             0.0             0.0                     0.0   
7074              0.0             0.0             0.0                     0.0   
4687              0.0             0.0             0.0                     0.0   
886               0.0             0.0             0.0                     0.0   
...               ...             ...             ...                     ...   
539               0.0             0.0             0.0                     0.0   
1630              0.0             0.0             0.0                     0.0   
5065              0.0             0.0             0.0                     0.0   
6159              0.0             0.0             0.0                     0.0   
7323              0.0             0.0             0.0                     0.0   

      bed_type_Real Bed   month_1   month_2  month_3  month_4   month_5  \
4363           0.000463  0.000000  0.000000      0.0      0.0  0.000000   
7552           0.000404  0.000000  0.000000      0.0      0.0  0.000000   
7074           0.000401  0.000000  0.000000      0.0      0.0  0.000000   
4687           0.000434  0.000000  0.000000      0.0      0.0  0.000000   
886            0.000403  0.000403  0.000000      0.0      0.0  0.000000   
...                 ...       ...       ...      ...      ...       ...   
539            0.000484  0.000000  0.000000      0.0      0.0  0.000484   
1630           0.000441  0.000000  0.000000      0.0      0.0  0.000000   
5065           0.000490  0.000000  0.000000      0.0      0.0  0.000000   
6159           0.000423  0.000000  0.000000      0.0      0.0  0.000000   
7323           0.000314  0.000000  0.000314      0.0      0.0  0.000000   

      month_6   month_7   month_8   month_9  month_10  month_11  month_12  
4363      0.0  0.000000  0.000000  0.000463       0.0       0.0       0.0  
7552      0.0  0.000000  0.000000  0.000404       0.0       0.0       0.0  
7074      0.0  0.000401  0.000000  0.000000       0.0       0.0       0.0  
4687      0.0  0.000000  0.000434  0.000000       0.0       0.0       0.0  
886       0.0  0.000000  0.000000  0.000000       0.0       0.0       0.0  
...       ...       ...       ...       ...       ...       ...       ...  
539       0.0  0.000000  0.000000  0.000000       0.0       0.0       0.0  
1630      0.0  0.000441  0.000000  0.000000       0.0       0.0       0.0  
5065      0.0  0.000490  0.000000  0.000000       0.0       0.0       0.0  
6159      0.0  0.000000  0.000423  0.000000       0.0       0.0       0.0  
7323      0.0  0.000000  0.000000  0.000000       0.0       0.0       0.0  

[192805 rows x 36 columns]

In [None]:
X_val = tf.keras.utils.normalize(X_val, axis=1).values
y_val = np.array(y_val)
preds = model.predict(X_val) 
rmse = np.sqrt(mean_squared_error(np.exp(y_val), np.exp(preds)))
r2 = r2_score(np.exp(y_val), np.exp(preds))
print(rmse, r2)

## Add More Layers

In [28]:
model1 = Sequential()

#Input layer
model1.add(Dense(1))
model1.add(Activation("relu"))

#Second layer
model1.add(Flatten())
model1.add(Dense(64))
model1.add(Activation('relu'))

#Third layer
model1.add(Flatten())
model1.add(Dense(64))
model1.add(Activation('relu'))

#Fourth layer
model1.add(Flatten())
model1.add(Dense(64))
model1.add(Activation('relu'))

#Output layer
model1.add(Dense(1))
model1.add(Activation('relu'))

model1.compile(loss='mean_squared_error',
             optimizer='adam',
             metrics=['MeanSquaredError'])
model1.fit(X_train,y_train, epochs=30, validation_split=0.1)

Train on 173524 samples, validate on 19281 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1a54fdf990>

In [2]:
preds = model1.predict(X_val)
rmse = np.sqrt(mean_squared_error(np.exp(y_val), np.exp(preds)))
r2 = r2_score(y_val, preds)
print(rmse, r2)

NameError: name 'model1' is not defined