In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from sklearn.ensemble import BaggingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.pipeline import make_pipeline

In [51]:
train = pd.read_csv('/Users/sunmingze/Desktop/Airbnb/train-2.csv')
test = pd.read_csv('/Users/sunmingze/Desktop/Airbnb/test.csv')

In [52]:
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)
train['train_data'] = 'Y'
test['train_data'] = 'N'

In [53]:
#We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
train["price"] = np.log1p(train["price"])
train = train.drop(train[(train['security_deposit']>2000)].index)
all_data = pd.concat((train, test), sort=True).reset_index(drop=True)
print("all_data size is : {}".format(all_data.shape))
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

all_data size is : (1998, 36)


Unnamed: 0,Missing Ratio
price,50.05005
security_deposit,32.132132
host_response_time,30.880881
host_response_rate,30.880881
review_scores_value,25.925926
review_scores_location,25.875876
review_scores_communication,25.825826
review_scores_cleanliness,25.825826
review_scores_checkin,25.825826
review_scores_accuracy,25.825826


In [54]:
imputer = SimpleImputer(strategy='most_frequent')
_ = imputer.fit(all_data)
X = imputer.transform(all_data)
all_data1 = pd.DataFrame(X, columns=all_data.columns)

In [55]:
geo = all_data1[['longitude', 'latitude']]
km = KMeans(n_clusters=15, random_state=28).fit(geo)
all_data1['geo'] = km.labels_

In [56]:
all_data1 = all_data1.drop('longitude', axis=1)
all_data1 = all_data1.drop('latitude', axis=1)

In [57]:
cat_var = ['experiences_offered', 'host_response_time', 'host_is_superhost', 'host_identity_verified',
          'property_type', 'room_type', 'bed_type', 'instant_bookable', 'cancellation_policy',
          'require_guest_profile_picture', 'require_guest_phone_verification', 'geo']
num_var = ['host_response_rate', 'host_listings_count', 'accommodates',
          'bathrooms', 'bedrooms', 'beds', 'security_deposit', 'cleaning_fee', 'guests_included',
          'extra_people', 'minimum_nights', 'maximum_nights', 'number_of_reviews', 'review_scores_rating',
          'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
          'review_scores_location', 'review_scores_value', 'reviews_per_month']

In [58]:
numeric_feats = num_var

# Check the skew of all numerical features
skewed_feats = all_data1[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)


Skew in numerical features: 



Unnamed: 0,Skew
minimum_nights,9.923356
host_listings_count,6.938758
security_deposit,6.303798
number_of_reviews,4.839071
cleaning_fee,3.577277
bathrooms,3.546345
guests_included,3.023952
reviews_per_month,2.823368
extra_people,2.44677
beds,2.068795


In [59]:
all_data2 = pd.get_dummies(all_data1, columns=cat_var, prefix=cat_var)
all_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1998 entries, 0 to 1997
Data columns (total 89 columns):
accommodates                                       1998 non-null object
bathrooms                                          1998 non-null object
bedrooms                                           1998 non-null object
beds                                               1998 non-null object
cleaning_fee                                       1998 non-null object
extra_people                                       1998 non-null object
guests_included                                    1998 non-null object
host_listings_count                                1998 non-null object
host_response_rate                                 1998 non-null object
maximum_nights                                     1998 non-null object
minimum_nights                                     1998 non-null object
number_of_reviews                                  1998 non-null object
price                        

In [60]:
min_max_scaler = MinMaxScaler()
all_data2[num_var] = min_max_scaler.fit_transform(all_data2[num_var])

  return self.partial_fit(X, y)


In [61]:
skew = abs(all_data2[num_var].skew()).sort_values(ascending = False)
skew_features = skew[skew > 0.75]
print(skew_features)

minimum_nights                 9.930813
host_listings_count            6.943972
security_deposit               6.308535
review_scores_communication    5.115893
review_scores_checkin          4.998740
number_of_reviews              4.842708
host_response_rate             4.662728
review_scores_accuracy         3.807421
cleaning_fee                   3.579965
bathrooms                      3.549010
review_scores_rating           3.077185
review_scores_location         3.057814
review_scores_cleanliness      3.034182
guests_included                3.026225
reviews_per_month              2.825489
review_scores_value            2.596126
extra_people                   2.448609
beds                           2.070349
bedrooms                       1.509477
accommodates                   1.447314
dtype: float64


In [63]:
training = all_data2[all_data2['train_data']=='Y']
testing = all_data2[all_data2['train_data']=='N']

In [64]:
testing.drop('train_data', axis=1, inplace=True)
training.drop('train_data', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [65]:
train_set, test_set = train_test_split(training, test_size=0.2, random_state=42)
X_train = train_set.drop('price', axis=1)
y_train = train_set['price'].copy()
X_test = test_set.drop('price', axis=1)
y_test = test_set['price'].copy()

In [67]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y_train, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X_train):
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)

In [70]:
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [74]:
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))

In [77]:
score = cv_rmse(ridge, X_train)
score = cv_rmse(lasso, X_train)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), )

score = cv_rmse(elasticnet, X_train)
print("elastic net: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), )



LASSO: 0.3578 (0.0264)

elastic net: 0.3583 (0.0270)

