In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import geopandas as gpd
from geodatasets import get_path
import matplotlib.pyplot as plt
import contextily as cx
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import dill

Shape of the Dataset.

In [None]:
# Load the listings CSV from the working directory. low_memory=False makes
# pandas parse the file in one pass rather than in chunks, so each column
# gets a single inferred dtype.
df = pd.read_csv('airbnb.csv', low_memory=False)
# m = number of rows (examples), n = number of columns.
m, n = df.shape
m

- $y^{(i)}$ is the label or target

In [None]:
# Target vector: an independent copy of the 'price' column.
y = df['price'].copy()
# Label stored under index label 0 (the first row of the freshly read frame).
y[0]

- $\mathbf{X}$ is the feature matrix: every column of the data except the target

In [None]:
# Feature matrix: every column of the raw frame except the target.
X = df.drop(columns='price').copy()
X.head()

- $\mathbf{x}^{(i)}$ is the vector of feature values for the $i$-th example (one row of $\mathbf{X}$)

In [None]:
# Feature vector of the first example (row position 0), returned as a Series.
x = X.iloc[0]
x

Viewing the Data.

In [None]:
# Preview the first rows of the raw frame.
df.head()

Inspect Column Names.

In [None]:
# Column labels of the raw frame.
df.columns

Basic Info.

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

Inspecting Variables.

In [None]:
# Row count for each distinct room_type value.
df['room_type'].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Viewing the distribution of all the variables.

In [None]:

# One 50-bin histogram per numeric column of the raw frame.
df.hist(bins=50, figsize=(16,10))
plt.show()

Train-Test Split.

In [None]:

# Plain random 80/20 split of the whole frame (target column included);
# random_state pins the shuffle so the split is reproducible.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

Checking Distribution of Geography.

In [None]:
# Percentage of rows in each neighbourhood group, over the full dataset.
df['neighbourhood_group'].value_counts(normalize=True)*100

In [None]:
# Share of each neighbourhood group in the random training split
# (a fraction between 0 and 1 here, not a percentage).
train['neighbourhood_group'].value_counts(normalize=True)

Stratifying the Split by Neighbourhood Groups.

In [None]:
# 80/20 split stratified on neighbourhood_group, so both parts keep
# (approximately) the group proportions of the full frame.
train_strat, test_strat = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['neighbourhood_group'],
)

In [None]:
# Share (fraction) of each neighbourhood group in the stratified training split.
train_strat['neighbourhood_group'].value_counts(normalize=True)

In [None]:
# Percentage share of every neighbourhood group in the full data and in
# each training split, aligned on the group label.
combined = pd.DataFrame({
    label: frame['neighbourhood_group'].value_counts(normalize=True) * 100
    for label, frame in (('overall', df),
                         ('stratified', train_strat),
                         ('random', train))
}).reset_index()
# Relative deviation (in %) of each split's share from the overall share.
combined = combined.assign(
    strat_error=lambda t: (t['stratified'] / t['overall'] - 1) * 100,
    random_error=lambda t: (t['random'] / t['overall'] - 1) * 100,
)
combined

Geographic Graphing

In [None]:
# Listings priced under 500, positioned by coordinates and coloured by price.
train_strat.loc[train_strat['price'] < 500].plot.scatter(
    x="longitude", y="latitude",
    c="price", cmap="jet", colorbar=True,
    label="price", legend=True,
    grid=True, sharex=False,
    figsize=(10, 7), alpha=0.2,
)
plt.show()

In [None]:
# One-time environment setup for this cell, if the packages are missing:
#!pip install geopandas geodatasets folium matplotlib mapclassify contextily

# Listings under 300, turned into point geometries in EPSG:4326.
filt = train_strat.loc[train_strat['price'] < 300]
train_strat_gdf = gpd.GeoDataFrame(
    filt,
    geometry=gpd.points_from_xy(filt.longitude, filt.latitude),
    crs="EPSG:4326",
)

# Borough outlines from the "nybb" dataset, reprojected to the points' CRS.
path_to_data = get_path("nybb")
boroughs_gdf = (
    gpd.read_file(path_to_data)
    .set_geometry("geometry")
    .to_crs("EPSG:4326")
)

# Layer order on the shared axis: boroughs, then listings, then basemap tiles.
fig, ax = plt.subplots(figsize=(20, 14))
boroughs_gdf.plot(
    ax=ax, alpha=0.5, column='BoroName', categorical=True,
    legend=True, cmap='Spectral', linewidth=2, edgecolor='0',
)
train_strat_gdf.plot(column='price', ax=ax, cmap='jet', legend=True, alpha=0.4)
cx.add_basemap(ax, crs=train_strat_gdf.crs)
ax.grid(alpha=0.5)
plt.title('AirBnB Rentals by Price in NYC under $300')
plt.show()


Viewing the Correlation of our Variables with our Target: "Price"

In [None]:
# Pairwise correlations between the numeric columns of the stratified
# training split (non-numeric columns are skipped).
corr_matrix = train_strat.corr(numeric_only=True)
# Correlation of every numeric column with price, strongest positive first.
corr_matrix['price'].sort_values(ascending=False)

In [None]:
# int/float columns of the stratified training split, minus the two id columns.
numeric = (
    train_strat
    .select_dtypes(include=['int', 'float'])
    .drop(columns=['id', 'host_id'])
)

In [None]:

# Pairwise scatter plots on a 200-row sample, coloured by neighbourhood group.
# random_state pins the sample so the figure is identical on every run;
# without it each execution drew a different 200 rows.
sns.pairplot(
    train_strat.sample(n=200, random_state=42).drop(['id', 'host_id', 'license'], axis=1),
    hue="neighbourhood_group",
)
plt.show()

In [None]:
# Price distribution per neighbourhood group, restricted to listings under 300.
sns.boxplot(
    data=train_strat.loc[train_strat['price'] < 300],
    x='neighbourhood_group',
    y='price',
)
plt.show()

In [None]:
# Try a combined longitude + latitude feature and check how it correlates
# with price. The column is added with .assign() on a temporary frame so
# that train_strat itself is left untouched: writing it in place would leak
# the experimental column into every frame later derived from train_strat
# and can trigger SettingWithCopyWarning, since train_strat is a split of df.
corr_matrix = (
    train_strat
    .assign(long_lat=train_strat['longitude'] + train_strat['latitude'])
    .corr(numeric_only=True)
)
corr_matrix['price'].sort_values(ascending=False)

In [None]:
# Features: the stratified training split without the target and without
# the name / host / id / license / last_review columns.
X = train_strat.drop(
    columns=['price', 'name', 'host_name', 'id', 'host_id', 'license', 'last_review']
)
# Target: the price column of the same split.
y = train_strat['price']

In [None]:
# Dtypes and non-null counts of the remaining feature columns.
X.info()

Handling NAs

In [None]:
# Alternatives not taken here:
#   option 1 - drop the affected rows:  X.dropna(subset=["reviews_per_month"], inplace=True)
#   option 2 - drop the whole column:   X.drop("reviews_per_month", axis=1)
# Option used: keep everything and fill missing reviews_per_month with 0.
null_rows_idx = X.isna().any(axis=1)  # True for rows with at least one missing value
X_nulls = X.assign(reviews_per_month=X["reviews_per_month"].fillna(0))
X_nulls.loc[null_rows_idx].head()

In [None]:


# Alternative kept for reference: fill with a constant 0 instead of the median.
#imputer = SimpleImputer(strategy='constant', fill_value=0)
# Median imputer fitted on the single reviews_per_month column; the double
# brackets keep the selection two-dimensional (a one-column DataFrame).
imputer = SimpleImputer(strategy='median')
X_rpm = X[['reviews_per_month']]
imputer.fit(X_rpm)

In [None]:
# Fill value learned by the imputer: one statistic per fitted column.
imputer.statistics_

In [None]:
# Cross-check: the same column's median computed directly with pandas.
X[['reviews_per_month']].median()

In [None]:
# Apply the fitted imputer. Note that X_nulls is rebound here: it now holds
# the transformer's output (a NumPy array by default), not a DataFrame.
X_nulls = imputer.transform(X[['reviews_per_month']])
# Imputed reviews_per_month for the rows flagged in null_rows_idx.
X_nulls[null_rows_idx]

Categorical Attributes.

In [None]:
# Row count for each neighbourhood group in the training features.
X['neighbourhood_group'].value_counts()

In [None]:
# One-column DataFrame (double brackets keep it 2-D) holding the categorical feature.
X_cat = X[['neighbourhood_group']]

In [None]:


# Map each category to an integer code (stored as floats in the output array).
ordinal_encoder = OrdinalEncoder()
X_cat_encoded = ordinal_encoder.fit_transform(X_cat)

In [None]:
# First eight encoded rows.
X_cat_encoded[:8]

In [None]:
# Categories learned per column; a category's position in the array is its code.
ordinal_encoder.categories_

In [None]:


# One indicator column per category instead of a single integer code.
cat_encoder = OneHotEncoder()
X_cat_1hot = cat_encoder.fit_transform(X_cat)

In [None]:
# The encoder's output as returned (a SciPy sparse matrix with default settings).
X_cat_1hot

In [None]:
# Same data converted to a dense NumPy array.
X_cat_1hot.toarray()

In [None]:
# Categories learned by the one-hot encoder, in output-column order.
cat_encoder.categories_

In [None]:
# pandas' own one-hot encoding of the same column, for comparison.
pd.get_dummies(X_cat)

In [None]:
# Input column names the encoder recorded during fit.
cat_encoder.feature_names_in_

In [None]:
# Names of the generated indicator columns.
cat_encoder.get_feature_names_out()

In [None]:
# Dense one-hot frame labelled with the encoder's output names and aligned
# to the row index of X.
df_output = pd.DataFrame(
    data=cat_encoder.transform(X_cat).toarray(),
    index=X.index,
    columns=cat_encoder.get_feature_names_out(),
)

df_output

Feature Scaling

In [None]:

# Numeric (int/float) feature columns only.
X_num = X.select_dtypes(include=['int', 'float'])
# Rescale every numeric column linearly onto the range [-1, 1].
min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
X_num_min_max_scaled = min_max_scaler.fit(X_num).transform(X_num)
X_num_min_max_scaled

In [None]:
# Standardise every numeric column (subtract the mean, divide by the standard deviation).
std_scaler = StandardScaler()
X_num_std_scaled = std_scaler.fit_transform(X_num)
X_num_std_scaled

Transforming Input and Output

In [None]:
# Manual target scaling: standardise the prices, train on the scaled
# values, then map predictions back to the original price scale.
target_scaler = StandardScaler()
scaled_labels = target_scaler.fit_transform(y.to_frame())  # scaler needs 2-D input

model = LinearRegression().fit(X[["longitude"]], scaled_labels)
some_new_data = X[["longitude"]].head(5)  # pretend this is new data

scaled_predictions = model.predict(some_new_data)
predictions = target_scaler.inverse_transform(scaled_predictions)
predictions

In [None]:

# Same idea in one estimator: the wrapper scales y before fitting and
# inverts the scaling on predict.
model = TransformedTargetRegressor(
    regressor=LinearRegression(),
    transformer=StandardScaler(),
)
model.fit(X[["longitude"]], y)
predictions = model.predict(some_new_data)
predictions

Putting in a Pipeline.

In [None]:
# Numeric preprocessing with explicitly named steps:
# median imputation followed by standardisation.
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])

In [None]:
# Same two steps via make_pipeline, which names the steps automatically.
# This rebinds num_pipeline, so this is the object used from here on.
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [None]:
# Fit the numeric pipeline on the numeric columns and transform them.
X_num_prepared = num_pipeline.fit_transform(X_num)
# First two prepared rows, rounded for display.
X_num_prepared[:2].round(2)

In [None]:
# Wrap the transformed array back into a DataFrame, restoring the column
# names reported by the pipeline and the original row index.
X_num_prepared = pd.DataFrame(
    data=X_num_prepared,
    index=X_num.index,
    columns=num_pipeline.get_feature_names_out(),
)

X_num_prepared

In [None]:
# Column names of the prepared numeric frame.
X_num_prepared.columns

In [None]:


# Columns routed to the numeric branch (median impute + standardise).
num_attribs = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'number_of_reviews_ltm',
]
# Columns routed to the categorical branch.
cat_attribs = ["neighbourhood_group"]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Apply each branch to its own columns; columns listed in neither branch
# are dropped (ColumnTransformer's default remainder).
preprocessing = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

In [None]:
# Rebuild features and target from the full raw frame, then make the
# stratified 80/20 split used for all model fitting below.
y = df['price']
X = df.drop(
    columns=['price', 'name', 'host_name', 'id', 'host_id', 'license', 'last_review']
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=X['neighbourhood_group'],
)

In [None]:


# Preprocessing + linear regression in one pipeline, fitted on the training split.
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X_train, y_train)

In [None]:
# Predictions on the training data itself (in-sample).
pred = lin_reg.predict(X_train)
# First five predictions, rounded for display.
pred[:5].round(2)

In [None]:

# Training-set RMSE of the linear model, in the units of price.
lin_rmse = root_mean_squared_error(y_train, pred)
lin_rmse

In [None]:
# Test-set RMSE of the linear model.
pred_test = lin_reg.predict(X_test)
lin_rmse_test = root_mean_squared_error(y_test, pred_test)
lin_rmse_test

In [None]:


# Preprocessing + decision tree; random_state makes the fitted tree reproducible.
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(X_train, y_train)

In [None]:
# Training-set RMSE of the tree. Note: pred is rebound here and no longer
# holds the linear model's predictions.
pred = tree_reg.predict(X_train)
tree_rmse = root_mean_squared_error(y_train, pred)
tree_rmse

In [None]:
# Test-set RMSE of the tree, for comparison with its training RMSE.
pred_test = tree_reg.predict(X_test)
tree_rmse_test = root_mean_squared_error(y_test, pred_test)
tree_rmse_test

In [None]:


# 10-fold cross-validation on the training split. The scorer returns the
# negated RMSE, so the leading minus turns the scores back into RMSE values.
tree_rmses = -cross_val_score(tree_reg, X_train, y_train,
                              scoring="neg_root_mean_squared_error", cv=10)

In [None]:
# Mean, spread and range of the ten per-fold RMSE values for the tree.
pd.Series(tree_rmses).describe()

In [None]:


# Preprocessing + random forest, evaluated with the same 10-fold
# cross-validation; the minus sign converts negated RMSE back to RMSE.
forest_reg = make_pipeline(preprocessing,
                           RandomForestRegressor(random_state=42))
forest_rmses = -cross_val_score(forest_reg, X_train, y_train,
                                scoring="neg_root_mean_squared_error", cv=10)

In [None]:
# Mean, spread and range of the ten per-fold RMSE values for the forest.
pd.Series(forest_rmses).describe()

Hyperparameter Tuning: Grid Search with 3-Fold Cross-Validation

In [None]:
# Pipeline with named steps so the grid can address the forest's
# parameters via the "random_forest__" prefix.
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42)),
])
# Candidate values for the number of features considered at each split.
param_grid = [
    {'random_forest__max_features': [4, 6, 8, 10]}
]
# 3-fold cross-validated search; scores are negated RMSE, so higher is better.
grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# Full search results as a table, best candidate first (scores are negated
# RMSE, so the largest value is the best one).
cv_res = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
cv_res.head()

Getting the Final Model.

In [None]:
# Best pipeline found by the search.
final_model = grid_search.best_estimator_  # includes preprocessing
# Importance of each preprocessed feature according to the fitted forest.
feature_importances = final_model["random_forest"].feature_importances_
feature_importances.round(2)

In [None]:
# Pair each importance with its preprocessed feature name and list them
# from most to least important.
sorted(zip(feature_importances,
           final_model["preprocessing"].get_feature_names_out()),
           reverse=True)

In [None]:
# Evaluate the tuned pipeline on the held-out test split.
final_predictions = final_model.predict(X_test)

final_rmse = root_mean_squared_error(y_test, final_predictions)
print(final_rmse)

Saving Model for Later.

In [None]:
# Enable dill's 'recurse' setting before serialising the fitted pipeline.
dill.settings['recurse'] = True
# The context manager guarantees the file is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle to the garbage collector.
with open('model.pkl', 'wb') as model_file:
    dill.dump(final_model, model_file)

In [None]:
# Reload the pipeline from disk and check that it still predicts.
# NOTE: unpickling can execute arbitrary code; only load files you created
# yourself or otherwise trust.
# The context manager closes the file handle once the model has been read.
with open('model.pkl', 'rb') as model_file:
    model_saved = dill.load(model_file)
y_pred = model_saved.predict(X_test)
y_pred