
<div class="alert alert-info" role="alert">
  <p>
    <b>Usage Guidelines</b>
  </p>

  <p>
    This lesson is part of the <b>DS Lab core curriculum</b>. For that reason, this notebook can only be used on your WQU virtual machine.
  </p>

  <p>
    This means:
    <ul>
      <li><span style="color: red">ⓧ</span> No downloading this notebook.</li>
      <li><span style="color: red">ⓧ</span> No re-sharing of this notebook with friends or colleagues.</li>
      <li><span style="color: red">ⓧ</span> No downloading the embedded videos in this notebook.</li>
      <li><span style="color: red">ⓧ</span> No re-sharing embedded videos with friends or colleagues.</li>
      <li><span style="color: red">ⓧ</span> No adding this notebook to public or private repositories.</li>
      <li><span style="color: red">ⓧ</span> No uploading this notebook (or screenshots of it) to other websites, including websites for study resources.</li>
    </ul>

  </p>
</div>


# <font size="+3"><strong>2.5. Predicting Apartment Prices in Mexico City 🇲🇽</strong></font>

In [1]:
import warnings

import wqet_grader

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Initialise the grader client for the "Project 2 Assessment" assignment.
# NOTE(review): the saved output of this cell is a "Connection refused" error
# for localhost:2400, so this call needs the grading service to be reachable.
wqet_grader.init("Project 2 Assessment")

Exception: Could not connect to Grading Service API: HTTPConnectionPool(host='localhost', port=2400): Max retries exceeded with url: /1/track (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x14a8c2920>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [3]:
# Import libraries here
from glob import glob

import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder
from IPython.display import VimeoVideo
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Build your `wrangle` function
def wrangle(filepath):
    """Read one Mexico City real-estate CSV and return a cleaned DataFrame.

    Processing steps, in order:

    1. Keep only rows where ``property_type`` is "apartment",
       ``place_with_parent_names`` contains "Distrito Federal" and
       ``price_aprox_usd`` is below 100,000.
    2. Keep only rows whose ``surface_covered_in_m2`` lies between the 10th
       and 90th percentile (both inclusive) of the rows left after step 1.
    3. Split ``lat-lon`` on "," into float columns ``lat`` and ``lon``.
    4. Store the second "|"-separated field of ``place_with_parent_names``
       in a new ``borough`` column.
    5. Drop the source columns of steps 3-4 and the columns that are not
       used further (see the list in the body).

    Parameters
    ----------
    filepath : str, path object or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the row labels of the CSV are preserved) whose columns
        are the untouched input columns followed by ``lat``, ``lon`` and
        ``borough``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the CSV.
    """
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: apartments in "Distrito Federal" priced below 100,000 USD.
    # `na=False` treats a missing place name as "no match" instead of letting
    # NaN leak into the boolean mask.
    in_distrito_federal = df["place_with_parent_names"].str.contains(
        "Distrito Federal", na=False
    )
    is_apartment = df["property_type"] == "apartment"
    below_price_cap = df["price_aprox_usd"] < 100_000
    df = df[in_distrito_federal & is_apartment & below_price_cap]

    # Subset data: remove outliers for "surface_covered_in_m2" by keeping the
    # central 80% (10th to 90th percentile, inclusive).
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    # `.copy()` yields an independent frame, so the column assignments below
    # do not write into a slice of the raw data (SettingWithCopyWarning).
    df = df[df["surface_covered_in_m2"].between(low, high)].copy()

    # Split "lat-lon" column
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Get place name
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]

    columns_to_drop = [
        # Already decomposed into lat / lon / borough above.
        "lat-lon",
        "place_with_parent_names",
        # NOTE(review): presumably mostly-missing columns — confirm against the data.
        "surface_total_in_m2",
        "floor",
        "rooms",
        "expenses",
        # Other price columns; presumably removed so they cannot leak the
        # `price_aprox_usd` value into the features.
        "price",
        "price_aprox_local_currency",
        "price_per_m2",
        "price_usd_per_m2",
        # NOTE(review): presumably low- or high-cardinality categoricals — confirm.
        "operation",
        "property_type",
        "currency",
        "properati_url",
    ]
    # A single non-mutating drop replaces the chain of `inplace=True` drops.
    return df.drop(columns=columns_to_drop)

In [None]:
# Try `wrangle` on the first CSV and preview the cleaned rows.
# (For the share of missing values per column, display
# `df.isna().sum() * 100.0 / len(df)` on the raw or wrangled frame instead.)
df = wrangle("data/mexico-city-real-estate-1.csv")
df.head()

In [None]:

# Submit the DataFrame that `wrangle` returns for the first CSV to the grader
# as "Task 2.5.1" of the "Project 2 Assessment".
wqet_grader.grade(
    "Project 2 Assessment", "Task 2.5.1", wrangle("data/mexico-city-real-estate-1.csv")
)

In [None]:

# Collect every training CSV that matches the numbered-file pattern.
# `glob` returns matches in arbitrary, filesystem-dependent order, so the list
# is sorted to keep later per-file processing (and the row order of anything
# built from it) reproducible across machines and runs.
files = sorted(glob("data/mexico-city-real-estate-*.csv"))

In [None]:
# Submit the list of CSV paths to the grader as "Task 2.5.2".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.2", files)

In [None]:
# Clean every file with `wrangle` and stack the results into one DataFrame.
# `pd.concat` keeps each frame's own row labels, so labels can repeat across
# files; that default is left unchanged on purpose.
frames = [wrangle(file) for file in files]
df = pd.concat(frames)

# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly; wrapping it in `print` would add a stray "None" line.
df.info()
df.head()

In [None]:

# Submit the combined, wrangled DataFrame to the grader as "Task 2.5.3".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.3", df)

In [None]:
# Build histogram on a fresh figure so nothing drawn earlier can bleed into it
fig, ax = plt.subplots()
ax.hist(df["price_aprox_usd"])

# Label axes (the bar heights are raw observation counts)
ax.set_xlabel("Price [$]")
ax.set_ylabel("Count")

# Add title
ax.set_title("Distribution of Apartment Prices")

# Don't delete the code below 👇
plt.savefig("images/2-5-4.png", dpi=150)


In [None]:
# Open the saved image "images/2-5-4.png" in binary mode and pass the open
# file handle to the grader as "Task 2.5.4"; the `with` block closes it after.
with open("images/2-5-4.png", "rb") as file:
    wqet_grader.grade("Project 2 Assessment", "Task 2.5.4", file)

In [None]:
# Scatter of price (y) against covered area (x), drawn on the current axes
ax = plt.gca()
ax.scatter(x=df["surface_covered_in_m2"], y=df["price_aprox_usd"])

# Axis labels and title set in one call
ax.set(
    xlabel="Area [sq meters]",
    ylabel="Price [USD]",
    title="Mexico City: Price vs. Area",
)

# Don't delete the code below 👇
plt.savefig("images/2-5-5.png", dpi=150)


In [None]:
# Open the saved image "images/2-5-5.png" in binary mode and pass the open
# file handle to the grader as "Task 2.5.5"; the `with` block closes it after.
with open("images/2-5-5.png", "rb") as file:
    wqet_grader.grade("Project 2 Assessment", "Task 2.5.5", file)

In [None]:
# Interactive map: one marker per row of `df`, coloured by approximate USD price
fig = px.scatter_mapbox(
    data_frame=df,
    lat="lat",
    lon="lon",
    color="price_aprox_usd",
    hover_data=["price_aprox_usd"],  # include the price in the hover tooltip
    width=600,  # figure width
    height=600,  # figure height
)

# Select the "open-street-map" base map style, then render the figure
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [None]:
# Separate the predictor columns (`X_train`) from the column to predict (`y_train`)
target = "price_aprox_usd"
features = ["surface_covered_in_m2", "lat", "lon", "borough"]

y_train = df[target]
X_train = df[features]

In [None]:

# Submit the feature matrix to the grader as "Task 2.5.7a".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.7a", X_train)

In [None]:

# Submit the target vector to the grader as "Task 2.5.7b".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.7b", y_train)

In [None]:
# Baseline model: predict the mean training price for every observation
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]

# Mean absolute error of that constant prediction on the training target
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)

print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

In [None]:
# Submit the baseline MAE, wrapped in a one-element list, as "Task 2.5.8".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.8", [baseline_mae])

In [None]:
# Pipeline steps, in order: one-hot encoder (category values used in the
# generated column names) -> imputer with default settings -> ridge regression
model = make_pipeline(OneHotEncoder(use_cat_names=True), SimpleImputer(), Ridge())

# Train on the training split; the fit call is the cell's last expression,
# so its return value is what the cell displays
model.fit(X_train, y_train)

In [None]:

# Submit the fitted pipeline object to the grader as "Task 2.5.9".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.9", model)

In [None]:
# Load the test feature set from its CSV file.
X_test = pd.read_csv("data/mexico-city-test-features.csv")

# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly; wrapping it in `print` would add a stray "None" line.
X_test.info()
X_test.head()

In [None]:
# Display the first rows of the training features.
# NOTE(review): presumably kept to compare its columns with `X_test` — confirm
# this cell is still needed.
X_train.head()

In [None]:

# Submit the loaded test features to the grader as "Task 2.5.10".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.10", X_test)

In [None]:
# Predict with the fitted pipeline on the test features and wrap the result
# in a Series (default integer index), then preview the first values
y_test_pred = pd.Series(data=model.predict(X_test))
y_test_pred.head()

In [None]:
# Submit the Series of test-set predictions to the grader as "Task 2.5.11".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.11", y_test_pred)

In [None]:
# Coefficients learned by the final ("ridge") step of the pipeline.
coefficients = model.named_steps["ridge"].coef_

# Column names produced by the one-hot encoder step.
# Newer category_encoders releases expose `get_feature_names_out`, while the
# older `get_feature_names` is not available in every release; prefer the new
# method and fall back to the old one so the cell runs on either version.
encoder = model.named_steps["onehotencoder"]
if hasattr(encoder, "get_feature_names_out"):
    features = list(encoder.get_feature_names_out())
else:
    features = encoder.get_feature_names()

# One coefficient per encoded column, labelled by that column's name.
feat_imp = pd.Series(coefficients, index=features)
feat_imp

In [None]:

# Submit the Series of coefficients indexed by feature name as "Task 2.5.12".
wqet_grader.grade("Project 2 Assessment", "Task 2.5.12", feat_imp)

In [None]:
# Horizontal bars for the 15 coefficients with the largest absolute value
# (sorted ascending by magnitude, so the last 15 entries are the largest)
largest_by_magnitude = feat_imp.sort_values(key=abs).tail(15)
ax = largest_by_magnitude.plot(kind="barh")

# Axis labels and title set in one call
ax.set(
    xlabel="Importance [USD]",
    ylabel="Feature",
    title="Feature Importances for Apartment Price",
)

# Don't delete the code below 👇
plt.savefig("images/2-5-13.png", dpi=150)


In [None]:
# Open the saved image "images/2-5-13.png" in binary mode and pass the open
# file handle to the grader as "Task 2.5.13"; the `with` block closes it after.
with open("images/2-5-13.png", "rb") as file:
    wqet_grader.grade("Project 2 Assessment", "Task 2.5.13", file)