In [None]:
# Import libraries here
from glob import glob
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline

Prepare Data


Import

In [None]:
# Write a wrangle function that takes the name of a CSV file as input and returns a DataFrame. The function should do the following steps:

#Subset the data in the CSV file and return only apartments in Mexico City ("Distrito Federal") that cost less than $100,000.
#Remove outliers by trimming the bottom and top 10% of properties in terms of "surface_covered_in_m2".
#Create separate "lat" and "lon" columns.
#Mexico City is divided into 16 boroughs. Create a "borough" feature from the "place_with_parent_names" column.
#Drop columns that are more than 50% null values.
#Drop columns containing low- or high-cardinality categorical values.
#Drop any columns that would constitute leakage for the target "price_aprox_usd".
#Drop any columns that would create issues of multicollinearity.
# Build your `wrangle` function

def wrangle(filepath):
    """Load one real-estate CSV and return a cleaned DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``place_with_parent_names`` contains "Distrito Federal",
        whose ``property_type`` is "apartment" and whose ``price_aprox_usd``
        is below 100,000, restricted to the 10th-90th percentile range of
        ``surface_covered_in_m2``. ``lat``, ``lon`` and ``borough`` columns
        are added; the source columns they were built from, plus the
        hand-listed high-null, low/high-cardinality and leakage columns,
        are removed.

    Raises
    ------
    KeyError
        If any of the columns this function reads or drops is missing.
    """
    df = pd.read_csv(filepath)

    # Subset: apartments in Distrito Federal priced under 100,000 USD.
    # na=False makes rows with a missing place name fall out of the mask
    # explicitly instead of propagating NaN into the boolean logic.
    mask_city = df["place_with_parent_names"].str.contains(
        "Distrito Federal", na=False
    )
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 100000
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the raw data (avoids SettingWithCopyWarning).
    df = df[mask_city & mask_apt & mask_price].copy()

    # Remove outliers: keep the 10th-90th percentile of covered surface.
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    df = df[df["surface_covered_in_m2"].between(low, high)].copy()

    # Split the "lat,lon" string column into two float columns.
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Borough is the first named segment of the pipe-delimited place string
    # (segment 0 is the empty text before the leading "|").
    # NOTE(review): assumes the borough comes first in the string - confirm
    # against the raw data.
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]

    columns_to_drop = [
        # Source columns replaced by lat / lon / borough
        "lat-lon",
        "place_with_parent_names",
        # High null count (listed by hand, not computed from the data)
        "surface_total_in_m2",
        "price_usd_per_m2",
        "floor",
        "rooms",
        "expenses",
        # Low- or high-cardinality categoricals
        "operation",
        "property_type",
        "currency",
        "properati_url",
        # Leakage: other expressions of the target price
        "price",
        "price_aprox_local_currency",
        "price_per_m2",
    ]
    # Single non-inplace drop: returns a new frame and leaves no hidden state.
    return df.drop(columns=columns_to_drop)

In [None]:
#Use glob to create the list files. It should contain the filenames of all the Mexico City real estate CSVs in the ./data directory, except for mexico-city-test-features.csv
# The pattern matches only the training CSVs, so mexico-city-test-features.csv
# is left out. sorted() is used because glob's ordering depends on the file
# system; a fixed order keeps the combined DataFrame reproducible across runs.
files = sorted(glob('data/mexico-city-real-estate-*.csv'))
files

In [None]:
# Combine your wrangle function, a list comprehension, and pd.concat to create a DataFrame df. It should contain all the properties from the five CSVs in files.
# Wrangle every CSV on its own, then stack the cleaned frames into one
# DataFrame with a fresh 0..n-1 index. `frames` must be built here: it is not
# defined anywhere earlier in the notebook.
frames = [wrangle(file) for file in files]
df = pd.concat(frames, ignore_index=True)
df.head(10)

Explore


In [None]:
#Create a histogram showing the distribution of apartment prices ("price_aprox_usd") in df. Be sure to label the x-axis "Area [sq meters]", the y-axis "Count", and give it the title "Distribution of Apartment Prices". Use Matplotlib (plt)
# Draw the histogram on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.hist(df['price_aprox_usd'])

# Axis labels
# NOTE(review): the plotted variable is price in USD; the x-label text below
# is the one the task statement asks for.
ax.set_xlabel('Area [sq meters]')
ax.set_ylabel('Count')

# Title
ax.set_title('Distribution of Apartment Prices')

# Don't delete the code below 👇
fig.savefig("images/2-5-4.png", dpi=150)

In [None]:
#Create a scatter plot that shows apartment price ("price_aprox_usd") as a function of apartment size ("surface_covered_in_m2"). Be sure to label your axes "Price [USD]" and "Area [sq meters]", respectively. Your plot should have the title "Mexico City: Price vs. Area". Use Matplotlib (plt)
# Build scatter plot: price as a function of covered area
plt.scatter(x=df['surface_covered_in_m2'], y=df['price_aprox_usd'])

# Label axes (text matches the task statement: note the space before "[")
plt.xlabel('Area [sq meters]')
plt.ylabel('Price [USD]')

# Add title (task statement spells it "vs." with a period)
plt.title('Mexico City: Price vs. Area')

# Don't delete the code below 👇
plt.savefig("images/2-5-5.png", dpi=150)

In [None]:
#Create a Mapbox scatter plot that shows the location of the apartments in your dataset and represent their price using color.

# What areas of the city seem to have higher real estate prices?
# Plot Mapbox location and price
fig = px.scatter_mapbox(
    df,
    lat='lat',
    lon='lon',
    color='price_aprox_usd',  # Encode price as marker colour, as the task asks
    center={"lat": 19.43, "lon": -99.13},  # Map will be centered on Mexico City
    width=600,  # Width of map
    height=600,  # Height of map
    hover_data=["price_aprox_usd"],  # Display price when hovering mouse over house
)

# open-street-map tiles are selected as the base layer
fig.update_layout(mapbox_style="open-street-map")

fig.show()

Split

In [None]:
# Split data into feature matrix `X_train` and target vector `y_train`.
# Column to predict, and the columns the model is allowed to see
target = "price_aprox_usd"
feature = [
    "surface_covered_in_m2",
    "lat",
    "lon",
    "borough",
]

# Target vector first, then the feature matrix
y_train = df[target]
X_train = df[feature]

Build Model

Baseline

In [None]:
#Calculate the baseline mean absolute error for your model.
# Naive baseline: predict the mean training price for every apartment
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)

print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

Iterate

In [None]:
# Build model: one-hot encode categoricals -> impute missing values -> Ridge.
# use_cat_names=True names the encoded columns after the category value
# (e.g. "borough_<name>") rather than an ordinal suffix, so the coefficients
# inspected later can be read by a human.
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge(),
)
# Fit model
model.fit(X_train, y_train)

Evaluate

In [None]:
# Read the CSV file mexico-city-test-features.csv into the DataFrame X_test
X_test = pd.read_csv('data/mexico-city-test-features.csv')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
X_test.info()
X_test.head()

In [None]:
#Use your model to generate a Series of predictions for X_test. When you submit your predictions to the grader, it will calculate the mean absolute error for your model.
# Run the fitted pipeline on the test features and wrap the result in a Series
test_predictions = model.predict(X_test)
y_test_pred = pd.Series(test_predictions)
y_test_pred.head()

Communicate Result

In [None]:
#Create a Series named feat_imp. The index should contain the names of all the features your model considers when making predictions; the values should be the coefficient values associated with each feature. The Series should be sorted ascending by absolute value.
# Ridge coefficients, one per column produced by the encoder step
coefficients = model.named_steps['ridge'].coef_
encoder = model.named_steps['onehotencoder']
# NOTE(review): newer category_encoders releases expose the scikit-learn style
# get_feature_names_out(); fall back to get_feature_names() when it is absent.
# Confirm against the installed version.
if hasattr(encoder, 'get_feature_names_out'):
    feature_names = encoder.get_feature_names_out()
else:
    feature_names = encoder.get_feature_names()
# Sort ascending by magnitude but keep the signed coefficient values: calling
# .abs() before sorting would throw away the direction of each effect.
feat_imp = pd.Series(coefficients, index=feature_names).sort_values(key=abs)
feat_imp

In [None]:
#Create a horizontal bar chart that shows the 10 most influential coefficients for your model. Be sure to label your x- and y-axis "Importance [USD]" and "Feature", respectively, and give your chart the title "Feature Importances for Apartment Price"
# Build bar chart
# feat_imp is sorted ascending by magnitude, so the last 10 entries are the
# 10 most influential coefficients.
feat_imp.tail(10).plot(kind='barh')


# Label axes

plt.xlabel('Importance [USD]')
plt.ylabel('Feature')
# Add title (task statement uses the plural "Importances")
plt.title('Feature Importances for Apartment Price')

# Don't delete the code below 👇
plt.savefig("images/2-5-13.png", dpi=150)
