## Predicting Price with Neighborhood


In [15]:
import warnings
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder
from IPython.display import VimeoVideo
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

warnings.simplefilter(action="ignore", category=FutureWarning)


# IMPORT


In [16]:
def wrangle(filepath):
    """Load one real-estate CSV and return a cleaned DataFrame.

    Steps applied, in order:
      1. Keep only rows whose "place_with_parent_names" contains
         "Capital Federal", whose "property_type" is "apartment", and whose
         "price_aprox_usd" is below 400,000.
      2. Keep only rows whose "surface_covered_in_m2" lies between the 10th
         and 90th percentile (inclusive) of the remaining rows.
      3. Split "lat-lon" into float columns "lat" and "lon".
      4. Extract "neighborhood" as the fourth "|"-separated field of
         "place_with_parent_names".
      5. Drop the "lat-lon" and "place_with_parent_names" source columns.

    Parameters
    ----------
    filepath : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (original index labels preserved) with the new
        "lat", "lon" and "neighborhood" columns appended.
    """
    df = pd.read_csv(filepath)

    # Subset data: apartments in "Capital Federal", less than 400,000.
    # na=False treats a missing location as "no match" so the mask is strictly boolean.
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal", na=False)
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    # .copy() gives us an independent frame, so the column assignments below
    # never write into a slice of the raw data (no SettingWithCopyWarning).
    df = df[mask_ba & mask_apt & mask_price].copy()

    # Subset data: remove outliers for "surface_covered_in_m2".
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area].copy()

    # Split "lat-lon" into numeric latitude / longitude columns.
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Neighborhood is the fourth "|"-delimited field of the location string.
    df["neighborhood"] = df["place_with_parent_names"].str.split("|").str[3]

    # Drop the source columns without mutating in place.
    return df.drop(columns=["lat-lon", "place_with_parent_names"])
    

# Use `glob` to collect every CSV file whose name matches the pattern

In [17]:
# Collect the data files. glob() returns matches in arbitrary, OS-dependent
# order, so sort them to keep the row order of the combined data reproducible.
files = sorted(glob("data/buenos-aires-real-estate-*.csv"))
if not files:
    # Fail here with a clear message instead of a later "No objects to
    # concatenate" error from pd.concat.
    raise FileNotFoundError(
        "No files match 'data/buenos-aires-real-estate-*.csv'"
    )

# Clean each file with wrangle(); a comprehension avoids leaking a loop
# variable named `df` into the notebook namespace.
frames = [wrangle(file) for file in files]


# Concatenate the frames into a single DataFrame

In [18]:
# Stack the per-file frames into one DataFrame, renumbering the index from 0.
df = pd.concat(objs=frames, ignore_index=True)
df.head(n=5)

Unnamed: 0,operation,property_type,price,currency,price_aprox_local_currency,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,price_per_m2,floor,rooms,expenses,properati_url,lat,lon,neighborhood
0,sell,apartment,120000.0,USD,1819488.0,120000.0,,55.0,,2181.818182,,2.0,,http://villa-general-mitre.properati.com.ar/xx...,-34.616004,-58.470506,Villa General Mitre
1,sell,apartment,89000.0,USD,1349453.6,89000.0,,37.0,,2405.405405,7.0,2.0,,http://palermo.properati.com.ar/ya5i_venta_dep...,-34.584712,-58.444927,Palermo
2,sell,apartment,183495.0,USD,2782224.58,183495.0,92.0,57.0,1994.51087,3219.210526,,2.0,,http://saavedra.properati.com.ar/12izq_venta_d...,-34.554652,-58.493644,Saavedra
3,sell,apartment,95000.0,USD,1440428.0,95000.0,53.0,47.0,1792.45283,2021.276596,,2.0,,http://villa-del-parque.properati.com.ar/wy0n_...,-34.610581,-58.479625,Villa del Parque
4,sell,apartment,95000.0,USD,1440428.0,95000.0,0.0,35.0,,2714.285714,,1.0,,http://belgrano.properati.com.ar/xw9a_venta_de...,-34.558227,-58.458357,Belgrano


# Explore the df

In [19]:
# Dimensions of the combined DataFrame as a (rows, columns) tuple.
df.shape

(6582, 17)

# Create your feature matrix X_train and target vector y_train. X_train should contain one feature: "neighborhood". Your target is "price_aprox_usd".



In [20]:
# One categorical predictor; the approximate USD price is the response.
features = ["neighborhood"]
target = "price_aprox_usd"

# Feature matrix (DataFrame) and target vector (Series).
X_train = df.loc[:, features]
y_train = df.loc[:, target]

## Build Model - baseline

In [None]:
# Naive baseline: predict the mean training price for every observation.
y_mean = y_train.mean()
# Size the baseline from y_train (not df) so it always lines up with the target vector.
y_pred_baseline = [y_mean] * len(y_train)
print("Mean apt price ", y_mean)
# This value is the baseline's mean absolute error, not a price — label it as such.
print("Baseline MAE ", mean_absolute_error(y_train, y_pred_baseline))

## Iterate

First, instantiate a `OneHotEncoder` named `ohe`. Make sure to set the `use_cat_names` argument to `True`. Next, fit your transformer to the feature matrix `X_train`. Finally, use your encoder to transform the feature matrix `X_train`, and assign the transformed data to the variable `XT_train`.

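Note: fitting `LinearRegression` directly on `X_train` raises an error, because `neighborhood` is a string column and the regression's arithmetic needs numeric (float) inputs. That is why the feature is one-hot encoded first.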

In [None]:
# One-hot encode the neighborhood column, naming each new column after its category.
ohe = OneHotEncoder(use_cat_names=True)
# Fit the encoder on the training features, then encode those same features.
XT_train = ohe.fit(X_train).transform(X_train)
XT_train.head()

In [None]:
# Pipeline: one-hot encode the neighborhood, then fit an ordinary least-squares model.
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    LinearRegression(),
)
model.fit(X_train, y_train)

# In-sample predictions from the fitted pipeline.
y_pred = model.predict(X_train)

# Baseline reference values (y_mean and y_pred_baseline come from the baseline cell).
print("Mean apt price ", y_mean)
# This value is the baseline's mean absolute error, not a price — label it as such.
print("Baseline MAE ", mean_absolute_error(y_train, y_pred_baseline))


In [None]:
# Evaluate the fitted pipeline on the data it was trained on.
y_pred_training = model.predict(X_train)
mae_training = mean_absolute_error(y_train, y_pred_training)

print("Mean apt price ", y_mean)
# This value is the baseline's mean absolute error, not a price — label it as such.
print("Baseline MAE ", mean_absolute_error(y_train, y_pred_baseline))
print("Training MAE ", mae_training)


In [26]:
# Load the held-out feature file and keep only the columns the model was trained on.
X_test = pd.read_csv("data/buenos-aires-test-features.csv").loc[:, features]
# Wrap the predicted prices in a Series.
y_pred_test = pd.Series(data=model.predict(X_test))
y_pred_test.head(n=5)

0    249409.478261
1    161530.079797
2     98036.930192
3    110675.394444
4    127796.268745
dtype: float64

In [27]:
# Pull the fitted regression step out of the pipeline once, then read its parameters.
regressor = model.named_steps["linearregression"]
intercept, coefficients = regressor.intercept_, regressor.coef_

print("coefficients len:", len(coefficients))
print(coefficients[:5])  # First five coefficients

## Task 2.3.11: Extract the intercept and coefficients from the fitted model.

coefficients len: 57
[  9152.9828972   47056.21913214  15837.7992972  -11670.09791089
  48091.90019792]


In [28]:
# Ask the pipeline's encoder step for the names of the one-hot columns it produces.
encoder = model.named_steps["onehotencoder"]
feature_names = encoder.get_feature_names_out()

print("features len:", len(feature_names))
print(feature_names[:5])  # First five feature names

## Task 2.3.12: Extract the feature names of the encoded data from the `OneHotEncoder` step of the model.

features len: 57
['neighborhood_Villa General Mitre' 'neighborhood_Palermo'
 'neighborhood_Saavedra' 'neighborhood_Villa del Parque'
 'neighborhood_Belgrano']


In [29]:
# Pair each coefficient with its encoded feature name, sorted from most
# negative to most positive.
feat_imp = pd.Series(
    data=model.named_steps["linearregression"].coef_,
    index=feature_names,
).sort_values()
feat_imp.head(n=5)

neighborhood_Villa Soldati   -73008.387103
neighborhood_Pompeya         -51638.879103
neighborhood_Villa Lugano    -49137.642103
neighborhood_Catalinas       -42972.017103
neighborhood_Constitución    -41623.852436
dtype: float64

In [None]:
# Write the fitted model out as an equation: the intercept first, then one
# "+ (coefficient * feature)" term per encoded neighborhood.
equation_terms = [
    f"+ ({round(coef, 2)} * {name})" for name, coef in feat_imp.items()
]
print(f"price = {intercept.round(2)}")
for term in equation_terms:
    print(term)