### Data Wrangling with Pandas

In [155]:
import pandas as pd

#### Importing data

In [156]:
# Load csv files into DataFrames
df1 = pd.read_csv("data/mexico-real-estate-1.csv", encoding='latin-1') 
df2 = pd.read_csv("data/mexico-real-estate-2.csv", encoding='latin-1')
df3 = pd.read_csv("data/mexico-real-estate-3.csv", encoding='latin-1')

# Object type, info and shape for DataFrames
print("df1 type:", type(df1))
print("df1 shape:", df1.shape)
print("df1 info:", df1.info())
print()

print("df2 type:", type(df2))
print("df2 shape:", df2.shape)
print("df2 info:", df2.info())
print()

print("df3 type:", type(df3))
print("df3 shape:", df3.shape)
print("df3 info:", df3.info())
print()

df1 type: <class 'pandas.core.frame.DataFrame'>
df1 shape: (700, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     700 non-null    int64  
 1   property_type  700 non-null    object 
 2   state          700 non-null    object 
 3   lat            583 non-null    float64
 4   lon            583 non-null    float64
 5   area_m2        700 non-null    int64  
 6   price_usd      700 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 38.4+ KB
df1 info: None

df2 type: <class 'pandas.core.frame.DataFrame'>
df2 shape: (700, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     700 non-null    int64  
 1   property_type  700 non-null    object 
 2   

In [157]:
df1.head()
# NaN means not a number

Unnamed: 0.1,Unnamed: 0,property_type,state,lat,lon,area_m2,price_usd
0,1,house,Estado de México,19.560181,-99.233528,150,"$67,965.56"
1,2,house,Nuevo León,25.688436,-100.198807,186,"$63,223.78"
2,3,apartment,Guerrero,16.767704,-99.764383,82,"$84,298.37"
3,4,apartment,Guerrero,16.829782,-99.911012,150,"$94,308.80"
4,5,house,Veracruz de Ignacio de la Llave,,,175,"$94,835.67"


In [158]:
df2.head()

Unnamed: 0.1,Unnamed: 0,property_type,state,lat,lon,area_m2,price_mxn
0,1,apartment,Nuevo León,25.721081,-100.345581,72,1300000
1,2,apartment,Puebla,,,190,2500000
2,3,house,Morelos,23.634501,-102.552788,360,5300000
3,4,house,Morelos,,,76,820000
4,5,house,Puebla,,,200,1100000


In [159]:
df3.head()

Unnamed: 0.1,Unnamed: 0,property_type,place_with_parent_names,lat-lon,area_m2,price_usd
0,1,apartment,|México|Distrito Federal|Gustavo A. Madero|Acu...,"19.52589,-99.151703",71,48550.59
1,2,house,|México|Estado de México|Toluca|Metepec|,"19.2640539,-99.5727534",233,168636.73
2,3,house,|México|Estado de México|Toluca|Toluca de Lerd...,"19.268629,-99.671722",300,86932.69
3,4,house,|México|Morelos|Temixco|Burgos Bugambilias|,,275,263432.41
4,5,apartment,|México|Veracruz de Ignacio de la Llave|Veracruz|,"19.511938,-96.871956",84,68508.67


#### Data cleaning

In [160]:
# 1. Drop missing values (NaN)
df1.dropna(inplace=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 583 entries, 0 to 699
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     583 non-null    int64  
 1   property_type  583 non-null    object 
 2   state          583 non-null    object 
 3   lat            583 non-null    float64
 4   lon            583 non-null    float64
 5   area_m2        583 non-null    int64  
 6   price_usd      583 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 36.4+ KB


In [161]:
# Changing price_usd and area_m2 to float
# dropping the unnamed column

df1["price_usd"] = df1["price_usd"].str.replace("$","").str.replace(",","").astype(float)
df1["area_m2"] = df1["area_m2"].astype(float)
df1 = df1.drop("Unnamed: 0", axis=1)

In [162]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 583 entries, 0 to 699
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   property_type  583 non-null    object 
 1   state          583 non-null    object 
 2   lat            583 non-null    float64
 3   lon            583 non-null    float64
 4   area_m2        583 non-null    float64
 5   price_usd      583 non-null    float64
dtypes: float64(4), object(2)
memory usage: 31.9+ KB


In [163]:
df1.head()

Unnamed: 0,property_type,state,lat,lon,area_m2,price_usd
0,house,Estado de México,19.560181,-99.233528,150.0,67965.56
1,house,Nuevo León,25.688436,-100.198807,186.0,63223.78
2,apartment,Guerrero,16.767704,-99.764383,82.0,84298.37
3,apartment,Guerrero,16.829782,-99.911012,150.0,94308.8
5,house,Yucatán,21.052583,-89.538639,205.0,105191.37


In [164]:
df1.shape

(583, 6)

In [165]:
# 2. Handling df 2
df2.dropna(inplace=True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 571 entries, 0 to 699
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     571 non-null    int64  
 1   property_type  571 non-null    object 
 2   state          571 non-null    object 
 3   lat            571 non-null    float64
 4   lon            571 non-null    float64
 5   area_m2        571 non-null    int64  
 6   price_mxn      571 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 35.7+ KB


In [166]:
#Changing price and area_m2 to float
# dropping the unnamed column
df2 = df2.drop("Unnamed: 0", axis=1)
df2["price_mxn"] = df2["price_mxn"].astype(float)
df2["area_m2"] = df2["area_m2"].astype(float)


# calculate price_usd
df2["price_usd"] = (df2["price_mxn"] / 19).round(2)

In [167]:
df2=df2.drop("price_mxn", axis=1)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 571 entries, 0 to 699
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   property_type  571 non-null    object 
 1   state          571 non-null    object 
 2   lat            571 non-null    float64
 3   lon            571 non-null    float64
 4   area_m2        571 non-null    float64
 5   price_usd      571 non-null    float64
dtypes: float64(4), object(2)
memory usage: 31.2+ KB


In [168]:
df2.head()

Unnamed: 0,property_type,state,lat,lon,area_m2,price_usd
0,apartment,Nuevo León,25.721081,-100.345581,72.0,68421.05
2,house,Morelos,23.634501,-102.552788,360.0,278947.37
6,apartment,Estado de México,19.27204,-99.572013,85.0,65789.47
7,house,San Luis Potosí,22.138882,-100.99651,158.0,111578.95
8,apartment,Distrito Federal,19.394558,-99.129707,65.0,39904.74


In [169]:
# 2. Handling df3
df3.shape

(700, 6)

In [170]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               700 non-null    int64  
 1   property_type            700 non-null    object 
 2   place_with_parent_names  700 non-null    object 
 3   lat-lon                  582 non-null    object 
 4   area_m2                  700 non-null    int64  
 5   price_usd                700 non-null    float64
dtypes: float64(1), int64(2), object(3)
memory usage: 32.9+ KB


In [171]:

#df3 = df3.drop("Unnamed: 0", axis=1)
df3.dropna(inplace=True)

In [172]:
df3[["lat", "lon"]]=df3["lat-lon"].str.split(",", expand=True)
df3.head()

Unnamed: 0.1,Unnamed: 0,property_type,place_with_parent_names,lat-lon,area_m2,price_usd,lat,lon
0,1,apartment,|México|Distrito Federal|Gustavo A. Madero|Acu...,"19.52589,-99.151703",71,48550.59,19.52589,-99.151703
1,2,house,|México|Estado de México|Toluca|Metepec|,"19.2640539,-99.5727534",233,168636.73,19.2640539,-99.5727534
2,3,house,|México|Estado de México|Toluca|Toluca de Lerd...,"19.268629,-99.671722",300,86932.69,19.268629,-99.671722
4,5,apartment,|México|Veracruz de Ignacio de la Llave|Veracruz|,"19.511938,-96.871956",84,68508.67,19.511938,-96.871956
5,6,house,|México|Jalisco|Guadalajara|,"20.689157,-103.366728",175,102763.0,20.689157,-103.366728


In [173]:
#create "state" column from palce_with_parent_names 
df3["state"]=df3["place_with_parent_names"].str.split("|", expand=True)[2]
df3.head()

# drop columns
df3.drop(columns=["place_with_parent_names", "lat-lon", "Unnamed: 0"], inplace=True)


In [None]:
df3["state"]=df3["place_with_parent_names"].str.split("|", expand=True)[2]

In [191]:
df3.head()

Unnamed: 0,property_type,area_m2,price_usd,lat,lon,state
0,apartment,71,48550.59,19.52589,-99.151703,Distrito Federal
1,house,233,168636.73,19.2640539,-99.5727534,Estado de México
2,house,300,86932.69,19.268629,-99.671722,Estado de México
4,apartment,84,68508.67,19.511938,-96.871956,Veracruz de Ignacio de la Llave
5,house,175,102763.0,20.689157,-103.366728,Jalisco


In [190]:
#concat datasets
df = pd.concat([df1, df2, df3])
df.head()

Unnamed: 0,property_type,state,lat,lon,area_m2,price_usd
0,house,Estado de México,19.560181,-99.233528,150.0,67965.56
1,house,Nuevo León,25.688436,-100.198807,186.0,63223.78
2,apartment,Guerrero,16.767704,-99.764383,82.0,84298.37
3,apartment,Guerrero,16.829782,-99.911012,150.0,94308.8
5,house,Yucatán,21.052583,-89.538639,205.0,105191.37


In [188]:
# Save the datasets
df.to_csv("data/mexico-real-estate-clean.csv", index=False)