# Data Processing
### Importing the libraries

In [1]:
import pandas as pd
import numpy as np

### Loading the data

In [2]:
# Read the raw listings file; the path is relative to this notebook's folder.
data = pd.read_csv(filepath_or_buffer='../1 - Original data/houses_Madrid.csv')

### Processing the data
#### - Deleting unnecessary columns

In [3]:
# Names of the columns that are removed from the raw data before any further
# processing. Entries are laid out in short runs of related names purely for
# readability; the sequence is the same as it has always been.
columns_to_delete: list = [
    'Unnamed: 0', 'id', 'title', 'subtitle',
    'sq_mt_allotment', 'sq_mt_useful',
    'latitude', 'longitude',
    'raw_address', 'is_exact_address_hidden', 'street_name', 'street_number',
    'portal', 'floor', 'is_floor_under', 'door', 'neighborhood_id',
    'operation', 'rent_price', 'rent_price_by_area', 'is_rent_price_known',
    'buy_price_by_area', 'is_buy_price_known',
    'built_year',
    'has_central_heating', 'has_individual_heating',
    'are_pets_allowed', 'has_ac', 'has_fitted_wardrobes', 'has_lift', 'is_exterior',
    'has_garden', 'has_pool', 'has_terrace', 'has_balcony', 'has_storage_room',
    'is_furnished', 'is_kitchen_equipped', 'is_accessible', 'has_green_zones',
    'energy_certificate',
    'has_parking', 'has_private_parking', 'has_public_parking',
    'is_parking_included_in_price', 'parking_price',
    'is_orientation_north', 'is_orientation_west',
    'is_orientation_south', 'is_orientation_east',
]

In [4]:
# Remove every unwanted column in a single call instead of one `del` per
# column. With the default errors='raise', a name that is not present raises
# KeyError before anything is removed, so `data` is never left with only part
# of the columns deleted.
data = data.drop(columns=columns_to_delete)

In [5]:
# Preview the last five rows of the frame after the column removal
data.tail(n=5)

Unnamed: 0,sq_mt_built,n_rooms,n_bathrooms,n_floors,buy_price,house_type_id,is_renewal_needed,is_new_development
21737,78.0,2,2.0,,350000,HouseType 5: Áticos,False,False
21738,96.0,2,2.0,,425000,HouseType 1: Pisos,False,False
21739,175.0,4,2.0,,680000,HouseType 1: Pisos,False,False
21740,289.0,4,3.0,3.0,695000,HouseType 2: Casa o chalet,False,False
21741,72.0,2,2.0,,424000,HouseType 1: Pisos,False,True


#### - Dealing with missing data and creating dummy columns

In [6]:
# Treat missing values in the n_floors column: every missing entry is set to
# 1, on the assumption that a property with no recorded floor count has a
# single floor.
data['n_floors'] = data['n_floors'].fillna(1)

In [7]:
# One-hot encode the house_type_id column into one indicator column per
# category. dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to
# booleans, which would change both the preview below and the values written
# to the saved CSV.
data = pd.get_dummies(data, columns=['house_type_id'], dtype=int)

In [8]:
# Preview the last five rows of the frame after imputation and encoding
data.tail(n=5)

Unnamed: 0,sq_mt_built,n_rooms,n_bathrooms,n_floors,buy_price,is_renewal_needed,is_new_development,house_type_id_HouseType 1: Pisos,house_type_id_HouseType 2: Casa o chalet,house_type_id_HouseType 4: Dúplex,house_type_id_HouseType 5: Áticos
21737,78.0,2,2.0,1.0,350000,False,False,0,0,0,1
21738,96.0,2,2.0,1.0,425000,False,False,1,0,0,0
21739,175.0,4,2.0,1.0,680000,False,False,1,0,0,0
21740,289.0,4,3.0,3.0,695000,False,False,0,1,0,0
21741,72.0,2,2.0,1.0,424000,False,True,1,0,0,0


In [9]:
# Dropping every row that still contains at least one missing value
data = data.dropna(axis=0, how='any')

### Saving the processed data

In [10]:
# Write the processed frame next to this notebook.
# NOTE(review): the row index is written as an unnamed first column (the
# to_csv default, spelled out here). Reading the file back without
# index_col=0 will surface it as 'Unnamed: 0' - confirm this is what the
# downstream consumers expect before switching to index=False.
data.to_csv('data.csv', index=True)