### 0. Import Libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import plotly.express as px
import time
from multiprocessing import Pool

### 1. Data Collection

In [5]:
# Collect Data

data = pd.read_csv('Datasets/kc_house_data.csv')

### 2. Initial EDA

In [6]:
#data overview
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [7]:
#list data columns
data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [8]:
#list data types
data.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [9]:
# Supress scientific notation 
pd.set_option('display.float_format', lambda x: '%.1f' % x)

In [10]:
# Verifying duplicated IDs
data = data['id'].unique()
unique_ids = len(data)
len(data) - unique_ids

177

In [11]:
# Create dataset with unique IDs, keeping the most recent acordingly to 'date'

data_recent_only = data.sort_values('date', ascending=True).drop_duplicates(subset='id', keep='last')
data_recent_only.shape

(21436, 21)

In [12]:
# Descriptive Analysis
data.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580301520.9,540088.1,3.4,2.1,2079.9,15107.0,1.5,0.0,0.2,3.4,7.7,1788.4,291.5,1971.0,84.4,98077.9,47.6,-122.2,1986.6,12768.5
std,2876565571.3,367127.2,0.9,0.8,918.4,41420.5,0.5,0.1,0.8,0.7,1.2,828.1,442.6,29.4,401.7,53.5,0.1,0.1,685.4,27304.2
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.2,-122.5,399.0,651.0
25%,2123049194.0,321950.0,3.0,1.8,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.5,-122.3,1490.0,5100.0
50%,3904930410.0,450000.0,3.0,2.2,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.6,-122.2,1840.0,7620.0
75%,7308900445.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.7,-122.1,2360.0,10083.0
max,9900000190.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.8,-121.3,6210.0,871200.0


### 3. Data Cleaning

In [13]:
#Correcting date formats

data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d' )
data['yr_built'] = pd.to_datetime(data['yr_built'], format='%Y' )
data['yr_built'] = data['yr_built'].dt.year


In [14]:
# 33 Bedroom house outlier analysis

data = data[['id', 'bedrooms', 'price', 'sqft_living', 'sqft_above','zipcode', 'waterfront']].sort_values('bedrooms', ascending=False)
data

Unnamed: 0,id,bedrooms,price,sqft_living,sqft_above,zipcode,waterfront
15870,2402100895,33,640000.0,1620,1040,98103,0
8757,1773100755,11,520000.0,3000,2400,98106,0
15161,5566100170,10,650000.0,3610,3010,98006,0
13314,627300145,10,1148000.0,4590,2500,98004,0
19254,8812401450,10,660000.0,2920,1860,98105,0
...,...,...,...,...,...,...,...
19452,3980300371,0,142000.0,290,290,98024,0
8484,2310060040,0,240000.0,1810,1810,98038,0
875,6306400140,0,1095000.0,3064,3064,98102,0
8477,2569500210,0,339950.0,2290,2290,98042,0


In [15]:
data[(df['zipcode'] == 98103) & (df['bedrooms'] == 3)]

Unnamed: 0,id,bedrooms,price,sqft_living,sqft_above,zipcode,waterfront
14222,4083301120,3,705000.0,1440,1440,98103,0
21608,263000018,3,360000.0,1530,1530,98103,0
21421,993001961,3,374950.0,1390,1390,98103,0
13034,3126049261,3,259250.0,940,940,98103,0
20231,1972201964,3,500000.0,1420,1420,98103,0
...,...,...,...,...,...,...,...
9278,8129700644,3,780000.0,2080,2080,98103,0
9361,4310701575,3,429000.0,1410,1410,98103,0
249,3797001895,3,481000.0,1560,770,98103,0
12501,4083801395,3,780000.0,1970,1970,98103,0


In [16]:
# Houses in the same region, with no waterfront, bigger living area, fewer bedrooms are costing more than the refered house.
# By analysing houses within the same caracteristics and 3 bedroom, seems reasonable that 33 was a typo that should be only 3.

In [17]:
data = data.sort_values('bedrooms', ascending=False).reset_index()

In [18]:
data.loc[0, 'bedrooms'] = 3

### 4. Data Transformation

In [19]:
# Create Condition status based on condition level
    # if 'condition' <=2, ‘bad’
    # if 'condition' = 3 or 4, ‘regular’
    # if 'condition' =>5, ‘good’

data['condition_type'] = data['condition'].apply( lambda x: 'bad' if x <= 2 else 'regular' if x <= 4 else 'good')

In [20]:
# Drop unescessary columns

data = data.drop(['sqft_living15' , 'sqft_lot15'], axis=1)

In [21]:
# Defining Price Range and Price Range lvl

data['price_range'] = data['price'].apply( lambda x: 'up to $321950' if x < 321950 else
                                                     '$321950 to $450000' if x < 450000 else
                                                     '$450000 to $645000' if x < 645000 else
                                                     'from $645000')

data['price_cat'] = data['price'].apply( lambda x: 0 if x < 321950 else
                                                     1 if x < 450000 else
                                                     2 if x < 645000 else
                                                     3)

In [22]:
data

Unnamed: 0,index,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,condition_type,price_range,price_cat
0,15870,2402100895,2014-06-25,640000.0,3,1.8,1620,6000,1.0,0,...,1040,580,1947,0,98103,47.7,-122.3,good,$450000 to $645000,2
1,8757,1773100755,2014-08-21,520000.0,11,3.0,3000,4960,2.0,0,...,2400,600,1918,1999,98106,47.6,-122.4,regular,$450000 to $645000,2
2,15161,5566100170,2014-10-29,650000.0,10,2.0,3610,11914,2.0,0,...,3010,600,1958,0,98006,47.6,-122.2,regular,from $645000,3
3,13314,627300145,2014-08-14,1148000.0,10,5.2,4590,10920,1.0,0,...,2500,2090,2008,0,98004,47.6,-122.1,regular,from $645000,3
4,19254,8812401450,2014-12-29,660000.0,10,3.0,2920,3745,2.0,0,...,1860,1060,1913,0,98105,47.7,-122.3,regular,from $645000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,19452,3980300371,2014-09-26,142000.0,0,0.0,290,20875,1.0,0,...,290,0,1963,0,98024,47.5,-121.9,bad,up to $321950,0
21609,8484,2310060040,2014-09-25,240000.0,0,2.5,1810,5669,2.0,0,...,1810,0,2003,0,98038,47.3,-122.1,regular,up to $321950,0
21610,875,6306400140,2014-06-12,1095000.0,0,0.0,3064,4764,3.5,0,...,3064,0,1990,0,98102,47.6,-122.3,regular,from $645000,3
21611,8477,2569500210,2014-11-17,339950.0,0,2.5,2290,8319,2.0,0,...,2290,0,1985,0,98042,47.3,-122.2,regular,$321950 to $450000,1


#### 4.1 Getting Geographical Info

In [23]:
#Create lat + long column
data['latlong'] = data[['lat', 'long']].apply( lambda x: str(x['lat']) + ',' + str(x['long']), axis=1)


In [None]:
#Create new attribute empty columns

data['neighbourhood'] = 'NA'
data['city'] = 'NA'
data['state'] = 'NA'

In [None]:
# %%writefile defs_newatt.py
# # THIS FUNCION WILL BE SAVED AS A .PY FILE NAMED 'defs'

# import time
# import geopy.geocoders
# from geopy.geocoders import Nominatim
# import certifi
# import ssl

# ctx = ssl.create_default_context()
# ctx.check_hostname = False
# ctx.verify_mode = ssl.CERT_NONE
# geopy.geocoders.options.default_ssl_context = ctx

# geolocator = Nominatim( user_agent='geopyExercises')

# def get_data(x):
#     index, row = x
#     time.sleep(10)
   
#     #API request
#     response = geolocator.reverse( row['latlong'] )
#     address = response.raw['address']

#     neighbourhood = address['neighbourhood'] if 'neighbourhood' in address else 'NA'
#     city = address['city'] if 'city' in address else 'NA'
#     state = address['state'] if 'state' in address else 'NA'

#     return neighbourhood, city, state

In [None]:
# import defs_newatt

In [None]:
# df = data[['id', 'latlong']].head(100)
# p = Pool(4)

# start = time.process_time()
# df[['neighbourhood', 'city', 'state']] = p.map( defs_newatt.get_data, df.iterrows())
# end = time.process_time()

# print('Time Elapsed: {}', end - start)

### 5. Descriptive Statistics and Metrics

In [36]:
attributes = ['price','bedrooms', 'sqft_living', 'sqft_lot', 'condition', 'grade', 'zipcode']

mean_by_region = data[attributes].groupby( 'zipcode' ).mean().reset_index()
median_by_region = data[attributes].groupby( 'zipcode' ).median().reset_index()
max_by_region = data[attributes].groupby( 'zipcode' ).max().reset_index()
min_by_region = data[attributes].groupby( 'zipcode' ).min().reset_index()
std_by_region = data[attributes].groupby( 'zipcode' ).std().reset_index()


In [37]:
median_by_region.columns

Index(['zipcode', 'price', 'bedrooms', 'sqft_living', 'sqft_lot', 'condition',
       'grade'],
      dtype='object')

In [38]:
median_by_region.columns = ['zipcode', 'median_price', 'median_bedrooms', 'median_sqft_living', 'median_sqft_lot', 'median_condition', 'median_grade']

In [39]:
median_by_region

Unnamed: 0,zipcode,median_price,median_bedrooms,median_sqft_living,median_sqft_lot,median_condition,median_grade
0,98001,260000.0,3.0,1822.0,9287.0,3.0,7.0
1,98002,235000.0,3.0,1570.0,7282.0,4.0,7.0
2,98003,267475.0,3.0,1780.0,8816.0,3.0,7.0
3,98004,1150000.0,4.0,2660.0,11119.0,3.0,9.0
4,98005,765475.0,4.0,2505.0,12220.0,4.0,8.0
...,...,...,...,...,...,...,...
65,98177,554000.0,3.0,2120.0,8210.0,3.0,8.0
66,98178,278277.0,3.0,1725.0,7200.0,3.0,7.0
67,98188,264000.0,3.0,1690.0,8913.0,3.0,7.0
68,98198,265000.0,3.0,1610.0,8589.5,3.0,7.0


In [42]:
oldest_listed = data['date'].min()
newest_listed = data['date'].max()
max_price = data['price'].max()
min_price = data['price'].min()
num_bedrooms = sorted(data['bedrooms'].unique())
median_pric =data['price'].median()


### 4.2 Opportunities Dataset

In [43]:
data = pd.merge( data, median_by_region, on='zipcode', how='inner')

### 6. Data Visualization

In [44]:
mapa = px.scatter_mapbox( data,
                          lat='lat',
                          lon='long',
                          color='price_cat',
                          size='price',
                          hover_name='id',
                          hover_data=['price'],
                          color_discrete_sequence=['darkgreen'],
                          #color_continuous_scale=px.colors.cyclical.IceFire,
                          zoom=9,
                          height=300)

mapa.update_layout( mapbox_style='open-street-map')
mapa.update_layout( height=600, margin={'r':0, 't':0,'l':0,'b':0 })
mapa.show()

In [45]:
opp = data[(data['price'] < data['median_price']) &
         (data['condition'] > data['median_condition']) &
         (data['sqft_lot'] > data['median_sqft_lot']) &
         (data['sqft_living'] > data['median_sqft_living']) ]

In [46]:
mapa = px.scatter_mapbox( opp,
                          lat='lat',
                          lon='long',
                          color='price_cat',
                          size='price',
                          hover_name='id',
                          hover_data=['price'],
                          color_discrete_sequence=['darkgreen'],
                          #color_continuous_scale=px.colors.cyclical.IceFire,
                          zoom=9,
                          height=300)

mapa.update_layout( mapbox_style='open-street-map')
mapa.update_layout( height=600, margin={'r':0, 't':0,'l':0,'b':0 })
mapa.show()