### 0. Import Libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import plotly.express as px
import time
from multiprocessing import Pool
import seaborn as sea

### 1. Data Collection

In [2]:
# Collect Data

data = pd.read_csv('Datasets/kc_house_data.csv')

In [3]:
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# convert to metric system =====================
data = data.sort_values('bedrooms', ascending=False).reset_index()
data.loc[0, 'bedrooms'] = 3

data['m2_living'] = data['sqft_living'] * 0.092903
data['m2_lot'] = data['sqft_lot'] * 0.092903
data['price_m2'] = data['price']/(data['sqft_lot'] * 0.092903)

attributes = ['price', 'bedrooms', 'm2_living',
              'm2_lot', 'condition', 'grade', 'zipcode']


median_by_region = data[attributes].groupby('zipcode').median().reset_index()
median_by_region.columns = ['zipcode', 'median_price', 'median_bedrooms', 'median_m2_living',
                            'median_m2_lot', 'median_condition', 'median_grade']

data = pd.merge(data, median_by_region, on='zipcode', how='inner')

data = data.drop(['sqft_living', 'sqft_living15',
                 'sqft_lot', 'sqft_lot15', 'index'], axis=1)


# treatig outliers ==============================

data

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,view,condition,grade,...,long,m2_living,m2_lot,price_m2,median_price,median_bedrooms,median_m2_living,median_m2_lot,median_condition,median_grade
0,2402100895,20140625T000000,640000.0,3,1.75,1.0,0,0,5,7,...,-122.331,150.502860,557.418000,1148.150939,550000.0,3.0,139.819015,325.16050,3.0,7.0
1,1997200215,20140507T000000,599999.0,9,4.50,2.5,0,0,3,7,...,-122.338,355.818490,649.206164,924.204102,550000.0,3.0,139.819015,325.16050,3.0,7.0
2,263000324,20140513T000000,550000.0,7,4.00,2.0,0,0,3,7,...,-122.349,319.586320,752.514300,730.883121,550000.0,3.0,139.819015,325.16050,3.0,7.0
3,9550202010,20140710T000000,775000.0,6,2.75,1.5,0,0,3,7,...,-122.331,276.850940,464.515000,1668.406833,550000.0,3.0,139.819015,325.16050,3.0,7.0
4,993000100,20150410T000000,760000.0,6,3.75,2.0,0,0,4,8,...,-122.340,353.960430,571.353450,1330.174868,550000.0,3.0,139.819015,325.16050,3.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,222029026,20140917T000000,340000.0,2,0.75,1.0,1,2,5,6,...,-122.511,98.477180,4486.471676,75.783383,463750.0,3.0,174.193125,4099.80939,4.0,7.0
21609,2781600195,20141117T000000,285000.0,1,1.00,1.0,1,4,3,5,...,-122.445,98.477180,5095.357938,55.933264,463750.0,3.0,174.193125,4099.80939,4.0,7.0
21610,2123039032,20141027T000000,369900.0,1,0.75,1.0,1,4,5,5,...,-122.438,70.606280,936.369337,395.036430,463750.0,3.0,174.193125,4099.80939,4.0,7.0
21611,3023039231,20140714T000000,650000.0,1,1.00,1.5,0,0,3,6,...,-122.472,85.470760,8498.394828,76.485032,463750.0,3.0,174.193125,4099.80939,4.0,7.0


In [5]:
num_attributes = data.select_dtypes(include=['int64', 'float64'])
num_attributes = num_attributes.drop('id', axis=1)
mean_ = pd.DataFrame(num_attributes.apply(np.mean))
median_ = pd.DataFrame(num_attributes.apply(np.median))
std_ = pd.DataFrame(num_attributes.apply(np.std))

max_ = pd.DataFrame(num_attributes.apply(np.max))
min_ = pd.DataFrame(num_attributes.apply(np.min))

descriptive_stats = pd.concat(
    [max_, min_, mean_, median_, std_], axis=1).reset_index()


In [6]:
descriptive_stats.columns = ['Attributes', 'Max',
                             'Min', 'Mean', 'Median', 'Std. Deviation']

In [7]:
descriptive_stats

Unnamed: 0,Attributes,Max,Min,Mean,Median,Std. Deviation
0,price,7700000.0,75000.0,540088.141767,450000.0,367118.703181
1,bedrooms,11.0,0.0,3.369454,3.0,0.907943
2,bathrooms,8.0,0.0,2.114757,2.25,0.770145
3,floors,3.5,1.0,1.494309,1.5,0.539976
4,waterfront,1.0,0.0,0.007542,0.0,0.086515
5,view,4.0,0.0,0.234303,0.0,0.7663
6,condition,5.0,1.0,3.40943,3.0,0.650728
7,grade,13.0,1.0,7.656873,7.0,1.175432
8,sqft_above,9410.0,290.0,1788.390691,1560.0,828.07182
9,sqft_basement,4820.0,0.0,291.509045,0.0,442.564804


In [8]:
descriptive_stats = descriptive_stats.iloc[:18]
descriptive_stats.reset_index()


Unnamed: 0,index,Attributes,Max,Min,Mean,Median,Std. Deviation
0,0,price,7700000.0,75000.0,540088.141767,450000.0,367118.703181
1,1,bedrooms,11.0,0.0,3.369454,3.0,0.907943
2,2,bathrooms,8.0,0.0,2.114757,2.25,0.770145
3,3,floors,3.5,1.0,1.494309,1.5,0.539976
4,4,waterfront,1.0,0.0,0.007542,0.0,0.086515
5,5,view,4.0,0.0,0.234303,0.0,0.7663
6,6,condition,5.0,1.0,3.40943,3.0,0.650728
7,7,grade,13.0,1.0,7.656873,7.0,1.175432
8,8,sqft_above,9410.0,290.0,1788.390691,1560.0,828.07182
9,9,sqft_basement,4820.0,0.0,291.509045,0.0,442.564804


In [11]:
descriptive_stats

Unnamed: 0,Attributes,Max,Min,Mean,Median,Std. Deviation
0,price,7700000.0,75000.0,540088.141767,450000.0,367118.703181
1,bedrooms,11.0,0.0,3.369454,3.0,0.907943
2,bathrooms,8.0,0.0,2.114757,2.25,0.770145
3,floors,3.5,1.0,1.494309,1.5,0.539976
4,waterfront,1.0,0.0,0.007542,0.0,0.086515
5,view,4.0,0.0,0.234303,0.0,0.7663
6,condition,5.0,1.0,3.40943,3.0,0.650728
7,grade,13.0,1.0,7.656873,7.0,1.175432
8,sqft_above,9410.0,290.0,1788.390691,1560.0,828.07182
9,sqft_basement,4820.0,0.0,291.509045,0.0,442.564804


In [12]:
descriptive_stats.drop([13,14], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  descriptive_stats.drop([13,14], inplace=True)


In [18]:
descriptive_stats.reset_index(inplace=True)

In [21]:
descriptive_stats = descriptive_stats.drop(['index'], axis=1)

In [22]:
descriptive_stats

Unnamed: 0,Attributes,Max,Min,Mean,Median,Std. Deviation
0,price,7700000.0,75000.0,540088.141767,450000.0,367118.703181
1,bedrooms,11.0,0.0,3.369454,3.0,0.907943
2,bathrooms,8.0,0.0,2.114757,2.25,0.770145
3,floors,3.5,1.0,1.494309,1.5,0.539976
4,waterfront,1.0,0.0,0.007542,0.0,0.086515
5,view,4.0,0.0,0.234303,0.0,0.7663
6,condition,5.0,1.0,3.40943,3.0,0.650728
7,grade,13.0,1.0,7.656873,7.0,1.175432
8,sqft_above,9410.0,290.0,1788.390691,1560.0,828.07182
9,sqft_basement,4820.0,0.0,291.509045,0.0,442.564804


In [None]:
f_attributes = ['id', 'price', 'bedrooms', 'bathrooms', 'm2_living', 'm2_lot', 'zipcode']

In [None]:
df = data[f_attributes]

In [None]:
df

In [None]:
df = df[float_columns in df[f_attributes]]

In [None]:
df = df.style.format(subset=float_columns in df.columns, formatter="{:.2f}")

In [None]:
df

### 2. Initial EDA

In [None]:
#data overview
data.head()

In [None]:
#list data columns
data.columns

In [None]:
#list data types
data.dtypes

In [None]:
# Supress scientific notation 
pd.set_option('display.float_format', lambda x: '%.1f' % x)

In [None]:
# Verifying duplicated IDs
data = data['id'].unique()
unique_ids = len(data)
len(data) - unique_ids

In [None]:
# Create dataset with unique IDs, keeping the most recent acordingly to 'date'

data_recent_only = data.sort_values('date', ascending=True).drop_duplicates(subset='id', keep='last')
data_recent_only.shape

In [None]:
# Descriptive Analysis
data.describe()

### 3. Data Cleaning

In [None]:
#Correcting date formats

data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d' )
data['yr_built'] = pd.to_datetime(data['yr_built'], format='%Y' )
data['yr_built'] = data['yr_built'].dt.year


In [None]:
# 33 Bedroom house outlier analysis

data = data[['id', 'bedrooms', 'price', 'sqft_living', 'sqft_above','zipcode', 'waterfront']].sort_values('bedrooms', ascending=False)
data

In [None]:
data[(df['zipcode'] == 98103) & (df['bedrooms'] == 3)]

In [None]:
# Houses in the same region, with no waterfront, bigger living area, fewer bedrooms are costing more than the refered house.
# By analysing houses within the same caracteristics and 3 bedroom, seems reasonable that 33 was a typo that should be only 3.

In [None]:
data = data.sort_values('bedrooms', ascending=False).reset_index()

In [None]:
data.loc[0, 'bedrooms'] = 3

### 4. Data Transformation

In [None]:
# Create Condition status based on condition level
    # if 'condition' <=2, ‘bad’
    # if 'condition' = 3 or 4, ‘regular’
    # if 'condition' =>5, ‘good’

data['condition_type'] = data['condition'].apply( lambda x: 'bad' if x <= 2 else 'regular' if x <= 4 else 'good')

In [None]:
# Drop unescessary columns

data = data.drop(['sqft_living15' , 'sqft_lot15'], axis=1)

In [None]:
# Defining Price Range and Price Range lvl

data['price_range'] = data['price'].apply( lambda x: 'up to $321950' if x < 321950 else
                                                     '$321950 to $450000' if x < 450000 else
                                                     '$450000 to $645000' if x < 645000 else
                                                     'from $645000')

data['price_cat'] = data['price'].apply( lambda x: 0 if x < 321950 else
                                                     1 if x < 450000 else
                                                     2 if x < 645000 else
                                                     3)

In [None]:
data

#### 4.1 Getting Geographical Info

In [None]:
#Create lat + long column
data['latlong'] = data[['lat', 'long']].apply( lambda x: str(x['lat']) + ',' + str(x['long']), axis=1)


In [None]:
#Create new attribute empty columns

data['neighbourhood'] = 'NA'
data['city'] = 'NA'
data['state'] = 'NA'

In [None]:
# %%writefile defs_newatt.py
# # THIS FUNCION WILL BE SAVED AS A .PY FILE NAMED 'defs'

# import time
# import geopy.geocoders
# from geopy.geocoders import Nominatim
# import certifi
# import ssl

# ctx = ssl.create_default_context()
# ctx.check_hostname = False
# ctx.verify_mode = ssl.CERT_NONE
# geopy.geocoders.options.default_ssl_context = ctx

# geolocator = Nominatim( user_agent='geopyExercises')

# def get_data(x):
#     index, row = x
#     time.sleep(10)
   
#     #API request
#     response = geolocator.reverse( row['latlong'] )
#     address = response.raw['address']

#     neighbourhood = address['neighbourhood'] if 'neighbourhood' in address else 'NA'
#     city = address['city'] if 'city' in address else 'NA'
#     state = address['state'] if 'state' in address else 'NA'

#     return neighbourhood, city, state

In [None]:
# import defs_newatt

In [None]:
# df = data[['id', 'latlong']].head(100)
# p = Pool(4)

# start = time.process_time()
# df[['neighbourhood', 'city', 'state']] = p.map( defs_newatt.get_data, df.iterrows())
# end = time.process_time()

# print('Time Elapsed: {}', end - start)

### 5. Descriptive Statistics and Metrics

In [None]:
attributes = ['price','bedrooms', 'sqft_living', 'sqft_lot', 'condition', 'grade', 'zipcode']

mean_by_region = data[attributes].groupby( 'zipcode' ).mean().reset_index()
median_by_region = data[attributes].groupby( 'zipcode' ).median().reset_index()
max_by_region = data[attributes].groupby( 'zipcode' ).max().reset_index()
min_by_region = data[attributes].groupby( 'zipcode' ).min().reset_index()
std_by_region = data[attributes].groupby( 'zipcode' ).std().reset_index()


In [None]:
median_by_region.columns

In [None]:
median_by_region.columns = ['zipcode', 'median_price', 'median_bedrooms', 'median_sqft_living', 'median_sqft_lot', 'median_condition', 'median_grade']

In [None]:
median_by_region

In [None]:
oldest_listed = data['date'].min()
newest_listed = data['date'].max()
max_price = data['price'].max()
min_price = data['price'].min()
num_bedrooms = sorted(data['bedrooms'].unique())
median_pric =data['price'].median()


### 4.2 Opportunities Dataset

In [None]:
data = pd.merge( data, median_by_region, on='zipcode', how='inner')

### 6. Data Visualization

In [None]:
mapa = px.scatter_mapbox( data,
                          lat='lat',
                          lon='long',
                          color='price_cat',
                          size='price',
                          hover_name='id',
                          hover_data=['price'],
                          color_discrete_sequence=['darkgreen'],
                          #color_continuous_scale=px.colors.cyclical.IceFire,
                          zoom=9,
                          height=300)

mapa.update_layout( mapbox_style='open-street-map')
mapa.update_layout( height=600, margin={'r':0, 't':0,'l':0,'b':0 })
mapa.show()

In [None]:
opp = data[(data['price'] < data['median_price']) &
         (data['condition'] > data['median_condition']) &
         (data['sqft_lot'] > data['median_sqft_lot']) &
         (data['sqft_living'] > data['median_sqft_living']) ]

In [None]:
mapa = px.scatter_mapbox( opp,
                          lat='lat',
                          lon='long',
                          color='price_cat',
                          size='price',
                          hover_name='id',
                          hover_data=['price'],
                          color_discrete_sequence=['darkgreen'],
                          #color_continuous_scale=px.colors.cyclical.IceFire,
                          zoom=9,
                          height=300)

mapa.update_layout( mapbox_style='open-street-map')
mapa.update_layout( height=600, margin={'r':0, 't':0,'l':0,'b':0 })
mapa.show()

In [None]:
dataframe = data[['bedrooms','id']].groupby('bedrooms').count().reset_index()
dataframe

In [None]:
rows = dataframe.shape[0]
rows

In [None]:
teste = dataframe[dataframe['bedrooms'] <= 5]
teste

In [None]:
fig = px.bar( dataframe.head(11), x='bedrooms', y='id', text_auto=True, color='bedrooms', color_continuous_scale=px.colors.sequential.YlOrRd )
fig.update_layout(bargap=0.2)
fig

In [None]:
sea.countplot(data=data, x='bedrooms')