In [6]:
# import libraries for hypotheses testing and visualization and read cleaned csv-file to dataframe

import folium
import pandas as pd
from matplotlib import colormaps, colors

# Load the cleaned dataset into the frame that every later cell works on.
# The cells below read the columns 'price', 'dist_center', 'condition',
# 'yr_built' and 'season' (plus 'lat'/'long' in the disabled map cell).
# NOTE(review): the path is relative to the working directory — confirm
# 'eda_clean.csv' sits next to the notebook.
df = pd.read_csv('eda_clean.csv')

#### Visualizing house prices and city center on map of Seattle

In [7]:
# # normalize price column for colormapping
# norm = colors.Normalize(vmin=df["price"].quantile(q=0.25), vmax=df["price"].quantile(q=0.75))
# # choose colormap from matplotlib colormaps
# cmap = colormaps["coolwarm"]

# # create map using folium with coordinates of Seattle city center
# city_center = [47.6050242, -122.3343709]
# m = folium.Map(location=city_center, zoom_start=12)

# # adding circles at house locations with color depending on house price
# for _, row in df.iterrows():

#     # assigning color from colormap matching with price
#     color = colors.to_hex(cmap(norm(row['price'])))
    
#     # add circle
#     folium.CircleMarker(
#         location=[row['lat'], row['long']],                                                     # location
#         radius=2,                                                                               # size of circle
#         popup=f"Price: $ {row['price']:,}\nDistance Center: {round(row['dist_center'], 2)}",    # popup message including price and distance
#         color=color,                                                                            # assigned circle color
#         fill=True,                                                                              # filled
#         fill_opacity=0.3                                                                        # with certain filling opacity
#     ).add_to(m)

# # add circle for city center
# folium.CircleMarker(
#     location=city_center,
#     radius=15,
#     popup="City Center",
#     color="black",
#     fill=True,
#     fill_opacity=0.6
# ).add_to(m)

# # showing map
# m

# Hypothesis 1
#### Houses sold below median price are located further away from the city center.

In [8]:
# Hypothesis 1: split all houses at the overall median sale price and compare
# how far each half typically lies from the city center.
price_median = df['price'].median()

# houses sold strictly below the median price
below_median_mask = df['price'] < price_median
low_price_houses = df[below_median_mask]

# houses sold at or above the median price
at_or_above_median_mask = df['price'] >= price_median
high_price_houses = df[at_or_above_median_mask]

# median distance to the city center for each half
median_dist_low = low_price_houses['dist_center'].median()
median_dist_high = high_price_houses['dist_center'].median()

print(
    f"Median distance to city center for houses sold below median price: {round(median_dist_low, 3)} km",
    f"Median distance to city center for houses sold above or equal to median price: {round(median_dist_high, 3)} km",
    sep="\n",
)

Median distance to city center for houses sold below median price: 19.463 km
Median distance to city center for houses sold above or equal to median price: 14.003 km


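-> Hypothesis 1 is supported: houses sold below the median price have a larger median distance to the city center (19.463 km) than houses sold at or above the median price (14.003 km).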

# Hypothesis 2
#### Houses that were built before 1990 and that are in poor condition have a median price that is 10% lower than that of other houses.

In [9]:
# Hypothesis 2: houses that are both old and in poor condition versus all others.
poor_mask = df['condition'] < 3      # condition below 3 is treated as "poor"
old_mask = df['yr_built'] < 1990     # built before 1990 is treated as "old"

# group 1: both criteria hold
old_poor_condition_houses = df[poor_mask & old_mask]
# group 2: at least one of the two criteria does not hold
other_houses = df[(df['yr_built'] >= 1990) | (df['condition'] >= 3)]

# median sale price of each group
old_poor_median = old_poor_condition_houses['price'].median()
other_median = other_houses['price'].median()
# share of the other houses' median price reached by the old/poor group
ratio = old_poor_median / other_median

print(
    f"Median price of houses in poor condition and built before 1990: {old_poor_median}$",
    f"Median price of other houses: {other_median}$",
    f"Median price of old houses is {round((1 - ratio) * 100, 2)}% lower than median price of other houses.",
    "",
    sep="\n",
)

# Disabled alternative: the same comparison using the condition criterion only
# (year built ignored). Kept for reference; not executed.
# old_houses = df[df['condition'] < 3]
# other_houses = df[df['condition'] >= 3]
# old_median = old_houses['price'].median()
# other_median = other_houses['price'].median()
# diff = old_median / other_median
# print(old_median)
# print(other_median)
# print(diff)
# print()

# sanity check: how many houses does each single criterion select, and how many both?
poor_condition_houses = df[poor_mask]
old_houses = df[old_mask]
print(f"Number of houses in poor condition: {len(poor_condition_houses)}")
print(f"Number of houses built before 1990: {len(old_houses)}")
print(f"Number of houses for both filtering conditions: {len(old_poor_condition_houses)}")

Median price of houses in poor condition and built before 1990: 279000.0$
Median price of other houses: 451000.0$
Median price of old houses is 38.14% lower than median price of other houses.

Number of houses in poor condition: 199
Number of houses built before 1990: 14609
Number of houses for both filtering conditions: 196


-> Hypothesis 2 is false: the median price of old houses in poor condition is 38.14% lower than that of other houses, not 10%.

Note: As almost all houses in poor condition were built before 1990 (only 3 of the 199 houses in poor condition were built in 1990 or later), the values for all houses in poor condition are identical to the values shown above.

# Hypothesis 3
#### Houses sold in winter have a median price that is 10% lower than houses sold during other seasons.

In [10]:
# Hypothesis 3: compare the median sale price of houses sold in winter with the
# median sale price of houses sold in any other season.

# houses sold in winter
winter_houses = df[df['season'] == 'winter']
# houses sold in any other season
other_houses = df[df['season'] != 'winter']

# median price of both groups
winter_median = winter_houses['price'].median()
other_median = other_houses['price'].median()
# share of the other seasons' median price reached by the winter median
ratio = winter_median / other_median

print(f"Median price of houses sold in winter: {winter_median}$")
print(f"Median price of houses sold during other seasons: {other_median}$")
print(f"Median price of houses sold in winter is {round((1 - ratio) * 100, 2)}% lower than median price of other houses.\n\n\n")

# Repeat the comparison for every season to check whether winter stands out.
# The complement's median gets its own name (`rest_median`) so the loop does not
# overwrite `other_median` from the winter comparison above.
for season in ['spring', 'summer', 'fall', 'winter']:
    season_median = df[df['season'] == season]['price'].median()
    rest_median = df[df['season'] != season]['price'].median()
    print(f"Median price for {season}: {season_median}")
    print(f"Median price for other seasons: {rest_median}")
    # The printed value is season median / other-seasons median in percent,
    # so the label names the season first (the previous label had it reversed).
    print(f"Ratio {season} to other seasons: {round((season_median / rest_median) * 100, 2)}\n")

Median price of houses sold in winter: 430000.0$
Median price of houses sold during other seasons: 455000.0$
Median price of houses sold in winter is 5.49% lower than median price of other houses.



Median price for spring: 465000.0
Median price for other seasons: 446000.0
Ratio other seasons to spring: 104.26

Median price for summer: 455000.0
Median price for other seasons: 450000.0
Ratio other seasons to summer: 101.11

Median price for fall: 443725.0
Median price for other seasons: 452500.0
Ratio other seasons to fall: 98.06

Median price for winter: 430000.0
Median price for other seasons: 455000.0
Ratio other seasons to winter: 94.51



-> Hypothesis 3 is false: the median price of houses sold in winter is 5.49% lower than that of houses sold in other seasons, not 10%.

Note: The other seasons were tested in the same way. Of all four seasons, winter shows the largest difference in median price compared to houses sold in the remaining seasons.