In [34]:
import pandas as pd

In [35]:
# Collect Data
# Read the raw house-sales CSV (path relative to the notebook's working
# directory) into the DataFrame that the following cells transform.
data = pd.read_csv(filepath_or_buffer='Datasets/kc_house_data.csv')

In [36]:
# Preprocessing cell: cleans `data` in place of the raw frame, derives metric
# areas, date parts, season and basement flags, builds per-zipcode medians,
# selects the "investment" subset and computes summary measures.

# Conversion factor applied to every square-foot column (sqft -> m2).
SQFT_TO_M2 = 0.092903

# Adjusting datetime format =============================================
# Keep only the calendar date part (Python `date` objects) of each entry.
data['date'] = pd.to_datetime(data['date']).dt.date

# Treating outliers =====================================================
# Sorting descending puts the row with the most bedrooms at position 0; that
# single value is then overwritten with 3. drop=True discards the old index
# instead of materialising an 'index' column that must be dropped again later.
data = data.sort_values('bedrooms', ascending=False).reset_index(drop=True)
data.loc[0, 'bedrooms'] = 3

# Convert areas to metric system ========================================
# NOTE(review): `area_attributes` is never read in this cell; it is kept only
# in case a cell outside this view references it.
area_attributes = []
data['m2_living'] = data['sqft_living'] * SQFT_TO_M2
data['m2_lot'] = data['sqft_lot'] * SQFT_TO_M2
data['m2_above'] = data['sqft_above'] * SQFT_TO_M2
data['m2_basement'] = data['sqft_basement'] * SQFT_TO_M2
# Price per square metre of *lot* area (identical to price / (sqft_lot * factor)).
# NOTE(review): confirm that lot area, not living area, is the intended divisor.
data['price_m2'] = data['price'] / data['m2_lot']

data = data.drop(['sqft_living', 'sqft_living15', 'sqft_lot', 'sqft_lot15',
                  'sqft_above', 'sqft_basement'], axis=1)

# Gathering datetime info ===============================================
# Parse the date column once and reuse it for every derived part.
sale_dates = pd.to_datetime(data['date'])
data['date_year'] = sale_dates.dt.year
data['date_month'] = sale_dates.dt.month
# `Series.dt.week` is deprecated (and removed in pandas 2.0); isocalendar()
# yields the same ISO week number. Cast to int64 to keep the previous dtype.
data['date_week'] = sale_dates.dt.isocalendar().week.astype('int64')

# Defining seasonality ==================================================
def season_of(month):
    """Return the season label for a calendar month number.

    Months 12, 1 and 2 map to 'winter', 3-5 to 'spring', 6-8 to 'summer' and
    every other value to 'Autumn' (capitalised; kept as-is because the label
    is a runtime value that appears in downstream results).
    """
    if month == 12 or month <= 2:
        return 'winter'
    if 3 <= month < 6:
        return 'spring'
    if 6 <= month <= 8:
        return 'summer'
    return 'Autumn'


data['season'] = data['date_month'].apply(season_of)

# Checking Basement =====================================================
# 1 when the basement area is non-zero, 0 otherwise.
data['basement'] = (data['m2_basement'] != 0).astype('int64')

# Create medians dataset ================================================
attributes = ['price', 'bedrooms', 'bathrooms', 'm2_living', 'm2_lot', 'floors',
              'view', 'condition', 'grade', 'm2_above', 'm2_basement', 'zipcode']

median_by_zipcode = data[attributes].groupby('zipcode').median().reset_index()
# After reset_index() the key column comes first, followed by the remaining
# attributes in their listed order; derive the labels from `attributes` so the
# two lists cannot drift apart.
median_by_zipcode.columns = ['zipcode'] + [
    'median_' + column for column in attributes if column != 'zipcode'
]

# Create investment dataset =============================================
comparative_dataset = pd.merge(data, median_by_zipcode, on='zipcode', how='inner')
# A listing qualifies when it is cheaper than its zipcode's median price while
# exceeding the zipcode medians for condition, lot area and living area.
is_opportunity = (
    (comparative_dataset['price'] < comparative_dataset['median_price'])
    & (comparative_dataset['condition'] > comparative_dataset['median_condition'])
    & (comparative_dataset['m2_lot'] > comparative_dataset['median_m2_lot'])
    & (comparative_dataset['m2_living'] > comparative_dataset['median_m2_living'])
)
# reset_index() (without drop) keeps the row's position in comparative_dataset
# as an 'index' column, exactly as before.
investment_dataset = comparative_dataset[is_opportunity].reset_index()

# Measures ==============================================================
yr_built_list = data['yr_built'].unique()
min_year_built = int(data['yr_built'].min())
max_year_built = int(data['yr_built'].max())
min_price = int(data['price'].min())
max_price = int(data['price'].max())
avg_price = int(data['price'].mean())
median_price = int(data['price'].median())
min_bedrooms = data['bedrooms'].min()
max_bedrooms = data['bedrooms'].max()
min_floors = data['floors'].min()
max_floors = data['floors'].max()
min_bathrooms = data['bathrooms'].min()
max_bathrooms = data['bathrooms'].max()


  data['date_week'] = pd.to_datetime(data['date']).dt.week


In [37]:
# Mean price for every (zipcode, season) pair, returned as a flat frame with
# the two grouping keys as ordinary columns.
season_price_view = data[['zipcode', 'season', 'price']]
grouped = (
    season_price_view
    .groupby(['zipcode', 'season'])
    .mean()
    .reset_index()
)
grouped

Unnamed: 0,zipcode,season,price
0,98001,Autumn,285427.047059
1,98001,spring,280089.639640
2,98001,summer,276872.587629
3,98001,winter,281788.521739
4,98002,Autumn,237751.777778
...,...,...,...
275,98198,winter,297087.000000
276,98199,Autumn,788473.716049
277,98199,spring,819779.677083
278,98199,summer,808544.839080


In [38]:
# For each zipcode, find the row label of its highest mean price, then pull
# those rows out of `grouped` (one row per zipcode).
peak_price_labels = grouped.groupby(['zipcode'])['price'].idxmax()
df = grouped.loc[peak_price_labels]
df

Unnamed: 0,zipcode,season,price
0,98001,Autumn,2.854270e+05
5,98002,spring,2.397367e+05
11,98003,winter,3.124446e+05
13,98004,spring,1.422263e+06
17,98005,spring,8.390754e+05
...,...,...,...
261,98177,spring,6.869634e+05
265,98178,spring,3.213618e+05
271,98188,winter,3.025694e+05
272,98198,Autumn,3.250861e+05


In [39]:
# Relabel the three columns by position: key, season label, mean price.
df = df.set_axis(['zipcode', 'seasonality', 'average_price'], axis='columns')

In [42]:
# Attach the per-zipcode columns of `df` to every listing in `data`.
# Columns named 'seasonality' / 'average_price' left over from an earlier run
# of this cell are dropped first; otherwise a re-run would merge onto the
# already-merged frame and produce '_x' / '_y' suffixed duplicates.
# validate='many_to_one' makes the merge fail loudly if `df` ever holds more
# than one row per zipcode, which would silently duplicate listings.
data = pd.merge(
    data.drop(columns=['seasonality', 'average_price'], errors='ignore'),
    df,
    on='zipcode',
    how='inner',
    validate='many_to_one',
)

In [45]:
# Show the zipcode, its attached seasonality label and the listing price.
data.loc[:, ['zipcode', 'seasonality', 'price']]

Unnamed: 0,zipcode,seasonality,price
0,98103,summer,640000.0
1,98103,summer,599999.0
2,98103,summer,550000.0
3,98103,summer,775000.0
4,98103,summer,760000.0
...,...,...,...
21608,98070,winter,340000.0
21609,98070,winter,285000.0
21610,98070,winter,369900.0
21611,98070,winter,650000.0
