# Ames Housing Data

```
curl -O http://jse.amstat.org/v19n3/decock/DataDocumentation.txt
curl -o AmesHousing.tsv http://jse.amstat.org/v19n3/decock/AmesHousing.txt
```

http://jse.amstat.org/v19n3/decock.pdf
    
> This paper presents a data set describing the sale of individual residential property in Ames, Iowa
from 2006 to 2010. The data set contains 2930 observations and a large number of explanatory
variables (23 nominal, 23 ordinal, 14 discrete, and 20 continuous) involved in assessing home
values.


> Extensive EDA (ranges of values, missing values, analysis of target variable, feature importance analysis)
For images: analyzing the content of the images. For texts: frequent words, word clouds, etc

In [1]:
!pip install blackcellmagic





In [2]:
%load_ext blackcellmagic

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

random_state = 42

df = pd.read_table(
    "http://jse.amstat.org/v19n3/decock/AmesHousing.txt",
    index_col="Order",
)

df.columns = df.columns.str.lower().str.replace(" ", "_")

df["central_air"] = df["central_air"] == "Y"
df["paved_drive"] = df["paved_drive"] == "Y"

# # Index(['pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area', 'street',
#        'alley', 'lot_shape', 'land_contour', 'utilities', 'lot_config',
#        'land_slope', 'neighborhood', 'condition_1', 'condition_2', 'bldg_type',
#        'house_style', 'overall_qual', 'overall_cond', 'year_built',
#        'year_remod/add', 'roof_style', 'roof_matl', 'exterior_1st',
#        'exterior_2nd', 'mas_vnr_type', 'mas_vnr_area', 'exter_qual',
#        'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure',
#        'bsmtfin_type_1', 'bsmtfin_sf_1', 'bsmtfin_type_2', 'bsmtfin_sf_2',
#        'bsmt_unf_sf', 'total_bsmt_sf', 'heating', 'heating_qc', 'central_air',
#        'electrical', '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf',
#        'gr_liv_area', 'bsmt_full_bath', 'bsmt_half_bath', 'full_bath',
#        'half_bath', 'bedroom_abvgr', 'kitchen_abvgr', 'kitchen_qual',
#        'totrms_abvgrd', 'functional', 'fireplaces', 'fireplace_qu',
#        'garage_type', 'garage_yr_blt', 'garage_finish', 'garage_cars',
#        'garage_area', 'garage_qual', 'garage_cond', 'paved_drive',
#        'wood_deck_sf', 'open_porch_sf', 'enclosed_porch', '3ssn_porch',
#        'screen_porch', 'pool_area', 'pool_qc', 'fence', 'misc_feature',
#        'misc_val', 'mo_sold', 'yr_sold', 'sale_type', 'sale_condition',
#        'saleprice'],
bmst_columns = [
    "bsmtfin_sf_1",
    "bsmtfin_sf_2",
    "bsmt_unf_sf",
    "total_bsmt_sf",
    "bsmt_full_bath",
    "bsmt_half_bath",
]

mask = df["bsmt_qual"] == "NA"
#df.loc[mask, bmst_columns] = df.loc[mask, bmst_columns].fillna(0)
#df.loc[mask, 'c'] = df.loc[mask, 'c'].fillna(mean)

df[mask]

#df["bsmt_qual" == "NA"]
# na_columns = [
#     "alley",
#     "bsmt_qual",
#     "bsmt_cond",
#     "bsmt_exposure",
#     "bsmtfin_type_1",
#     "bsmtfin_type_2",
#     "fireplace_qu",
#     "garage_type",
#     "garage_finish",
#     "garage_qual",
#     "garage_cond",
#     "pool_qc",
#     "fence",
#     "misc_feature",
# ]
# df[na_columns] = df[na_columns].fillna("NA")


# Mas Vnr Type -> None if nan
# mas vnr area -> 0.0 if None

# TODO if bsmt_qual is "NA" then set these to 0 -- also could be Slab and PConc?
# bsmt_sf_1, bsmt_sf_2, bsmt_unf_sf, total_bsmt_sf, bsmt_full_bath, bsmt_half_bath
# df[df.bsmtfin_sf_1.isna()]

# Drop lot frontage and garage yr blt columns
# Alt, set garage_yr_blt to NA for those entries and perhaps 0.0 for lot_frontage?
# TODO keep these in the data set. seems like xgboost might handle these well and perhaps need to impute for lasso and others
# see https://towardsdatascience.com/xgboost-is-not-black-magic-56ca013144b4

# df = df.dropna()
# df[["lot_frontage","garage_yr_blt"]] = df[["lot_frontage","garage_yr_blt"]].astype(int)

# df_full_train, df_test = train_test_split(
#     df,
#     test_size=0.2,
#     random_state=random_state,
# )
# df_train, df_val = train_test_split(
#     df_full_train, test_size=0.25, random_state=random_state
# )

# df_full_train = df_full_train.reset_index(drop=True)
# df_train = df_train.reset_index(drop=True)
# df_val = df_val.reset_index(drop=True)
# df_test = df_test.reset_index(drop=True)

Unnamed: 0_level_0,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,saleprice
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
