In [131]:
import sys
sys.path.append("../")

%load_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', None)

# TODO: be cautious on submission and test py file itself


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [124]:
from IMLearn.utils import split_train_test
from IMLearn.learners.regressors import LinearRegression

from typing import NoReturn
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "simple_white"


def load_data(filename: str):
    """
    Load house prices dataset and preprocess data.
    Parameters
    ----------
    filename: str
        Path to house prices dataset

    Returns
    -------
    Design matrix and response vector (prices) - either as a single
    DataFrame or a Tuple[DataFrame, Series]
    """
    
    
    """
    preprocessing:
    id: probably irrelevant and uncorrelated with the price. dropped
    date: as the data spans over 1 year, prices probably does not differ by much in this time range. 
    Therefore, this feature is probably less relevant. To reduce noise, I've decided to drop it (even though it's also ok to leave it).
    waterfront: only 0.77% has waterfront=True. which means there may not be enough data to learn from this feature. I currently use it, however, It may not help by much.
    zipcode: nearly all samples start with the same prefix '98', which corresponds to kings county area. however, they do differ in the third digit, which indicates region.
    This data may be meaningful and less noisy. so I create 1-hot-encoding column for each of the third possible digits (and drop one of them to avoid the dummy variable trap)
    I also remove from train any sample which zipcode does not start with '98' (possibly malformed).
    TODO: when do we remove from train?
    long, lat: decided not to use it. pretty similar to zipcode. does not add any linear information in its original format. 
    we can do the same as we did in zipcode, but it probably won't help much as we already use zipcode.
    
    two additional columns are 'sqft_living_ratio' and 'sqft_lot_ratio' - these are the ratios between the house's lot/living size and the avg of the 15 closest houses.
    these may reveal new patters. e.g., if a house is bigger than its surrounding houses, it may be relfected in its price.
    
    """
    df = pd.read_csv(filename, parse_dates=True)
    prices = df['price']
    df['sqft_living_ratio'] = df['sqft_living'] / df['sqft_living15']
    df['sqft_lot_ratio'] = df['sqft_lot'] / df['sqft_lot15']    
    regions_df = pd.get_dummies(df['zipcode'].apply(lambda zipcode: -1 if len(str(zipcode)) < 3 else str(zipcode)[2]), prefix='region', drop_first=True)
    df = pd.concat((df, regions_df), axis=1)
    df = df.drop(columns=['id', 'date', 'zipcode', 'lat', 'long', 'price'])
    return df, prices




In [125]:

def feature_evaluation(X: pd.DataFrame, y: pd.Series, output_path: str = ".") -> NoReturn:
    """
    Create scatter plot between each feature and the response.
        - Plot title specifies feature name
        - Plot title specifies Pearson Correlation between feature and response
        - Plot saved under given folder with file name including feature name
    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_features)
        Design matrix of regression problem

    y : array-like of shape (n_samples, )
        Response vector to evaluate against

    output_path: str (default ".")
        Path to folder in which plots are saved
    """
    raise NotImplementedError()


In [128]:


if __name__ == '__main__':
    np.random.seed(0)
    # Question 1 - Load and preprocessing of housing prices dataset
    df, responses = load_data("/Users/benlahav/Desktop/iml/IML.HUJI/datasets/house_prices.csv") # TODO: legit to sumbit with this?


    # Question 2 - Feature evaluation with respect to response

#     # Question 3 - Split samples into training- and testing sets.
#     raise NotImplementedError()

#     # Question 4 - Fit model over increasing percentages of the overall training data
#     # For every percentage p in 10%, 11%, ..., 100%, repeat the following 10 times:
#     #   1) Sample p% of the overall training data
#     #   2) Fit linear model (including intercept) over sampled set
#     #   3) Test fitted model over test set
#     #   4) Store average and variance of loss over test set
#     # Then plot average loss as function of training size with error ribbon of size (mean-2*std, mean+2*std)
#     raise NotImplementedError()




In [129]:


# np.unique([d[:4] for d in df.date.values if isinstance(d,str)])
# df["view"].value_counts()
# np.unique([str(z)[0:2] for z in df["lat"]], return_counts=True)
# df
# yr_built:
# yr_renovated: change to renovated in last 20y = True or something
# ZIP Codes are numbered with the first digit representing a certain group of U.S. states, the second and third digits together representing a region in that group (or perhaps a large city) and the fourth and fifth digits representing a group of delivery addresses within that region
# sqft_living15? sqft_lot15?
# change years to start counting from 1?
# TODO: fill nans with avg?

"""
https://www.kaggle.com/datasets/harlfoxem/housesalesprediction/discussion/23194
sqft_living, the total house square footage of the house
sqft_basement, size of the basement
sqft_above = sqft_living - sqft_basement
sqft_lot, lot size of the house
sqft_living15, the average house square footage of the 15 closest houses
sqft_lot15, the average lot square footage of the 15 closest houses
"""

df


Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15,sqft_living_ratio,sqft_lot_ratio,region_1,region_n
0,3,1.00,1180,5650,1.0,0,0.0,3.0,7.0,1180.0,0.0,1955.0,0.0,1340.0,5650.0,0.880597,1.000000,1,0
1,3,2.25,2570,7242,2.0,0,0.0,3.0,7.0,2170.0,400.0,1951.0,1991.0,1690.0,7639.0,1.520710,0.948030,1,0
2,2,1.00,770,10000,1.0,0,0.0,3.0,6.0,770.0,0.0,1933.0,0.0,2720.0,8062.0,0.283088,1.240387,0,0
3,4,3.00,1960,5000,1.0,0,0.0,5.0,7.0,1050.0,910.0,1965.0,0.0,1360.0,5000.0,1.441176,1.000000,1,0
4,3,2.00,1680,8080,1.0,0,0.0,3.0,8.0,1680.0,0.0,1987.0,0.0,1800.0,7503.0,0.933333,1.076903,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21611,3,2.50,1530,1131,3.0,0,0.0,3.0,8.0,1530.0,0.0,2009.0,0.0,1530.0,1509.0,1.000000,0.749503,1,0
21612,4,2.50,2310,5813,2.0,0,0.0,3.0,8.0,2310.0,0.0,2014.0,0.0,1830.0,7200.0,1.262295,0.807361,1,0
21613,2,0.75,1020,1350,2.0,0,0.0,3.0,7.0,1020.0,0.0,2009.0,0.0,1020.0,2007.0,1.000000,0.672646,1,0
21614,3,2.50,1600,2388,2.0,0,0.0,3.0,8.0,1600.0,0.0,2004.0,0.0,1410.0,1287.0,1.134752,1.855478,0,0
