In [49]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [3]:
# Directory (relative to the notebook) used as the raw-data location.
dataset = Path('..') / 'data' / 'raw'
# Last expression of the cell: displays whether that directory exists.
dataset.exists()

True

In [5]:
# Load the California housing data, using `dataset` as the data home and
# allowing a download when the files are not already there.
data_bunch = fetch_california_housing(
    download_if_missing=True,
    data_home=dataset,
)

In [39]:
# Pull the feature names, feature data and target out of the loaded bunch.
columns = data_bunch.feature_names
X, y = data_bunch.data, data_bunch.target

In [40]:
# Print the description text that ships with the dataset bunch.
print(data_bunch.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [38]:
# Assemble one frame with the feature columns followed by a 'target' column.
# Built straight from `data_bunch` rather than the notebook-level `X`/`y`,
# because those names are rebound later in the notebook to objects derived
# from `df`; reading from the bunch keeps this cell independent of run order.
df = pd.DataFrame(
    data_bunch.data,
    columns=data_bunch.feature_names,
).assign(target=data_bunch.target)

In [42]:
# Display the column labels of the assembled frame.
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'target'],
      dtype='object')

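Note: Drop `Latitude` and `Longitude`, since they behave like nominal (location) categories rather than ordinary numeric features. A possible extension is to use kNN on the coordinates to group blocks into regions and use the region as a feature.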

In [44]:
# Remove the coordinate columns from the working frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second run is a no-op rather than a
# KeyError for columns that are already gone.
df = df.drop(columns=['Latitude', 'Longitude'], errors='ignore')

In [46]:
# Separate predictors from the response by column label rather than by
# position, so the split stays correct even if the column order of `df`
# changes ('target' no longer has to be the last column).
X = df.drop(columns=['target'])
y = df['target']

In [50]:
# Hold out 10% of the rows as the "predict" set; the fixed random_state
# keeps the split the same from run to run.
X_train, X_predict, y_train, y_predict = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

In [54]:
# Recombine the training features with their target values, matching rows
# on the shared index (inner join, same as an index-on-index merge).
df_train = X_train.join(y_train, how='inner')

In [56]:
# Recombine the held-out features with their target values, matching rows
# on the shared index (inner join, same as an index-on-index merge).
df_predict = X_predict.join(y_predict, how='inner')

In [57]:
# Persist both splits as pickles under ../data/processed.
# The directory is created first (no error if it already exists) so the
# writes do not fail with FileNotFoundError on a fresh checkout.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
df_train.to_pickle(processed_dir / "caliTrain_200605")
df_predict.to_pickle(processed_dir / "caliPredict_200605")