In [73]:
import numpy as np
import pandas as pd
from pathlib import Path 
import pickle
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import expon

## Note: 
The data is split into two groups: `labeled` (rows whose target we know) and `predict` (our unknowns). The model is built from `labeled`, which supplies both the training and the testing data. `predict` is then used to demonstrate the deployed model.

In [9]:
# Locations of the two pre-processed, pickled frames.
labeled_path = Path("../data/processed/caliTrain_200605")
predict_path = Path("../data/processed/caliPredict_200605")

# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
labeled = pd.read_pickle(labeled_path)
predict = pd.read_pickle(predict_path)

In [10]:
# Hold out 10% of the labeled rows as a final test set.
# Features/target are selected by name rather than by position so the split
# does not silently break if the column order of `labeled` ever changes.
# `random_state` pins the shuffle so the same rows land in the test set on
# every Restart & Run All.
features = labeled.drop(columns="target")
target = labeled["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

In [71]:
# List the columns of `labeled` to check which features and target column it holds.
labeled.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'target'],
      dtype='object')

In [40]:
# Search space for the randomized search: `alpha` is drawn from an
# exponential distribution with scale 0.5.
alpha_distribution = expon(scale=0.5)
p_dist = dict(alpha=alpha_distribution)

In [41]:
# Base estimator: Ridge regression constructed with its default arguments.
# It is handed to RandomizedSearchCV, which searches over `alpha` via `p_dist`.
ridge = Ridge()

In [42]:
# Splitters for nested cross-validation:
#   inner_cv - used by the hyperparameter search to score candidate alphas
#   outer_cv - used to score the whole search procedure on unseen folds
# shuffle=True without a seed reshuffles on every run, so each splitter gets a
# fixed (and distinct) random_state to make the fold assignment reproducible.
inner_cv = KFold(n_splits=10, shuffle=True, random_state=0)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=1)

In [67]:
# Randomized search over Ridge's `alpha`: 50 draws from `p_dist`, each scored
# with the inner CV splitter. `random_state` fixes the sampled alphas so that
# `best_params_` is reproducible across kernel restarts.
estimator = RandomizedSearchCV(
    estimator=ridge,
    param_distributions=p_dist,
    n_iter=50,
    cv=inner_cv,
    refit=True,  # refit the best alpha on all of X_train so the search itself can predict/score
    random_state=42,
)

# Final model: tune and refit on the full training split.
estimator.fit(X_train, y_train)

# Nested CV: cross_val_score clones the search for every outer fold, so the
# tuning is repeated inside each outer training split and the outer scores
# are not biased by the hyperparameter selection.
nested_scored = cross_val_score(estimator, X=X_train, y=y_train, cv=outer_cv)

In [68]:
# Scores returned by cross_val_score over the outer CV splitter, one per outer fold.
nested_scored

array([0.57996331, 0.56558741, 0.53217985, 0.53046728, 0.51643387,
       0.51773179, 0.58027208, 0.54580938, 0.53433145, 0.45777442])

In [69]:
# Hyperparameters chosen by the randomized search that was fit on the training split.
estimator.best_params_

{'alpha': 2.434393348293477}

In [70]:
# Score the refit best model on the held-out test split.
# No `scoring` was passed to the search, so this uses the estimator's own
# default score (R^2 for Ridge).
estimator.score(X_test, y_test)

0.5385592401449586

In [74]:
# Persist the fitted search object (including its refit best estimator).
# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() previously leaked the handle.
# NOTE: only unpickle this file from a trusted location - pickle.load can
# execute arbitrary code.
with open('../models/ridge.pkl', 'wb') as model_file:
    pickle.dump(estimator, model_file)

In [75]:
# Show the feature values of the row at position 1 of the test split.
# NOTE(review): presumably used as a sample input for the saved model - confirm.
X_test.iloc[1,:]

MedInc          4.864600
HouseAge       29.000000
AveRooms        5.150485
AveBedrms       0.980583
Population    759.000000
AveOccup        3.684466
Name: 14767, dtype: float64