# Final Project

## Model Fitting on the Ames, Iowa Dataset

3. Use a cross-validated grid search to refine three of the above models.

In [1]:
run ../src/load_data.py

In [2]:
# import models
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# import ML tools
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from tqdm import tqdm 
from time import time

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

matplotlib.rcParams.update({'font.size': 14})
%matplotlib inline

### Split training and test sets and perform feature engineering on each set separately 

In [3]:
labels = housing_df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(housing_df, labels, random_state=42)
X_train = process_na_vals(X_train)
X_test = process_na_vals(X_test)
y_train = X_train['SalePrice']
X_train.drop(['SalePrice'], axis=1, inplace=True)
y_test = X_test['SalePrice']
X_test.drop(['SalePrice'], axis=1, inplace=True)

In [4]:
alpha = [0.1**i for i in range(1, 20)]
alpha.extend(range(1, 500, 20))
models_params = {
    'Ridge': {'alpha': alpha},
    'Lasso': {'alpha': alpha},
    'KNN': {'n_neighbors': range(1,10)},
    'DecisionTree' : {'max_depth': [1,5]},
    'LinearSVM' : {'C': range(1,5)},
}

In [5]:
def run_gridsearch(model, model_name, X_train, y_train, X_test, y_test):
    _ = model.fit(X_train, y_train)
    reg_params = models_params[model_name]
    model_gs = GridSearchCV(model, 
                      param_grid= reg_params,
                      cv=5,
                      return_train_score=True)
    model_gs.fit(X_train, y_train)
    return {
        'model_name' : model_name,
        'model_best_params' : model_gs.best_params_,
        'model_train_score' : model_gs.best_score_,
        'model_test_score' : model_gs.score(X_test, y_test)
    }


In [6]:
test_results = {}
models = {
    'Ridge' : Ridge(),
    'Lasso' : Lasso(),
    'KNN' : KNeighborsRegressor(),
    'DecisionTree' : DecisionTreeRegressor(),
    'LinearSVM' : SVR(kernel ='linear')
}
model_names = models.keys()
for name in tqdm(model_names):
    test_results[name] = run_gridsearch(models[name], name,
                                        X_train, y_train,
                                        X_test, y_test)

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [17:48<00:00, 213.76s/it]


In [7]:
test_results = pd.DataFrame(test_results).T
test_results

Unnamed: 0,model_best_params,model_name,model_test_score,model_train_score
DecisionTree,{'max_depth': 5},DecisionTree,0.793846,0.650096
KNN,{'n_neighbors': 5},KNN,0.702551,0.627463
Lasso,{'alpha': 141},Lasso,0.900211,0.736864
LinearSVM,{'C': 4},LinearSVM,0.804762,0.681199
Ridge,{'alpha': 0.00010000000000000002},Ridge,0.72533,0.762105
