In [None]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi

import sklearn as sk
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.base 
import sklearn.decomposition


sys.path.insert(1, os.path.join(sys.path[0], '..'))
import samlib

# Load numerical data
The dataset loaded below was generated by the notebook `data_exploration_numerical_features.ipynb`.

In [None]:
# Index the frame by (Dataset, Id): later cells select the 'train' and 'test'
# partitions with .loc on the first index level.
dfnum_t2 = pd.read_csv('transformed_numerical_dataset_imputed.csv', index_col=['Dataset','Id'])

In [None]:
# First rows of the loaded frame.
dfnum_t2.head()

In [None]:
# Last rows of the loaded frame.
dfnum_t2.tail()

# Explore collinearity

In [None]:
import patsy


In [None]:
def get_pca_dmatrices(dfnum_t2, ncols):
    """Build PCA-based design matrices from the numerical dataset.

    Every column except 'SalePrice' is projected onto ``ncols`` principal
    components. The PCA is fitted on all rows of ``dfnum_t2`` (the split into
    'train' and 'test' happens only afterwards), and the component scores are
    passed through patsy to obtain a design-matrix dataframe.

    Parameters
    ----------
    dfnum_t2 : pandas.DataFrame
        Frame with a 'SalePrice' column whose first index level contains the
        labels 'train' and 'test'.
    ncols : int
        Number of principal components to keep.

    Returns
    -------
    tuple
        ``(ytrain, Xtrain, Xtest)``: the 'SalePrice' values of the 'train'
        rows, and the design matrices of the 'train' and 'test' rows.
    """
    features = dfnum_t2.drop('SalePrice', axis=1).copy()

    # Fit and project with the same data; keep the two steps explicit.
    reducer = sklearn.decomposition.PCA(ncols)
    scores = reducer.fit(features).transform(features)

    component_names = ['PCA{}'.format(i) for i in range(ncols)]
    components = pd.DataFrame(data=scores,
                              columns=component_names,
                              index=dfnum_t2.index)

    # Additive formula over all components: "PCA0 + PCA1 + ..."
    formula = ' + '.join(component_names)
    design = patsy.dmatrix(formula, components, return_type='dataframe')

    target_train = dfnum_t2['SalePrice'].loc['train']
    return target_train, design.loc['train'], design.loc['test']


In [None]:
# Try a 10-component decomposition; the test design matrix is not needed here.
ytrain, Xtrain, _ = get_pca_dmatrices(dfnum_t2, 10)

In [None]:
# Shape of the training design matrix (rows, columns).
Xtrain.shape

In [None]:
# Shape of the training target; its length should match the row count of Xtrain.
ytrain.shape

## Cross validation

In [None]:
def ols(X, y):
    """Return an unfitted statsmodels OLS model of ``y`` regressed on ``X``.

    The arguments are taken as ``(X, y)``, the order used by the callers in
    this notebook, and swapped for statsmodels, whose ``OLS`` constructor
    takes the response first.

    Parameters
    ----------
    X : design matrix (explanatory variables).
    y : response values.

    Returns
    -------
    The model object; call ``.fit()`` on it to estimate the coefficients.
    """
    # Import the class from its defining module: recent statsmodels releases
    # no longer re-export the upper-case model classes from
    # statsmodels.formula.api, which makes `smapi.OLS` raise AttributeError.
    from statsmodels.regression.linear_model import OLS
    return OLS(y, X)

## Test the model 
### Use `sklearn.model_selection.train_test_split` to run some experiments and validate the models

In [None]:
def rmse(prediction, exact):
    """Root-mean-square error between ``prediction`` and ``exact``.

    Both arguments must support element-wise subtraction with each other
    (scalars, numpy arrays, pandas Series, ...).
    """
    residuals = prediction - exact
    mean_squared_error = np.mean(residuals ** 2.0)
    return mean_squared_error ** 0.5

def run_experiment(estimator, scoring=rmse, ncols=21, random_state=None):
    """Fit and score ``estimator`` on one random hold-out split of the training data.

    Parameters
    ----------
    estimator : callable
        Called as ``estimator(X, y)``; must return an object whose ``.fit()``
        result offers ``.predict(X)`` and ``.condition_number``.
    scoring : callable
        Called as ``scoring(prediction, exact)``; defaults to ``rmse``.
    ncols : int
        Number of PCA components used to build the design matrix.
    random_state : int, numpy.random.RandomState or None
        Forwarded to ``train_test_split``. ``None`` (the default) keeps the
        previous behaviour of an unseeded, non-reproducible split.

    Returns
    -------
    tuple
        ``(score on the held-out rows, condition number of the fitted model)``.
    """
    # Only the training partition has a known target, so the hold-out split
    # is carved out of it; the test design matrix is ignored here.
    yfull, Xfull, _ = get_pca_dmatrices(dfnum_t2, ncols)
    Xtrain, Xtest, ytrain, ytest = sk.model_selection.train_test_split(
        Xfull, yfull, random_state=random_state)
    model = estimator(Xtrain, ytrain).fit()
    return scoring(model.predict(Xtest), ytest), model.condition_number


def cross_validate(estimator, cv=5, ncols=21, random_state=None):
    """Repeat ``run_experiment`` ``cv`` times on independent random splits.

    Note that this is repeated random hold-out validation rather than k-fold
    cross-validation: every repetition draws a fresh train/test split.

    Parameters
    ----------
    estimator : callable
        Passed through to ``run_experiment``.
    cv : int
        Number of repetitions.
    ncols : int
        Number of PCA components used to build the design matrix.
    random_state : int or None
        Seed for the sequence of splits. ``None`` (the default) keeps the
        previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        One row per repetition: ``[score, condition number]``.
    """
    # A single generator is shared by all repetitions so that the splits
    # differ from one another while the whole sequence stays reproducible.
    rng = None if random_state is None else np.random.RandomState(random_state)
    exps = np.array([run_experiment(estimator, ncols=ncols, random_state=rng)
                     for _ in range(cv)])
    return exps

In [None]:
# Average over the repetitions: [mean score, mean condition number].
cross_validate(ols).mean(axis=0)

In [None]:
# Sweep the number of PCA components from 1 up to the number of columns
# minus one, and keep the repetition-averaged [score, condition number]
# obtained for each setting.
ncols_lst = list(range(1, dfnum_t2.shape[1]))
mean_errs = [cross_validate(ols, ncols=n_components).mean(axis=0)
             for n_components in ncols_lst]

In [None]:
# Transpose so that row 0 holds the mean scores and row 1 the mean condition
# numbers, one column per entry of ncols_lst.
# NOTE(review): this cell is not idempotent; running it a second time
# transposes the array back.
mean_errs = np.array(mean_errs).T

In [None]:
# Mean score (RMSE) for each number of PCA components in ncols_lst.
mean_errs[0]

Twenty features strike a good balance between low error and low condition number.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Label each panel through its own Axes object. plt.xlabel / plt.ylabel act on
# the *current* axes only (the last subplot created), so using them here left
# the first panel unlabelled and overwrote 'Mean error' on the second one.
axes[0].plot(ncols_lst, mean_errs[0])
axes[0].set_xlabel('Number of PCA features')
axes[0].set_ylabel('Mean error')

axes[1].plot(ncols_lst, mean_errs[1])
axes[1].set_xlabel('Number of PCA features')
axes[1].set_ylabel('Condition number')


### Use `sklearn.model_selection.cross_val_score` to validate the models
An alternative way to cross validate. More direct but not as flexible as the previous one (can't easily get the condition number). 

In [None]:
# Same sweep over the number of PCA components as before, but scored with
# scikit-learn's 10-fold cross_val_score. The results are only printed, so
# `mean_errs` from the previous sweep is deliberately left untouched
# (resetting it here made the plotting cell above fail when re-run).
ncols_lst = list(range(1, dfnum_t2.shape[1]))
for ncols in ncols_lst:
    yfull, Xfull, _ = get_pca_dmatrices(dfnum_t2, ncols)
    neg_mse_per_fold = sk.model_selection.cross_val_score(
        samlib.Regressor(ols),
        Xfull,
        y=yfull,
        scoring='neg_mean_squared_error',
        cv=10)
    # The scorer returns the negated MSE of each fold; flip the sign and take
    # the square root to get a per-fold RMSE, then report the mean over folds.
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    print(rmse_per_fold.mean())

## Make a submission
Let's use 20 features.

In [None]:
# Number of PCA components retained for the submission; also embedded in the
# names of the files written below.
ncols=20
ytrain, Xtrain, Xtest = get_pca_dmatrices(dfnum_t2, ncols)

In [None]:
# Unfitted OLS model of the training target on the training design matrix.
model = ols(Xtrain, ytrain)

In [None]:
# Fit on the training rows, then predict for the test design matrix. The
# predictions are on the same scale as the 'SalePrice' column of dfnum_t2.
reg = model.fit()
submission_t = reg.predict(Xtest)

In [None]:
# Map the predictions back to the original price scale.
# NOTE(review): presumably this inverts a log1p transform of a price expressed
# in thousands; confirm against the notebook that generated the dataset.
submission = np.expm1(submission_t) * 1000
submission.head()

In [None]:
def save(filename, submission, first_id=1461):
    """Write predictions to a two-column ``Id,SalePrice`` CSV file.

    Parameters
    ----------
    filename : str
        Path of the CSV file to create.
    submission : sequence of numbers (list, numpy array or pandas Series)
        Predicted values, in row order.
    first_id : int
        Id assigned to the first prediction; later rows get consecutive Ids.
        Defaults to 1461, the value that was previously hard-coded.
    """
    # Drop any pandas index so predictions are paired with Ids by position.
    prices = np.asarray(submission)
    ids = np.arange(len(prices)) + first_id
    df = pd.DataFrame({"Id": ids, "SalePrice": prices},
                      columns=["Id", "SalePrice"])
    df.to_csv(filename, index=False)
    
# The number of PCA components is embedded in the submission file name.
save('ols_key_numerical_features_only_imputed_pca_ncols={}.csv'.format(ncols), submission)

## Regression interpretation
Statsmodels has special plots to explore the outcome of a regression model
http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/example_regression_plots.html

# Save PCA numerical features


In [None]:
# Stack the train and test design matrices back into a single frame, tagging
# each part with its partition label as the outer index level.
df = pd.concat([Xtrain, Xtest], keys=['train', 'test'])
# Name the index levels like the index of the input dataset.
df.index.names = 'Dataset', 'Id'

In [None]:
# Rebind instead of dropping in place, and tolerate an already-removed column,
# so that this cell can be re-run without raising a KeyError.
df = df.drop('Intercept', axis=1, errors='ignore')
# Only the 'train' rows have a known target; the 'test' rows are left as NaN.
df.loc['train','SalePrice'] = ytrain.values

In [None]:
# Preview the frame that is about to be written to disk.
df.head()

In [None]:
# index=True writes the (Dataset, Id) index levels as columns, so the file can
# be read back with the same index_col as the input dataset.
df.to_csv('transformed_numerical_dataset_imputed_pca_ncols={}.csv'.format(ncols), index=True)