In [None]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smapi

import sklearn as sk
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.base

import patsy

sys.path.insert(1, os.path.join(sys.path[0], '..'))
import samlib

# Load data
The data loaded below was generated in the notebook `data_exploration_numerical_features.ipynb`.

In [None]:
# Load the numerical dataset; rows are indexed by the (Dataset, Id) column pair.
dfnum = pd.read_csv('transformed_numerical_dataset_imputed.csv', index_col=['Dataset','Id'])

In [None]:
# Preview the first rows of the numerical dataset.
dfnum.head()

In [None]:
# Show every column label except the last one.
dfnum.columns[:-1]

In [None]:
# Move 'SalePrice' to the front, keeping the other columns in their current order.
# The remaining columns are selected by name rather than by assuming 'SalePrice'
# is the last column, so this cell is idempotent: re-running it neither duplicates
# 'SalePrice' nor drops the last feature.
dfnum = dfnum[['SalePrice'] + [col for col in dfnum.columns if col != 'SalePrice']]

In [None]:
# Load the categorical dataset, indexed by the same (Dataset, Id) column pair.
# NOTE(review): the file name suggests the columns are sorted by "goodness" — confirm
# against the notebook that produced it.
dfcat = pd.read_csv('cleaned_categorical_vars_with_colz_sorted_by_goodness.csv', index_col=['Dataset','Id'])

In [None]:
# Preview the first rows of the categorical dataset.
dfcat.head()

In [None]:
def get_dataframe(nnum=None, ncat=None):
    """Build the training target and the train/test design matrices.

    The first ``nnum`` columns of the global ``dfnum`` and the first ``ncat``
    columns of the global ``dfcat`` are concatenated side by side. The
    ``'SalePrice'`` column must be among the selected columns: it is used as the
    target and excluded from the predictors.

    Parameters
    ----------
    nnum : int, optional
        Number of leading ``dfnum`` columns to keep (all of them when ``None``).
    ncat : int, optional
        Number of leading ``dfcat`` columns to keep (all of them when ``None``).

    Returns
    -------
    tuple
        ``(ytrain, Xtrain, Xtest)``: the single-column ``'SalePrice'`` frame for
        the ``'train'`` rows, and the patsy design matrices for the ``'train'``
        and ``'test'`` rows.
    """
    num_count = dfnum.shape[1] if nnum is None else nnum
    cat_count = dfcat.shape[1] if ncat is None else ncat

    combined = pd.concat(
        [dfnum.iloc[:, :num_count], dfcat.iloc[:, :cat_count]],
        axis=1,
    )

    # Every selected column except the target becomes a term of the patsy formula.
    predictors = combined.drop('SalePrice', axis=1).columns
    design = patsy.dmatrix(' + '.join(predictors), combined, return_type='dataframe')

    # 'train' / 'test' are labels of the outer index level.
    target_train = combined.loc['train'][['SalePrice']]
    return target_train, design.loc['train'], design.loc['test']

## Cross validation

In [None]:
def model(X, y):
    """Return an unfitted statsmodels OLS model for design matrix ``X`` and target ``y``.

    Note the argument order: this factory takes ``(X, y)``, whereas ``OLS``
    itself expects the target first.

    The array-based ``OLS`` class is taken from ``statsmodels.api``; recent
    statsmodels releases expose only the formula-based lowercase ``ols`` in
    ``statsmodels.formula.api``, so ``smapi.OLS`` raises ``AttributeError`` there.
    """
    return sm.OLS(y, X)

In [None]:
# (rows, columns) of the categorical dataset.
dfcat.shape

In [None]:
# (rows, columns) of the numerical dataset.
dfnum.shape

In [None]:
# Candidate feature counts for the cross-validation grid.
# Numerical: only the two largest column-prefix sizes of dfnum are tried.
num_cols = dfnum.shape[1]
n1 = [num_cols - 1, num_cols]
# Categorical: 2 through 15 leading columns of dfcat.
n2 = np.arange(2, 16)
# Coordinate grids with one row per n2 value and one column per n1 value.
N1, N2 = np.meshgrid(n1, n2)

In [None]:
# Shapes of the two coordinate grids.
N1.shape, N2.shape

In [None]:
# Cross-validate the OLS model for every (ncat, nnum) combination.
# Despite the "mse" names, the stored score is the square root of each fold's
# mean squared error, averaged over the 5 folds.
mselst = []
for ncat in n2:
    row = []
    for nnum in n1:
        ytrain, Xtrain, _ = get_dataframe(nnum=nnum, ncat=ncat)
        fold_neg_mse = sk.model_selection.cross_val_score(
            samlib.Regressor(model),
            Xtrain,
            y=ytrain,
            scoring='neg_mean_squared_error',
            cv=5,
        )
        # The scorer returns negated MSE, hence the sign flip before the root.
        mse = np.sqrt(-fold_neg_mse).mean()
        print(nnum, ncat, mse)
        row.append(mse)
    mselst.append(row)
# One row per ncat value, one column per nnum value.
msearr = np.array(mselst)

In [None]:
# Heat-map of the cross-validation error over the feature-count grid.
fig, ax = plt.subplots()
# N1 holds the number of leading dfnum columns, which includes the SalePrice
# target itself; subtract 1 to plot the number of numerical predictors.
mesh = ax.pcolormesh(N1 - 1, N2, msearr)
fig.colorbar(mesh, ax=ax, label='Mean cross-validated RMSE')
ax.set_title('OLS cross-validation error by number of features')
ax.set_xlabel('Number of numerical variables')
ax.set_ylabel('Number of categorical variables');

## Submission

In [None]:
# Number of leading categorical columns used for the submission model;
# every numerical column is kept (nnum=None selects all of them).
ncat = 5
ytrain, Xtrain, Xtest = get_dataframe(nnum=None, ncat=ncat)

As can be seen below, using more numerical features improves R-squared to 0.88, which is pretty good, though there is of course a risk of overfitting.

In [None]:
# Fit OLS on the full training design matrix.
# The array-based OLS class comes from statsmodels.api; recent statsmodels
# releases do not expose it from statsmodels.formula.api.
regression2 = sm.OLS(ytrain, Xtrain).fit()
regression2.summary()

In [None]:
# Predict the target for the test design matrix with the fitted model.
ytest = regression2.predict(Xtest)
ytest

In [None]:
# Map predictions back to the submission scale: apply expm1 (the inverse of log1p)
# and multiply by 1000.
# NOTE(review): presumably SalePrice was modelled as log1p(price / 1000) — confirm
# against the notebook that produced the numerical dataset.
submission = np.expm1(ytest) * 1000
# Preview the first ten predictions.
submission[:10]

In [None]:
def save(filename, submission, first_id=1461):
    """Write predictions to ``filename`` as a two-column CSV (``Id``, ``SalePrice``).

    Parameters
    ----------
    filename : str or path-like
        Destination CSV path.
    submission : 1-D sequence, array or Series
        Predicted values, one per output row, in output order. Any index carried
        by the input is ignored; ids are assigned purely by position.
    first_id : int, default 1461
        ``Id`` written for the first prediction; following rows get consecutive
        integers. The default reproduces the previously hard-coded offset.
    """
    # Drop any pandas index so ids and values are paired strictly by position.
    prices = np.asarray(submission)
    ids = np.arange(first_id, first_id + len(prices))
    pd.DataFrame({"Id": ids, "SalePrice": prices}).to_csv(filename, index=False)
    
# Write the submission file; the file name records the number of categorical features used.
save('ols_full_{}.csv'.format(ncat), submission)

## Regression interpretation
Statsmodels has special plots to explore the outcome of a regression model:
http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/example_regression_plots.html