# Ames Housing Project

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV,Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from scipy import stats
import math
import statsmodels.api as sm

In [7]:
train = pd.read_csv('../datasets/train_clean.csv')
test = pd.read_csv('../datasets/test_clean.csv')

In [8]:
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

## Modeling!

In [10]:
X = train[['TotalSF', 'Overall Qual', 'Gr Liv Area',
          'Overall Garage', 'Recently Blt/Remod', 'Bathrooms',
           'TotRms AbvGrd', 'Foundation_PConc', 'MS Zoning_RM',
          'Overall Great Fireplace', 'Exter Qual_TA', 'Heating QC_Ex',
          'Total Porch SF', 'Kitchen Qual_TA', 'Neighborhood_NridgHt',
          'Bsmt Qual_Ex']]
y = train['SalePrice']
lr = LinearRegression()
poly = PolynomialFeatures()
X_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, train_size=.15)
lr.fit(X_poly, y)

LinearRegression()

In [11]:
cross_val_score(lr,X_train,y_train,cv=10)

array([ 0.73764185,  0.80377019,  0.63897118,  0.77761964,  0.02197006,
       -1.02066578,  0.68077126,  0.80028514,  0.10554889,  0.47103851])

In [12]:
predictions = lr.predict(X_test)

In [13]:
np.sqrt(mean_squared_error(np.exp(y_test),np.exp(predictions)))

22024.322922318162

In [14]:
lr.score(X_train,y_train), lr.score(X_test,y_test)

(0.9000762616549725, 0.9107669509023978)

### Trying a Ridge Model

In [15]:
X_overfit = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_overfit,
    y,
)
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)        # NOTE: fit transform on X_train
Z_test = sc.transform(X_test)   

In [16]:
ols = LinearRegression()
ols.fit(X_train,y_train)

LinearRegression()

In [17]:
ridge_model = Ridge(alpha = 10)
# Fit.
ridge_model.fit(Z_train, y_train)
# Evaluate model using R2 bc this is a regression task
print(ridge_model.score(Z_train, y_train))
print(ridge_model.score(Z_test,y_test))

0.9052682681113022
0.8872708775229174


In [18]:
r_alphas = np.logspace(0,5,100) #alphas range from 1 to 10^5

# Cross-validate over our list of ridge alphas.
ridge_cv = RidgeCV(alphas = r_alphas,
                  scoring = 'r2',
                  cv = 5)

# Fit model using best ridge alpha!
ridge_cv.fit(Z_train,y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.14975700e+0...
       6.89261210e+03, 7.74263683e+03, 8.69749003e+03, 9.77009957e+03,
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]),
        cv=5, scoring='r2')

In [19]:
ridge_model = Ridge(alpha = ridge_cv.alpha_)
# Fit.
ridge_model.fit(Z_train, y_train)
# Evaluate model using R2 b/c this is a regression task
print(ridge_model.score(Z_train, y_train))
print(ridge_model.score(Z_test, y_test))

0.9037791077319729
0.8883500997952362


In [20]:
pred = ridge_model.predict(Z_train) 
np.sqrt(mean_squared_error(np.exp(y_train),np.exp(pred)))

22782.875967009943

### Let's try a LASSO

In [21]:
# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(-5, 2, 1000)

# Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=50000, tol=0.1, selection='random', n_jobs=-1)

# Fit model using best ridge alpha!
lasso_cv.fit(Z_train, y_train);

In [22]:
lasso_pred = lasso_cv.predict(Z_train)

In [23]:
np.sqrt(mean_squared_error(np.exp(y_train),np.exp(lasso_pred)))

26899.799328385892

## Kaggle Submission!
![](https://media3.giphy.com/media/WUq1cg9K7uzHa/giphy-downsized.gif?cid=6104955ebadd78bc20ca503801d047c1b0b0a38f0d948954&rid=giphy-downsized.gif)

In [25]:
X_kaggle = test[['TotalSF', 'Overall Qual', 'Gr Liv Area',
          'Overall Garage', 'Recently Blt/Remod', 'Bathrooms',
           'TotRms AbvGrd', 'Foundation_PConc', 'MS Zoning_RM',
          'Overall Great Fireplace', 'Exter Qual_TA', 'Heating QC_Ex',
          'Total Porch SF', 'Kitchen Qual_TA', 'Neighborhood_NridgHt',
          'Bsmt Qual_Ex']]
kaggle_submission = pd.DataFrame()
kaggle_predictions = lr.predict(poly.transform(X_kaggle)) #Must apply PolyFeatures to test data.
kaggle_submission['Id'] = test['Id']
kaggle_submission['SalePrice'] = np.exp(kaggle_predictions) #Becuase SalePrice in log form
kaggle_submission.to_csv('../datasets/kaggle_submission_1.csv',index=False)