<a href="https://colab.research.google.com/github/sameliason/kaggle/blob/main/kaggle_comp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Kaggle Competition ECON 484

In [None]:
# Packages used throughout the notebook, followed by the Google Drive mount
# that makes the CSV files under /content/gdrive readable.

import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.model_selection import train_test_split

# Mount Drive only after every import has succeeded.
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the labelled training data and the competition test data from Drive,
# then take a first look at how they are laid out.
housing = pd.read_csv('/content/gdrive/My Drive/Econ 484/psets/train.csv')
final_test = pd.read_csv('/content/gdrive/My Drive/Econ 484/psets/test.csv')

# (rows, columns) of each frame, one per line
print(housing.shape, final_test.shape, sep="\n")

# first rows of the training data, then its shape again with a label
print(housing.head())
print("Shape: {}".format(housing.shape))

(1460, 81)
(1459, 80)
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  

In [None]:
# Let's clean the data!
# Builds the training feature matrix X_new from `housing`:
#   1. drop the target and one-hot encode the categorical columns
#   2. replace missing values with 0
#   3. add standardized pairwise interactions of the float columns
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# one-hot encode everything except the target, then fill the gaps
dummies_X = pd.get_dummies(housing.drop(columns=['SalePrice']), drop_first=True)
X = dummies_X.fillna(0)

# only the float-typed columns feed the interaction terms
numeric_cols = X.select_dtypes(include=['float']).columns
X_float = X[numeric_cols].to_numpy()

# degree-2, interaction-only expansion (no bias column, no squared terms)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_float)

# standardize the expanded block; its columns are labelled by position (0, 1, ...)
scaler = StandardScaler()
X_standard = pd.DataFrame(scaler.fit_transform(X_poly))

# NOTE(review): `numeric_cols`, `poly` and `scaler` are fitted on the training
# data; the positional column names only line up with another frame if that
# frame is transformed with these same fitted objects.

# append the interaction block to the main features and make every label a string
X_new = pd.concat([X, X_standard], axis=1).rename(columns=str)


In [None]:
# Apply the *training* preprocessing to the competition test set.
#
# The interaction features are labelled by position ("0", "1", ...), so they
# only mean the same thing in both frames when they are built from the same
# float columns, in the same order, using the transformers fitted on the
# training data. Selecting the float columns again on the test frame picks a
# different set (e.g. TotalBsmtSF is float in the test frame but integer in
# the training frame), and re-fitting the scaler standardizes with test-set
# statistics. Both would feed the model features it was not trained on, so
# this cell reuses `numeric_cols`, `poly` and `scaler` from the training cell.

# one-hot encode the categoricals and fill missing values, as for the training data
dummies_final_test = pd.get_dummies(final_test, drop_first=True)
final_test = dummies_final_test.fillna(0)

# same float columns, same order as in training (raises KeyError if one is absent)
final_test_float = final_test[numeric_cols].values

# transform only: the interaction expansion and the scaling were fitted on the training data
final_test_poly = poly.transform(final_test_float)
final_test_standard = pd.DataFrame(
    scaler.transform(final_test_poly),
    index=final_test.index,  # keep rows aligned with final_test for the concat below
)

# concat the interactions with the main features
final_test_new = pd.concat([final_test, final_test_standard], axis=1)

# all column labels as strings, matching X_new
final_test_new.columns = final_test_new.columns.astype(str)

In [None]:
# Give both feature matrices the identical set of columns: take the union of
# their column labels and fill any column a frame lacks with zeros.
X_new, final_test_new = X_new.align(
    final_test_new,
    join='outer',
    axis=1,
    fill_value=0,
)

In [None]:
# Display the training feature matrix after column alignment as a sanity check.
X_new

Unnamed: 0,0,1,10,11,12,13,14,15,16,17,...,SaleType_WD,ScreenPorch,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities_NoSeWa,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,0.212877,0.514104,0,0,0,0,0,0,0,0,...,1,0,1,8,856,0,0,2003,2003,2008
1,0.645747,-0.570750,0,0,0,0,0,0,0,0,...,1,0,1,6,1262,0,298,1976,1976,2007
2,0.299451,0.325915,0,0,0,0,0,0,0,0,...,1,0,1,6,920,0,0,2001,2002,2008
3,0.068587,-0.570750,0,0,0,0,0,0,0,0,...,1,0,1,7,756,0,0,1915,1970,2006
4,0.761179,1.366489,0,0,0,0,0,0,0,0,...,1,0,1,9,1145,0,192,2000,2000,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.126303,-0.570750,0,0,0,0,0,0,0,0,...,1,0,1,7,953,0,0,1999,2000,2007
1456,0.790037,0.087911,0,0,0,0,0,0,0,0,...,1,0,1,7,1542,0,349,1978,1988,2010
1457,0.241735,-0.570750,0,0,0,0,0,0,0,0,...,1,0,1,9,1152,0,0,1941,2006,2010
1458,0.299451,-0.570750,0,0,0,0,0,0,0,0,...,1,0,1,5,1078,0,366,1950,1996,2010


In [None]:
# Hold out part of the labelled data so a test error can be estimated before
# the model is run on the final competition test set.

y = housing['SalePrice']  # regression target
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    random_state=42,  # fixed seed: the same split on every run
)

In [None]:
# Random forest baseline
from sklearn.ensemble import RandomForestRegressor

# random_state pins the bootstrap samples and feature sub-sampling so the
# scores below (and the grid search that clones this estimator) are
# reproducible, consistent with the seeded train/test split.
forest = RandomForestRegressor(max_depth=7, max_features='sqrt', random_state=42)
forest.fit(X_train, y_train)

# NOTE: for a regressor, `.score` returns the coefficient of determination
# (R^2), not a classification accuracy, despite the label in the output.
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))

Accuracy on training set: 0.906
Accuracy on test set: 0.826


In [None]:
# Cross-validate to pick the tuning parameters of the forest.
from sklearn.model_selection import GridSearchCV

# candidate values: every combination of these is evaluated
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [2, 4, 5, 6, 7, 8, 9, 10, 12],
}

# 5-fold search over the grid, starting from the `forest` estimator
grid_search = GridSearchCV(forest, param_grid, cv=5, return_train_score=True)
best_model = grid_search.fit(X_train, y_train)

# the winning combination
print("Best max depth: ", best_model.best_params_['max_depth'])
print("Best number of estimators: ", best_model.best_params_['n_estimators'])

# score of the refitted best estimator on the training and held-out splits
train_score = best_model.score(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(train_score))
test_score = best_model.score(X_test, y_test)
print("Accuracy on test set: {:.3f}".format(test_score))

Best max depth:  12
Best number of estimators:  150
Accuracy on training set: 0.966
Accuracy on test set: 0.853


In [None]:
# Re-run the grid search on all labelled rows (X_new, y) rather than only the
# training split; `best_model` now refers to the search fitted on the full data.
best_model = grid_search.fit(X_new, y)

In [None]:
# Display the competition test feature matrix that will be passed to predict().
final_test_new

Unnamed: 0,0,1,10,11,12,13,14,15,16,17,...,SaleType_WD,ScreenPorch,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities_NoSeWa,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,0.684849,-0.563316,1.185511,-0.427068,0.278092,0.694848,-0.329125,0.141855,-0.639824,-0.217973,...,1,120,1,5,882.0,0,140,1961,1961,2010
1,0.715852,0.047057,-0.738518,0.154877,1.211177,-0.249463,-0.020504,0.837380,-0.639824,-0.217973,...,1,0,1,6,1329.0,0,393,1958,1958,2010
2,0.498831,-0.563316,0.043982,-0.427068,0.805448,-0.249463,-0.642559,0.106450,-0.639824,-0.217973,...,1,0,1,6,928.0,0,212,1997,1998,2010
3,0.622843,-0.450284,-0.011253,-0.323292,0.515994,-0.249463,-0.228712,0.173135,-0.639824,-0.217973,...,1,0,1,7,926.0,0,360,1998,1998,2010
4,-0.462261,-0.563316,0.154452,-0.427068,-0.375191,-0.249463,0.276058,-0.149190,-0.639824,-0.217973,...,1,144,1,5,1280.0,0,0,1992,1992,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,-1.144325,-0.563316,-2.174636,-0.427068,-0.657919,-0.249463,-0.606244,-0.966327,-0.639824,-0.217973,...,1,0,1,5,546.0,0,0,1970,1970,2006
1455,-1.144325,-0.563316,-0.858195,-0.427068,-0.525618,-0.249463,-0.750956,-0.966327,-0.639824,-0.217973,...,1,0,1,6,546.0,0,0,1970,1970,2006
1456,3.165085,-0.563316,0.476658,-0.427068,4.238137,-0.249463,-0.919787,2.491215,3.346417,-0.217973,...,1,0,1,7,1224.0,0,474,1960,1996,2006
1457,0.126795,-0.563316,-2.174636,-0.427068,-0.135563,-0.249463,0.055079,-0.120985,-0.639824,3.686467,...,1,0,1,6,912.0,0,80,1992,1992,2006


In [None]:
# Reminder: any pre-processing applied to train.csv has to be applied in
# exactly the same way to the test data before predicting.

# Predicted sale prices for the competition test set
pred = best_model.predict(final_test_new)

# Take the Id column of the sample submission to index the predictions
SAMPLE = pd.read_csv('/content/gdrive/My Drive/Econ 484/psets/sample_submission.csv')
ids = SAMPLE['Id']

# One SalePrice column indexed by Id, written out as the submission file
pred = pd.DataFrame({'SalePrice': pred}, index=ids)
pred.to_csv('voting_submission.csv')
