In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, roc_curve
import matplotlib.pyplot as plt

from sklearn import utils

In [2]:
df_properties = pd.read_csv('./properties_2017.csv', low_memory=False)
df_properties.shape

(2985217, 58)

In [3]:
df_train =  pd.read_csv('./train_2017.csv')
df_train.shape

(77613, 3)

In [4]:
df_properties.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2016.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,5.0,,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,6.0,,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


In [5]:
df_train.head()

Unnamed: 0,parcelid,logerror,transactiondate
0,14297519,0.025595,2017-01-01
1,17052889,0.055619,2017-01-01
2,14186244,0.005383,2017-01-01
3,12177905,-0.10341,2017-01-01
4,10887214,0.00694,2017-01-01


In [6]:
# Since theres a large amount of differences in entries
# We merge it, to reduce the amount of entries we don't need
joined_data_set = pd.merge(df_properties, df_train)
joined_data_set.shape

(77613, 60)

In [7]:
# Now we take the data apart so we have usable data
X = joined_data_set[df_properties.keys()]
print(X.shape)

y = joined_data_set[df_train.keys()]
print(y.shape)

(77613, 58)
(77613, 3)


In [8]:
# Now we remove data thats hard to work with
y = y.drop('transactiondate', 1)
y = y.drop('parcelid', 1)

# Replacing NaN for all
X = X.fillna(0)

X = X.drop('hashottuborspa', 1)
X = X.drop('propertycountylandusecode', 1)
X = X.drop('propertyzoningdesc', 1)
X = X.drop('fireplaceflag', 1)
X = X.drop('taxdelinquencyflag', 1)
X = X.drop('parcelid', 1)

print(X.shape)
print(y.shape)

# And scale
X = preprocessing.scale(X)

(77613, 52)
(77613, 1)




In [9]:
# Splitting into Testing and Training Data
testSize = 0.25
randomState = 100
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=randomState)

In [10]:
# Defining Regression Methods
my_linreg = linear_model.LinearRegression()
my_ridge = linear_model.Ridge(alpha=.5)
my_ridge_cv = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
my_lassolars = linear_model.LassoLars(alpha=.1)
my_lassolars_cv = linear_model.LassoLarsCV(cv=10)

In [11]:
# Training Models
my_linreg.fit(X_train, y_train)
my_ridge.fit(X_train, y_train)
my_ridge_cv.fit(X_train, y_train)
my_lassolars.fit(X_train, y_train)
my_lassolars_cv.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LassoLarsCV(copy_X=True, cv=10, eps=2.220446049250313e-16, fit_intercept=True,
            max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True,
            positive=False, precompute='auto', verbose=False)

In [12]:
my_linreg_pred = my_linreg.predict(X_test)
my_ridge_pred = my_ridge.predict(X_test)
my_ridge_cv_pred = my_ridge_cv.predict(X_test)
my_lassolars_pred = my_lassolars.predict(X_test)
my_lassolars_cv_pred = my_lassolars_cv.predict(X_test)

In [13]:
# Calculating RMSE
mse_linreg = metrics.mean_squared_error(y_test, my_linreg_pred)
mse_ridge = metrics.mean_squared_error(y_test, my_ridge_pred)
mse_ridge_cv = metrics.mean_squared_error(y_test, my_ridge_cv_pred)
mse_lassolars = metrics.mean_squared_error(y_test, my_lassolars_pred)
mse_lassolars_cv = metrics.mean_squared_error(y_test, my_lassolars_cv_pred)

rmse_linreg = np.sqrt(mse_linreg)
rmse_ridge = np.sqrt(mse_ridge)
rmse_ridge_cv = np.sqrt(mse_ridge_cv)
rmse_lassolars = np.sqrt(mse_lassolars)
rmse_lassolars_cv = np.sqrt(mse_lassolars_cv)

print("RSME Value Using Liear Regression: ", rmse_linreg)
print("RSME Value Using Ridge Regression: ", rmse_ridge)
print("RSME Value Using Ridge Regression Cross Validation: ", rmse_ridge_cv)
print("RSME Value Using Lasso Lars: ", rmse_lassolars)
print("RSME Value Using Lasso Lars Cross Validation: ", rmse_lassolars_cv)

RSME Value Using Liear Regression:  0.17662309017944727
RSME Value Using Ridge Regression:  0.17661671553509561
RSME Value Using Ridge Regression Cross Validation:  0.17623084579531348
RSME Value Using Lasso Lars:  0.17642723889508136
RSME Value Using Lasso Lars Cross Validation:  0.17620702919334602


In [14]:
# What happens if we use the 2016 data?
df_properties = pd.read_csv('./properties_2016.csv', low_memory=False)
print(df_properties.shape)

df_train =  pd.read_csv('./train_2016_v2.csv')
print(df_train.shape)

(2985217, 58)
(90275, 3)


In [15]:
# Going through the same procedures
joined_data_set = pd.merge(df_properties, df_train)
print(joined_data_set.shape)

X_2 = joined_data_set[df_properties.keys()]
print(X_2.shape)

y_2 = joined_data_set[df_train.keys()]
print(y_2.shape)

# Now we remove data thats hard to work with
y_2 = y_2.drop('transactiondate', 1)
y_2 = y_2.drop('parcelid', 1)

# Replacing NaN for all
X_2 = X_2.fillna(0)

X_2 = X_2.drop('hashottuborspa', 1)
X_2 = X_2.drop('propertycountylandusecode', 1)
X_2 = X_2.drop('propertyzoningdesc', 1)
X_2 = X_2.drop('fireplaceflag', 1)
X_2 = X_2.drop('taxdelinquencyflag', 1)
X_2 = X_2.drop('parcelid', 1)

print(X_2.shape)
print(y_2.shape)

# And scale
X_2 = preprocessing.scale(X_2)

(90275, 60)
(90275, 58)
(90275, 3)
(90275, 52)
(90275, 1)




In [16]:
testSize = 0.25
randomState = 100
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=testSize, random_state=randomState)

In [17]:
my_linreg.fit(X_train_2, y_train_2)
my_ridge.fit(X_train_2, y_train_2)
my_ridge_cv.fit(X_train_2, y_train_2)
my_lassolars.fit(X_train_2, y_train_2)
my_lassolars_cv.fit(X_train_2, y_train_2)

  y = column_or_1d(y, warn=True)


LassoLarsCV(copy_X=True, cv=10, eps=2.220446049250313e-16, fit_intercept=True,
            max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True,
            positive=False, precompute='auto', verbose=False)

In [18]:
my_linreg_pred = my_linreg.predict(X_test_2)
my_ridge_pred = my_ridge.predict(X_test_2)
my_ridge_cv_pred = my_ridge_cv.predict(X_test_2)
my_lassolars_pred = my_lassolars.predict(X_test_2)
my_lassolars_cv_pred = my_lassolars_cv.predict(X_test_2)

In [19]:
mse_linreg_2 = metrics.mean_squared_error(y_test_2, my_linreg_pred)
mse_ridge_2 = metrics.mean_squared_error(y_test_2, my_ridge_pred)
mse_ridge_cv_2 = metrics.mean_squared_error(y_test_2, my_ridge_cv_pred)
mse_lassolars_2 = metrics.mean_squared_error(y_test_2, my_lassolars_pred)
mse_lassolars_cv_2 = metrics.mean_squared_error(y_test_2, my_lassolars_cv_pred)

rmse_linreg_2 = np.sqrt(mse_linreg_2)
rmse_ridge_2 = np.sqrt(mse_ridge_2)
rmse_ridge_cv_2 = np.sqrt(mse_ridge_cv_2)
rmse_lassolars_2 = np.sqrt(mse_lassolars_2)
rmse_lassolars_cv_2 = np.sqrt(mse_lassolars_cv_2)

print("RSME Value Using Liear Regression: ", rmse_linreg_2)
print("RSME Value Using Ridge Regression: ", rmse_ridge_2)
print("RSME Value Using Ridge Regression Cross Validation: ", rmse_ridge_cv_2)
print("RSME Value Using Lasso Lars: ", rmse_lassolars_2)
print("RSME Value Using Lasso Lars Cross Validation: ", rmse_lassolars_cv_2)

RSME Value Using Liear Regression:  0.16545118543063186
RSME Value Using Ridge Regression:  0.1654522451965398
RSME Value Using Ridge Regression Cross Validation:  0.16537956889615604
RSME Value Using Lasso Lars:  0.16574364014185616
RSME Value Using Lasso Lars Cross Validation:  0.1653608307445429


In [20]:
# Now lets merge the data and see what happens
X_test_c = np.concatenate((X_test, X_test_2))
X_train_c = np.concatenate((X_train, X_train_2))
y_test_c = np.concatenate((y_test, y_test_2))
y_train_c = np.concatenate((y_train, y_train_2))

In [21]:
my_linreg.fit(X_train_c, y_train_c)
my_ridge.fit(X_train_c, y_train_c)
my_ridge_cv.fit(X_train_c, y_train_c)
my_lassolars.fit(X_train_c, y_train_c)
my_lassolars_cv.fit(X_train_c, y_train_c)

  y = column_or_1d(y, warn=True)


LassoLarsCV(copy_X=True, cv=10, eps=2.220446049250313e-16, fit_intercept=True,
            max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True,
            positive=False, precompute='auto', verbose=False)

In [22]:
my_linreg_pred = my_linreg.predict(X_test_c)
my_ridge_pred = my_ridge.predict(X_test_c)
my_ridge_cv_pred = my_ridge_cv.predict(X_test_c)
my_lassolars_pred = my_lassolars.predict(X_test_c)
my_lassolars_cv_pred = my_lassolars_cv.predict(X_test_c)

In [23]:
mse_linreg_c = metrics.mean_squared_error(y_test_c, my_linreg_pred)
mse_ridge_c = metrics.mean_squared_error(y_test_c, my_ridge_pred)
mse_ridge_cv_c = metrics.mean_squared_error(y_test_c, my_ridge_cv_pred)
mse_lassolars_c = metrics.mean_squared_error(y_test_c, my_lassolars_pred)
mse_lassolars_cv_c = metrics.mean_squared_error(y_test_c, my_lassolars_cv_pred)

rmse_linreg_c = np.sqrt(mse_linreg_c)
rmse_ridge_c = np.sqrt(mse_ridge_c)
rmse_ridge_cv_c = np.sqrt(mse_ridge_cv_c)
rmse_lassolars_c = np.sqrt(mse_lassolars_c)
rmse_lassolars_cv_c = np.sqrt(mse_lassolars_cv_c)

print("RSME Value Using Liear Regression: ", rmse_linreg_c)
print("RSME Value Using Ridge Regression: ", rmse_ridge_c)
print("RSME Value Using Ridge Regression Cross Validation: ", rmse_ridge_cv_c)
print("RSME Value Using Lasso Lars: ", rmse_lassolars_c)
print("RSME Value Using Lasso Lars Cross Validation: ", rmse_lassolars_cv_c)

RSME Value Using Liear Regression:  0.17056420411775808
RSME Value Using Ridge Regression:  0.1705647371245366
RSME Value Using Ridge Regression Cross Validation:  0.17053042956242923
RSME Value Using Lasso Lars:  0.17079688665256818
RSME Value Using Lasso Lars Cross Validation:  0.17052868584471745


In [24]:
#now with the merged data lets take out ouliers with a zscore greater than 3
#load all data again
df_properties = pd.read_csv('./properties_2017.csv', low_memory=False)
print(df_properties.shape)

df_train =  pd.read_csv('./train_2017.csv')
print(df_train.shape)

joined_data_set = pd.merge(df_properties, df_train)
print(joined_data_set.shape)

X_z = joined_data_set[df_properties.keys()]
print(X_z.shape)

y_z = joined_data_set[df_train.keys()]
print(y_z.shape)

(2985217, 58)
(77613, 3)
(77613, 60)
(77613, 58)
(77613, 3)


In [25]:
df_properties = pd.read_csv('./properties_2016.csv', low_memory=False)
print(df_properties.shape)

df_train =  pd.read_csv('./train_2016_v2.csv')
print(df_train.shape)

joined_data_set = pd.merge(df_properties, df_train)
print(joined_data_set.shape)

X_z_temp = joined_data_set[df_properties.keys()]
print(X_z_temp.shape)

y_z_temp = joined_data_set[df_train.keys()]
print(y_z_temp.shape)

(2985217, 58)
(90275, 3)
(90275, 60)
(90275, 58)
(90275, 3)


In [26]:
X_z = X_z.append(X_z_temp)
X_z = X_z.fillna(0)
y_z = y_z.append(y_z_temp)
print(X_z.shape)
print(y_z.shape)

(167888, 58)
(167888, 3)


In [27]:
# Now we remove data thats hard to work with
y_z = y_z.drop('transactiondate', 1)


X_z = X_z.drop('hashottuborspa', 1)
X_z = X_z.drop('propertycountylandusecode', 1)
X_z = X_z.drop('propertyzoningdesc', 1)
X_z = X_z.drop('fireplaceflag', 1)
X_z = X_z.drop('taxdelinquencyflag', 1)

print(X_z.shape)
print(y_z.shape)

(167888, 53)
(167888, 2)


In [28]:
from scipy import stats
X_keys = X_z.keys()
y_keys = y_z.keys()

print(X_z.shape)
z = np.abs(stats.zscore(X_z))
print(z)
X_z = X_z[(z < 3).all(axis=1)]
print(X_z.shape)

(167888, 53)
[[ 1.34600081  0.30880747  0.04954193 ...  0.51085951  0.1667827
   0.20751942]
 [ 1.34625345  0.30880747  0.04954193 ...  0.06225182  0.1667827
   0.20751943]
 [ 1.3503535   0.30880747  0.04954193 ...  0.41591069  0.1667827
   0.2075201 ]
 ...
 [ 0.3266214   0.22244643  0.04954193 ...  0.12717605  0.1667827
   0.04376241]
 [ 0.43618012  0.30880747  0.04954193 ...  0.90380087  0.1667827
  13.35045316]
 [ 0.54416992  0.30880747  0.04954193 ...  0.07177166  0.1667827
  13.35045316]]
(127426, 53)


In [29]:
joined_data_set = pd.merge(X_z, y_z)
X_z = joined_data_set[X_keys]
print(X_z.shape)
y_z = joined_data_set[y_keys]
print(y_z.shape)

y_z = y_z.drop('parcelid', 1)
X_z = X_z.drop('parcelid', 1)

X_z = preprocessing.scale(X_z)

(131447, 53)
(131447, 2)




In [30]:
testSize = 0.25
randomState = 100
X_train_z, X_test_z, y_train_z, y_test_z = train_test_split(X_z, y_z, test_size=testSize, random_state=randomState)

In [31]:
my_linreg.fit(X_train_z, y_train_z)
my_ridge.fit(X_train_z, y_train_z)
my_ridge_cv.fit(X_train_z, y_train_z)
my_lassolars.fit(X_train_z, y_train_z)
my_lassolars_cv.fit(X_train_z, y_train_z)

  y = column_or_1d(y, warn=True)




LassoLarsCV(copy_X=True, cv=10, eps=2.220446049250313e-16, fit_intercept=True,
            max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True,
            positive=False, precompute='auto', verbose=False)

In [32]:
my_linreg_pred = my_linreg.predict(X_test_z)
my_ridge_pred = my_ridge.predict(X_test_z)
my_ridge_cv_pred = my_ridge_cv.predict(X_test_z)
my_lassolars_pred = my_lassolars.predict(X_test_z)
my_lassolars_cv_pred = my_lassolars_cv.predict(X_test_z)

In [33]:
mse_linreg_z = metrics.mean_squared_error(y_test_z, my_linreg_pred)
mse_ridge_z = metrics.mean_squared_error(y_test_z, my_ridge_pred)
mse_ridge_cv_z = metrics.mean_squared_error(y_test_z, my_ridge_cv_pred)
mse_lassolars_z = metrics.mean_squared_error(y_test_z, my_lassolars_pred)
mse_lassolars_cv_z = metrics.mean_squared_error(y_test_z, my_lassolars_cv_pred)

rmse_linreg_z = np.sqrt(mse_linreg_z)
rmse_ridge_z = np.sqrt(mse_ridge_z)
rmse_ridge_cv_z = np.sqrt(mse_ridge_cv_z)
rmse_lassolars_z = np.sqrt(mse_lassolars_z)
rmse_lassolars_cv_z = np.sqrt(mse_lassolars_cv_z)

print("RSME Value Using Liear Regression: ", rmse_linreg_z)
print("RSME Value Using Ridge Regression: ", rmse_ridge_z)
print("RSME Value Using Ridge Regression Cross Validation: ", rmse_ridge_cv_z)
print("RSME Value Using Lasso Lars: ", rmse_lassolars_z)
print("RSME Value Using Lasso Lars Cross Validation: ", rmse_lassolars_cv_z)

RSME Value Using Liear Regression:  29460451433.668846
RSME Value Using Ridge Regression:  0.1615783544013458
RSME Value Using Ridge Regression Cross Validation:  0.16159120033299065
RSME Value Using Lasso Lars:  0.16202534780364442
RSME Value Using Lasso Lars Cross Validation:  0.16158560181522003


In [34]:
# Removing < 30% & doing z score
df_properties = pd.read_csv('./properties_2017.csv', low_memory=False)
print(df_properties.shape)

df_train =  pd.read_csv('./train_2017.csv')
print(df_train.shape)

joined_data_set = pd.merge(df_properties, df_train)
print(joined_data_set.shape)

X_z = joined_data_set[df_properties.keys()]
print(X_z.shape)

y_z = joined_data_set[df_train.keys()]
print(y_z.shape)

df_properties = pd.read_csv('./properties_2016.csv', low_memory=False)
print(df_properties.shape)

df_train =  pd.read_csv('./train_2016_v2.csv')
print(df_train.shape)

joined_data_set = pd.merge(df_properties, df_train)
print(joined_data_set.shape)

X_z_temp = joined_data_set[df_properties.keys()]
print(X_z_temp.shape)

y_z_temp = joined_data_set[df_train.keys()]
print(y_z_temp.shape)

X_z = X_z.append(X_z_temp)
X_z = X_z.fillna(0)
y_z = y_z.append(y_z_temp)
print(X_z.shape)
print(y_z.shape)

y_z = y_z.drop('transactiondate', 1)

X_z = X_z.drop('hashottuborspa', 1)
X_z = X_z.drop('propertycountylandusecode', 1)
X_z = X_z.drop('propertyzoningdesc', 1)
X_z = X_z.drop('fireplaceflag', 1)
X_z = X_z.drop('taxdelinquencyflag', 1)

print(X_z.shape)
print(y_z.shape)

(2985217, 58)
(77613, 3)
(77613, 60)
(77613, 58)
(77613, 3)
(2985217, 58)
(90275, 3)
(90275, 60)
(90275, 58)
(90275, 3)
(167888, 58)
(167888, 3)
(167888, 53)
(167888, 2)


In [35]:
# Lets try that again, but with the columns with the most 0's removed
# to do this we are removing columns that have less than 30% valid entries
# as dictated on the kaggle website

X_z = X_z.drop('airconditioningtypeid', 1)
X_z = X_z.drop('architecturalstyletypeid', 1)
X_z = X_z.drop('basementsqft', 1)
X_z = X_z.drop('buildingclasstypeid', 1)
X_z = X_z.drop('decktypeid', 1)
X_z = X_z.drop('finishedfloor1squarefeet', 1)
X_z = X_z.drop('finishedsquarefeet13', 1)
X_z = X_z.drop('finishedsquarefeet15', 1)
X_z = X_z.drop('finishedsquarefeet50', 1)
X_z = X_z.drop('finishedsquarefeet6', 1)
X_z = X_z.drop('fireplacecnt', 1)
X_z = X_z.drop('poolcnt', 1)
X_z = X_z.drop('poolsizesum', 1)
X_z = X_z.drop('pooltypeid10', 1)
X_z = X_z.drop('pooltypeid2', 1)
X_z = X_z.drop('pooltypeid7', 1)
X_z = X_z.drop('storytypeid', 1)
X_z = X_z.drop('threequarterbathnbr', 1)
X_z = X_z.drop('typeconstructiontypeid', 1)
X_z = X_z.drop('yardbuildingsqft17', 1)
X_z = X_z.drop('yardbuildingsqft26', 1)
X_z = X_z.drop('numberofstories', 1)
X_z = X_z.drop('taxdelinquencyyear', 1)

In [36]:
X_keys = X_z.keys()
y_keys = y_z.keys()

print(X_z.shape)
z = np.abs(stats.zscore(X_z))
print(z)
X_z = X_z[(z < 3).all(axis=1)]
print(X_z.shape)

(167888, 30)
[[1.34600081e+00 2.70878297e+00 8.33929950e-01 ... 1.84251290e-01
  5.10859510e-01 2.07519425e-01]
 [1.34625345e+00 2.87439603e-01 3.57641279e-02 ... 3.70166726e-02
  6.22518246e-02 2.07519430e-01]
 [1.35035350e+00 7.86810032e-01 3.57641279e-02 ... 4.38997043e-01
  4.15910694e-01 2.07520101e-01]
 ...
 [3.26621404e-01 2.87439603e-01 9.05458205e-01 ... 4.89928423e-03
  1.27176055e-01 4.37624114e-02]
 [4.36180116e-01 2.11930825e-01 3.57641279e-02 ... 1.20625388e+00
  9.03800872e-01 1.33504532e+01]
 [5.44169923e-01 2.11930825e-01 3.57641279e-02 ... 1.22479642e-01
  7.17716609e-02 1.33504532e+01]]
(149522, 30)


In [37]:
joined_data_set = pd.merge(X_z, y_z)
X_z = joined_data_set[X_keys]
print(X_z.shape)
y_z = joined_data_set[y_keys]
print(y_z.shape)

y_z = y_z.drop('parcelid', 1)
X_z = X_z.drop('parcelid', 1)

X_z = preprocessing.scale(X_z)

(154248, 30)
(154248, 2)




In [38]:
testSize = 0.25
randomState = 100
X_train_z, X_test_z, y_train_z, y_test_z = train_test_split(X_z, y_z, test_size=testSize, random_state=randomState)

In [39]:
my_linreg.fit(X_train_z, y_train_z)
my_ridge.fit(X_train_z, y_train_z)
my_ridge_cv.fit(X_train_z, y_train_z)
my_lassolars.fit(X_train_z, y_train_z)
my_lassolars_cv.fit(X_train_z, y_train_z)

  y = column_or_1d(y, warn=True)


LassoLarsCV(copy_X=True, cv=10, eps=2.220446049250313e-16, fit_intercept=True,
            max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True,
            positive=False, precompute='auto', verbose=False)

In [40]:
my_linreg_pred = my_linreg.predict(X_test_z)
my_ridge_pred = my_ridge.predict(X_test_z)
my_ridge_cv_pred = my_ridge_cv.predict(X_test_z)
my_lassolars_pred = my_lassolars.predict(X_test_z)
my_lassolars_cv_pred = my_lassolars_cv.predict(X_test_z)

In [41]:
mse_linreg_z_c = metrics.mean_squared_error(y_test_z, my_linreg_pred)
mse_ridge_z_c = metrics.mean_squared_error(y_test_z, my_ridge_pred)
mse_ridge_cv_z_c = metrics.mean_squared_error(y_test_z, my_ridge_cv_pred)
mse_lassolars_z_c = metrics.mean_squared_error(y_test_z, my_lassolars_pred)
mse_lassolars_cv_z_c = metrics.mean_squared_error(y_test_z, my_lassolars_cv_pred)

rmse_linreg_z_c = np.sqrt(mse_linreg_z_c)
rmse_ridge_z_c = np.sqrt(mse_ridge_z_c)
rmse_ridge_cv_z_c = np.sqrt(mse_ridge_cv_z_c)
rmse_lassolars_z_c = np.sqrt(mse_lassolars_z_c)
rmse_lassolars_cv_z_c = np.sqrt(mse_lassolars_cv_z_c)

print("RSME Value Using Liear Regression: ", rmse_linreg_z_c)
print("RSME Value Using Ridge Regression: ", rmse_ridge_z_c)
print("RSME Value Using Ridge Regression Cross Validation: ", rmse_ridge_cv_z_c)
print("RSME Value Using Lasso Lars: ", rmse_lassolars_z_c)
print("RSME Value Using Lasso Lars Cross Validation: ", rmse_lassolars_cv_z_c)

RSME Value Using Liear Regression:  0.15722339192212734
RSME Value Using Ridge Regression:  0.15722185741150327
RSME Value Using Ridge Regression Cross Validation:  0.1572053480606298
RSME Value Using Lasso Lars:  0.15752374665604352
RSME Value Using Lasso Lars Cross Validation:  0.15719744554080614


In [42]:
# Removing less than 30% and no zscore
df_properties = pd.read_csv('./properties_2017.csv', low_memory=False)
print(df_properties.shape)

df_train =  pd.read_csv('./train_2017.csv')
print(df_train.shape)

joined_data_set = pd.merge(df_properties, df_train)
print(joined_data_set.shape)

X_z = joined_data_set[df_properties.keys()]
print(X_z.shape)

y_z = joined_data_set[df_train.keys()]
print(y_z.shape)

df_properties = pd.read_csv('./properties_2016.csv', low_memory=False)
print(df_properties.shape)

df_train =  pd.read_csv('./train_2016_v2.csv')
print(df_train.shape)

joined_data_set = pd.merge(df_properties, df_train)
print(joined_data_set.shape)

X_z_temp = joined_data_set[df_properties.keys()]
print(X_z_temp.shape)

y_z_temp = joined_data_set[df_train.keys()]
print(y_z_temp.shape)

X_z = X_z.append(X_z_temp)
X_z = X_z.fillna(0)
y_z = y_z.append(y_z_temp)
print(X_z.shape)
print(y_z.shape)

y_z = y_z.drop('transactiondate', 1)

X_z = X_z.drop('hashottuborspa', 1)
X_z = X_z.drop('propertycountylandusecode', 1)
X_z = X_z.drop('propertyzoningdesc', 1)
X_z = X_z.drop('fireplaceflag', 1)
X_z = X_z.drop('taxdelinquencyflag', 1)
X_z = X_z.drop('airconditioningtypeid', 1)
X_z = X_z.drop('architecturalstyletypeid', 1)
X_z = X_z.drop('basementsqft', 1)
X_z = X_z.drop('buildingclasstypeid', 1)
X_z = X_z.drop('decktypeid', 1)
X_z = X_z.drop('finishedfloor1squarefeet', 1)
X_z = X_z.drop('finishedsquarefeet13', 1)
X_z = X_z.drop('finishedsquarefeet15', 1)
X_z = X_z.drop('finishedsquarefeet50', 1)
X_z = X_z.drop('finishedsquarefeet6', 1)
X_z = X_z.drop('fireplacecnt', 1)
X_z = X_z.drop('poolcnt', 1)
X_z = X_z.drop('poolsizesum', 1)
X_z = X_z.drop('pooltypeid10', 1)
X_z = X_z.drop('pooltypeid2', 1)
X_z = X_z.drop('pooltypeid7', 1)
X_z = X_z.drop('storytypeid', 1)
X_z = X_z.drop('threequarterbathnbr', 1)
X_z = X_z.drop('typeconstructiontypeid', 1)
X_z = X_z.drop('yardbuildingsqft17', 1)
X_z = X_z.drop('yardbuildingsqft26', 1)
X_z = X_z.drop('numberofstories', 1)
X_z = X_z.drop('taxdelinquencyyear', 1)

print(X_z.shape)
print(y_z.shape)

(2985217, 58)
(77613, 3)
(77613, 60)
(77613, 58)
(77613, 3)
(2985217, 58)
(90275, 3)
(90275, 60)
(90275, 58)
(90275, 3)
(167888, 58)
(167888, 3)
(167888, 30)
(167888, 2)


In [43]:
joined_data_set = pd.merge(X_z, y_z)
X_z = joined_data_set[X_keys]
print(X_z.shape)
y_z = joined_data_set[y_keys]
print(y_z.shape)

y_z = y_z.drop('parcelid', 1)
X_z = X_z.drop('parcelid', 1)

X_z = preprocessing.scale(X_z)

(173262, 30)
(173262, 2)




In [44]:
testSize = 0.25
randomState = 100
X_train_z, X_test_z, y_train_z, y_test_z = train_test_split(X_z, y_z, test_size=testSize, random_state=randomState)

In [45]:
my_linreg.fit(X_train_z, y_train_z)
my_ridge.fit(X_train_z, y_train_z)
my_ridge_cv.fit(X_train_z, y_train_z)
my_lassolars.fit(X_train_z, y_train_z)
my_lassolars_cv.fit(X_train_z, y_train_z)

  y = column_or_1d(y, warn=True)


LassoLarsCV(copy_X=True, cv=10, eps=2.220446049250313e-16, fit_intercept=True,
            max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True,
            positive=False, precompute='auto', verbose=False)

In [46]:
my_linreg_pred = my_linreg.predict(X_test_z)
my_ridge_pred = my_ridge.predict(X_test_z)
my_ridge_cv_pred = my_ridge_cv.predict(X_test_z)
my_lassolars_pred = my_lassolars.predict(X_test_z)
my_lassolars_cv_pred = my_lassolars_cv.predict(X_test_z)

In [47]:
mse_linreg_z_c = metrics.mean_squared_error(y_test_z, my_linreg_pred)
mse_ridge_z_c = metrics.mean_squared_error(y_test_z, my_ridge_pred)
mse_ridge_cv_z_c = metrics.mean_squared_error(y_test_z, my_ridge_cv_pred)
mse_lassolars_z_c = metrics.mean_squared_error(y_test_z, my_lassolars_pred)
mse_lassolars_cv_z_c = metrics.mean_squared_error(y_test_z, my_lassolars_cv_pred)

rmse_linreg_z_c = np.sqrt(mse_linreg_z_c)
rmse_ridge_z_c = np.sqrt(mse_ridge_z_c)
rmse_ridge_cv_z_c = np.sqrt(mse_ridge_cv_z_c)
rmse_lassolars_z_c = np.sqrt(mse_lassolars_z_c)
rmse_lassolars_cv_z_c = np.sqrt(mse_lassolars_cv_z_c)

print("RSME Value Using Liear Regression: ", rmse_linreg_z_c)
print("RSME Value Using Ridge Regression: ", rmse_ridge_z_c)
print("RSME Value Using Ridge Regression Cross Validation: ", rmse_ridge_cv_z_c)
print("RSME Value Using Lasso Lars: ", rmse_lassolars_z_c)
print("RSME Value Using Lasso Lars Cross Validation: ", rmse_lassolars_cv_z_c)

RSME Value Using Liear Regression:  0.16516719500418073
RSME Value Using Ridge Regression:  0.16516484596171407
RSME Value Using Ridge Regression Cross Validation:  0.16517446983538486
RSME Value Using Lasso Lars:  0.1654748556924704
RSME Value Using Lasso Lars Cross Validation:  0.16517835468886924
