In [1]:
#imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [2]:
# Load the pre-cleaned train and test frames.
# Only the empty string is parsed as missing; pandas' default NA markers are
# switched off (presumably because strings such as 'NA' are real category
# labels in this data -- TODO confirm against the cleaning step).
csv_options = {'keep_default_na': False, 'na_values': ['']}
train = pd.read_csv('../datasets/cleaned_train.csv', **csv_options)
test = pd.read_csv('../datasets/cleaned_test.csv', **csv_options)

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,0.0,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
def rmse(model, X, y):
    """Return the root-mean-squared error of ``model``'s predictions on ``X``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature data passed to ``model.predict``.
    y : true target values the predictions are compared against.

    Returns
    -------
    The square root of ``mean_squared_error(y, model.predict(X))``.
    """
    return mean_squared_error(y, model.predict(X)) ** 0.5

In [5]:
# Labels of the numeric columns of `train`.
# NOTE(review): `_get_numeric_data` is a private pandas method; the public
# `select_dtypes` is the usual replacement -- confirm it selects the same
# columns before swapping, since later cells rely on this exact set.
numeric_cols = train._get_numeric_data().columns

In [6]:
# Pearson correlation of every numeric column with the target.
# Selecting `numeric_cols` first keeps this working on pandas >= 2.0, where
# `DataFrame.corr()` no longer drops non-numeric columns on its own and
# raises on the string-valued ones.
train[numeric_cols].corr()['SalePrice']

Id                -0.051398
PID               -0.255052
MS SubClass       -0.087335
Lot Frontage       0.181456
Lot Area           0.296566
Overall Qual       0.800207
Overall Cond      -0.097019
Year Built         0.571849
Year Remod/Add     0.550370
Mas Vnr Area       0.503579
BsmtFin SF 1       0.423856
BsmtFin SF 2       0.016432
Bsmt Unf SF        0.190210
Total Bsmt SF      0.629303
1st Flr SF         0.618486
2nd Flr SF         0.248452
Low Qual Fin SF   -0.041594
Gr Liv Area        0.697038
Bsmt Full Bath     0.283332
Bsmt Half Bath    -0.045290
Full Bath          0.537969
Half Bath          0.283001
Bedroom AbvGr      0.137067
Kitchen AbvGr     -0.125444
TotRms AbvGrd      0.504014
Fireplaces         0.471093
Garage Cars        0.648220
Garage Area        0.650270
Wood Deck SF       0.326490
Open Porch SF      0.333476
Enclosed Porch    -0.135656
3Ssn Porch         0.048732
Screen Porch       0.134581
Pool Area          0.023106
Misc Val          -0.007375
Mo Sold            0

In [7]:
# Keep the numeric columns whose correlation with SalePrice exceeds 0.4.
# Only positive correlations pass (no absolute value is taken), and SalePrice
# itself is kept because its self-correlation is 1.
# The mask is applied to the correlation Series and the labels are read back
# from its index, so columns are selected by label instead of relying on
# `numeric_cols` and the `corr()` output lining up position by position.
# Selecting `numeric_cols` first also keeps `corr()` from failing on string
# columns under pandas >= 2.0.
saleprice_corr = train[numeric_cols].corr()['SalePrice']
high_corr_cols = saleprice_corr[saleprice_corr > .4].index

In [8]:
# Show which columns passed the correlation threshold.
high_corr_cols

Index(['Overall Qual', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'BsmtFin SF 1', 'Total Bsmt SF', '1st Flr SF', 'Gr Liv Area',
       'Full Bath', 'TotRms AbvGrd', 'Fireplaces', 'Garage Cars',
       'Garage Area', 'SalePrice'],
      dtype='object')

In [9]:
# Labels of the non-numeric columns, in their original order.
obj_cols = [name for name in train.columns if name not in numeric_cols]

# Categorical columns that are one-hot encoded in this notebook.
problem_cols = ['Neighborhood', 'Condition 1', 'Condition 2', 'MS Zoning']

# Dummy-encode those columns independently for train and test ...
train_dummies = pd.get_dummies(train, columns=problem_cols)
test_dummies = pd.get_dummies(test, columns=problem_cols)

# ... then add, as all-zero columns, every train column the test frame lacks.
# NOTE(review): this also zero-fills any non-dummy column that test does not
# have (e.g. the target, if test carries no SalePrice) -- confirm intended.
test_column_set = set(test_dummies.columns)
missing_from_test = [name for name in train_dummies.columns if name not in test_column_set]
test_dummies[missing_from_test] = 0
train_dummies.head(3)

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Condition 2_RRAe,Condition 2_RRAn,Condition 2_RRNn,MS Zoning_A (agr),MS Zoning_C (all),MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,MS Zoning_RM
0,109,533352170,60,0.0,13517,Pave,,IR1,Lvl,AllPub,...,0,0,0,0,0,0,0,0,1,0
1,544,531379050,60,43.0,11492,Pave,,IR1,Lvl,AllPub,...,0,0,0,0,0,0,0,0,1,0
2,153,535304180,20,68.0,7922,Pave,,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,0,1,0


In [10]:
# Preview the test dummies reordered to the training column layout.
test_dummies[train_dummies.columns].head(3)

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Condition 2_RRAe,Condition 2_RRAn,Condition 2_RRNn,MS Zoning_A (agr),MS Zoning_C (all),MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,MS Zoning_RM
0,2658,902301120,190,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1
1,2718,905108090,90,0.0,9662,Pave,,IR1,Lvl,AllPub,...,0,0,0,0,0,0,0,0,1,0
2,2414,528218130,60,58.0,17104,Pave,,IR1,Lvl,AllPub,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Missing-value count per column, sorted ascending (columns with the most
# gaps appear last).
train.isna().sum().sort_values()

Id               0
Kitchen AbvGr    0
Bedroom AbvGr    0
Half Bath        0
Full Bath        0
                ..
Garage Qual      1
Garage Area      1
Garage Cars      1
Garage Finish    1
Bsmt Unf SF      1
Length: 80, dtype: int64

In [12]:
# Display the non-numeric columns of the training frame.
train[obj_cols]

Unnamed: 0,MS Zoning,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,...,Fireplace Qu,Garage Type,Garage Finish,Garage Qual,Garage Cond,Paved Drive,Pool QC,Fence,Misc Feature,Sale Type
0,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,...,,Attchd,RFn,TA,TA,Y,,,,WD
1,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,SawyerW,Norm,...,TA,Attchd,RFn,TA,TA,Y,,,,WD
2,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,,Detchd,Unf,TA,TA,Y,,,,WD
3,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Timber,Norm,...,,BuiltIn,Fin,TA,TA,Y,,,,WD
4,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,SawyerW,Norm,...,,Detchd,Unf,TA,TA,N,,,,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,RL,Pave,,IR1,HLS,AllPub,Inside,Gtl,Timber,Norm,...,Gd,Attchd,Fin,TA,TA,Y,,,,WD
2047,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Edwards,Norm,...,,Detchd,Unf,TA,TA,Y,,,,WD
2048,RL,Pave,,Reg,Bnk,AllPub,Inside,Gtl,Crawfor,Norm,...,TA,Detchd,Unf,Fa,Fa,Y,,,,WD
2049,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,...,Gd,Attchd,Unf,TA,TA,Y,,,,WD


In [13]:
# One-hot encoder for the categorical columns, fitted on the full training
# frame. drop='first' omits the first category of each column; sparse=False
# makes transform() return a dense array.
# NOTE(review): `sparse=` and `get_feature_names` are the older scikit-learn
# spellings (later releases use `sparse_output=` / `get_feature_names_out`);
# check the installed version before upgrading.
ohe = OneHotEncoder(sparse=False, drop='first')
ohe.fit(train[problem_cols])

train_encoded = ohe.transform(train[problem_cols])
train_ohe = pd.DataFrame(train_encoded, columns=ohe.get_feature_names(problem_cols))

train_ohe.head(3)

Unnamed: 0,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_Greens,Neighborhood_GrnHill,...,Condition 2_PosN,Condition 2_RRAe,Condition 2_RRAn,Condition 2_RRNn,MS Zoning_C (all),MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,MS Zoning_RM
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Encode the test categoricals with the encoder fitted on train.
test_encoded = ohe.transform(test[problem_cols])
test_ohe = pd.DataFrame(test_encoded, columns=ohe.get_feature_names(problem_cols))

# Original frames with their encoded columns appended side by side.
full_train = pd.concat([train, train_ohe], axis=1)
full_test = pd.concat([test, test_ohe], axis=1)

# Model inputs: the highly correlated numeric columns (target removed) plus
# the raw categorical columns, which are encoded again after the split.
# (An earlier variant used every numeric column except Id, PID and SalePrice.)
num_cols = high_corr_cols.drop('SalePrice')
features = [*num_cols, *problem_cols]

In [15]:
# Feature matrix and target. fillna(0) applies to every selected column,
# including the categorical ones.
X = train[features].fillna(0)
y = train['SalePrice']

# Hold out part of the training data for validation (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Re-create the encoder and fit it on the training split only.
# handle_unknown='ignore' lets transform() accept categories it has not seen.
ohe = OneHotEncoder(sparse = False, handle_unknown='ignore')
ohe.fit(X_train[problem_cols])
ohe_columns = ohe.get_feature_names(problem_cols)

# Encoded categorical columns for each split. Both frames get a fresh default
# index rather than the index of the split they came from.
X_train_ohe = pd.DataFrame(ohe.transform(X_train[problem_cols]), columns=ohe_columns)
X_test_ohe = pd.DataFrame(ohe.transform(X_test[problem_cols]), columns=ohe_columns)

In [16]:
# Swap the raw categorical columns for their encoded versions. The split
# frames are re-indexed to 0..n-1 first so they line up row by row with the
# encoded frames, which carry a default index.
X_train_full = pd.concat(
    [X_train.drop(columns=problem_cols).reset_index(drop=True), X_train_ohe],
    axis=1,
)
X_test_full = pd.concat(
    [X_test.drop(columns=problem_cols).reset_index(drop=True), X_test_ohe],
    axis=1,
)

In [17]:
# Standardise the features, then fit a cross-validated Lasso.
# The estimator step is named 'lr'; a later cell reads its coefficients via
# named_steps['lr'].
pipe = Pipeline(steps=[('ss', StandardScaler()), ('lr', LassoCV(verbose=True))])
pipe.fit(X_train_full, y_train)

# Pipeline score on the training split and on the held-out split.
pipe.score(X_train_full, y_train), pipe.score(X_test_full, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


(0.8387982660244447, 0.8624339581655631)

In [18]:
# RMSE of the fitted pipeline on the training split and on the held-out split.
rmse(pipe, X_train_full, y_train), rmse(pipe, X_test_full, y_test)

(31929.98006755344, 29063.01599824816)

In [19]:
# Encode the test-frame categoricals with the encoder fitted on the training split.
test_ohe = pd.DataFrame(ohe.transform(test[problem_cols]))
test_ohe.columns = ohe.get_feature_names(problem_cols)

In [20]:
# Test-frame model inputs: the numeric features joined with the encoded
# categoricals, with any remaining missing values set to 0.
test_numeric = test[features].drop(columns=problem_cols)
test_subset = pd.concat([test_numeric, test_ohe], axis=1).fillna(0)

In [21]:
# Predict SalePrice for the test frame with the fitted pipeline.
preds = pipe.predict(test_subset)

In [22]:
# Coefficients of the fitted 'lr' step, labelled by feature name and listed
# from most negative to most positive.
lasso_coefs = pipe.named_steps['lr'].coef_
coef_df = pd.DataFrame({'coef val': lasso_coefs}, index=X_train_full.columns)
coef_df.sort_values('coef val')

Unnamed: 0,coef val
Neighborhood_Edwards,-2159.335695
MS Zoning_RM,-2066.842978
Neighborhood_BrDale,-2058.596618
Neighborhood_NPkVill,-1534.549500
Neighborhood_NWAmes,-1052.352946
...,...
BsmtFin SF 1,7732.534217
Neighborhood_StoneBr,9611.298232
Neighborhood_NridgHt,12152.419751
Gr Liv Area,15599.796560


In [23]:
# Number of features whose coefficient is exactly zero.
(coef_df['coef val'] == 0).sum()

20

In [24]:
# Rebuild the encoded test inputs and predict with the fitted pipeline.
encoded_names = ohe.get_feature_names(problem_cols)
test_ohe = pd.DataFrame(ohe.transform(test[problem_cols]), columns=encoded_names)

test_numeric = test[features].drop(columns=problem_cols)
test_subset = pd.concat([test_numeric, test_ohe], axis=1).fillna(0)

preds = pipe.predict(test_subset)

In [25]:
# Two-column frame for export: the test Ids next to the predicted prices.
sale_price = pd.DataFrame({'SalePrice': pipe.predict(test_subset)})
preds = pd.concat([test[['Id']], sale_price], axis=1)

In [26]:
# Write the predictions without the row index; all columns are written by default.
preds.to_csv('../datasets/submission_lasso.csv', index=False)

---------------------

In [27]:
# Same two-step pipeline with a cross-validated Ridge as the estimator.
# This rebinds `pipe`, so the cells below operate on the Ridge model.
pipe = Pipeline(steps=[('ss', StandardScaler()), ('lr', RidgeCV())])
pipe.fit(X_train_full, y_train)

# Pipeline score on the training split and on the held-out split.
pipe.score(X_train_full, y_train), pipe.score(X_test_full, y_test)

(0.8405682138237353, 0.8610419257945431)

In [28]:
# RMSE of the fitted pipeline on the training split and on the held-out split.
rmse(pipe, X_train_full, y_train), rmse(pipe, X_test_full, y_test)

(31754.20533500198, 29209.69038004144)

In [29]:
# Encode the test-frame categoricals with the encoder fitted on the training split.
test_ohe = pd.DataFrame(ohe.transform(test[problem_cols]))
test_ohe.columns = ohe.get_feature_names(problem_cols)

In [30]:
# Test-frame model inputs: the numeric features joined with the encoded
# categoricals, with any remaining missing values set to 0.
test_numeric = test[features].drop(columns=problem_cols)
test_subset = pd.concat([test_numeric, test_ohe], axis=1).fillna(0)

In [31]:
# Predict SalePrice for the test frame with the fitted pipeline.
preds = pipe.predict(test_subset)

In [32]:
# Coefficients of the fitted 'lr' step, labelled by feature name and listed
# from most negative to most positive.
ridge_coefs = pipe.named_steps['lr'].coef_
coef_df = pd.DataFrame({'coef val': ridge_coefs}, index=X_train_full.columns)
coef_df.sort_values('coef val')

Unnamed: 0,coef val
Neighborhood_Edwards,-4615.600488
Neighborhood_NWAmes,-3336.622281
Neighborhood_BrDale,-3106.190543
Neighborhood_NAmes,-2954.250222
Neighborhood_NPkVill,-2792.244736
...,...
BsmtFin SF 1,8199.071394
Neighborhood_StoneBr,9085.497319
Neighborhood_NridgHt,10932.244910
Gr Liv Area,15117.789287


In [33]:
# Number of features whose coefficient is exactly zero.
(coef_df['coef val'] == 0).sum()

0

In [34]:
# Rebuild the encoded test inputs and predict with the fitted pipeline.
encoded_names = ohe.get_feature_names(problem_cols)
test_ohe = pd.DataFrame(ohe.transform(test[problem_cols]), columns=encoded_names)

test_numeric = test[features].drop(columns=problem_cols)
test_subset = pd.concat([test_numeric, test_ohe], axis=1).fillna(0)

preds = pipe.predict(test_subset)

In [35]:
# Two-column frame for export: the test Ids next to the predicted prices.
sale_price = pd.DataFrame({'SalePrice': pipe.predict(test_subset)})
preds = pd.concat([test[['Id']], sale_price], axis=1)

In [36]:
# Write the predictions without the row index; all columns are written by default.
preds.to_csv('../datasets/submission_ridge.csv', index=False)