In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper
import warnings
warnings.filterwarnings('ignore')

_df = pd.read_csv('data/train.csv')
_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

incomplete observations:

 3   LotFrontage    1201 non-null   float64

 6   Alley          91 non-null     object 

 30  BsmtQual       1423 non-null   object 

 31  BsmtCond       1423 non-null   object 

 32  BsmtExposure   1422 non-null   object 

 33  BsmtFinType1   1423 non-null   object 

 35  BsmtFinType2   1422 non-null   object 

 42  Electrical     1459 non-null   object 

 57  FireplaceQu    770 non-null    object 

 58  GarageType     1379 non-null   object 

 59  GarageYrBlt    1379 non-null   float64

 60  GarageFinish   1379 non-null   object

 72  PoolQC         7 non-null      object 

 73  Fence          281 non-null    object 
 
 74  MiscFeature    54 non-null     object

In [90]:
def clean_data(df : pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Categorical features
    df['MSSubClass'] = df.MSSubClass.astype(str)
    df['MoSold'] = df['MoSold'].astype(str)

    # Numerical features
    df.LotFrontage.fillna(0, inplace=True)
    df.MasVnrArea.fillna(0, inplace=True)
    df.GarageYrBlt.fillna(-1, inplace=True)

    # Should be NA not NAN
    df.Alley.fillna('NA', inplace=True)
    df.BsmtQual.fillna('NA', inplace=True)
    df.BsmtCond.fillna('NA', inplace=True)
    df.BsmtExposure.fillna('NA', inplace=True)
    df.BsmtFinType1.fillna('NA', inplace=True)
    df.BsmtFinType2.fillna('NA', inplace=True)
    df.FireplaceQu.fillna('NA', inplace=True)
    df.GarageType.fillna('NA', inplace=True)
    df.GarageFinish.fillna('NA', inplace=True)
    df.PoolQC.fillna('NA', inplace=True)
    df.Fence.fillna('NA', inplace=True)
    df.MiscFeature.fillna('NA', inplace=True)

    df = df.drop(['Id', 'Utilities'], axis=1)

    return df

In [91]:
def get_numerical_features(df : pd.DataFrame) -> list:
    numeric_dtypes = ['int64', 'float64']
    numeric = []
    for i in df.columns:
        if df[i].dtype in numeric_dtypes:
            numeric.append(i)
    return numeric

In [92]:

def preprocess(df : pd.DataFrame) -> pd.DataFrame:
    numeric = get_numerical_features(df)
    num = df[numeric]

    mapper = DataFrameMapper([(num.columns, StandardScaler())]).fit(num)
    scaled_features = mapper.transform(num)
    num = pd.DataFrame(scaled_features, index=num.index, columns=num.columns)
    num.head()

    cat = df[df.columns[~df.columns.isin(numeric)]]
    cat = pd.get_dummies(cat)

    df = pd.concat([num, cat], axis=1)
    return df, numeric

In [93]:
def XY_split(df : pd.DataFrame, unscaled_df : pd.DataFrame = None) -> tuple:
    df = df.drop('SalePrice', axis=1)
    if type(unscaled_df) == pd.DataFrame:
        y = unscaled_df.SalePrice
    else: 
        y = df.SalePrice
    return df, y

In [94]:
df = _df.copy()
df = clean_data(df)
df, numeric = preprocess(df)
X, y = XY_split(df, _df)

In [95]:
# No null observations!
assert (df.isnull().sum().sort_values(ascending=False).head(20)[0] == 0)

In [96]:
y.head(3)

0    208500
1    181500
2    223500
Name: SalePrice, dtype: int64

In [97]:
X.head(3)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.212877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,-0.944591,...,0,0,0,1,0,0,0,0,1,0
1,0.645747,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,-0.641228,...,0,0,0,1,0,0,0,0,1,0
2,0.299451,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,-0.301643,...,0,0,0,1,0,0,0,0,1,0


In [98]:
# Building features
def has_features(df):
    df = df.copy()

    df['3SsnPorch'] = df['3SsnPorch'].apply(lambda x: 1 if x > 0 else 0)
    df['ScreenPorch'] = df['ScreenPorch'].apply(lambda x: 1 if x > 0 else 0)
    df['Pool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    df['Garage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    df['EnclosedPorch'] = df['EnclosedPorch'].apply(lambda x: 1 if x > 0 else 0)
    df['2ndFloor'] = df['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    df['WoodDeck'] = df['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)
    df['OpenPorch'] = df['OpenPorchSF'].apply(lambda x: 1 if x > 0 else 0)
    df['Bsmt'] = df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    df['Fireplace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

    return df

def interaction_terms(df, cols):
    # it's way too computationally expensive to do this on the whole dataset
    # so we'll just do it on the numerical columns
    df = df.copy()
    for index, i in enumerate(cols):
        print("Progress: {}/{}".format(index + 1, len(cols)), end='\r')
        for j in cols:
            if i != j:
                df[i + '*' + j] = df.apply(lambda row: row[i] * row[j], axis = 1)
    return df

In [99]:
X = has_features(X)
X = interaction_terms(X, [n for n in numeric if n != 'SalePrice'])
X['MSSubClass_150'] = 0 # There's 150 in the test set, but not the train set.
X.head(3)

Progress: 34/34

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,YrSold*GarageCars,YrSold*GarageArea,YrSold*WoodDeckSF,YrSold*OpenPorchSF,YrSold*EnclosedPorch,YrSold*3SsnPorch,YrSold*ScreenPorch,YrSold*PoolArea,YrSold*MiscVal,MSSubClass_150
0,0.212877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,-0.944591,...,0.04326,0.048711,-0.104385,0.030046,0.0,0.0,0.0,-0.009533,-0.012169,0
1,0.645747,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,-0.641228,...,-0.191536,0.037315,-0.999197,0.432862,-0.0,-0.0,-0.0,0.042207,0.053879,0
2,0.299451,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,-0.301643,...,0.04326,0.087669,-0.104385,-0.009765,0.0,0.0,0.0,-0.009533,-0.012169,0


In [100]:
# The multiple of two mean-zero variance-one features is also mean-zero variance-one
X['YrSold*GarageYrBlt'].describe()

count    1460.000000
mean       -0.011134
std         0.974549
min        -6.778807
25%        -0.185936
50%         0.028080
75%         0.212920
max         5.635189
Name: YrSold*GarageYrBlt, dtype: float64

In [101]:
# LASSO time!
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
# define grid for alpha
alpha_grid = {'alpha': np.logspace(1, 4, 20)}
lasso_grid = GridSearchCV(Lasso(),alpha_grid,cv=5,return_train_score=True, verbose=3)
best_lasso=lasso_grid.fit(X,y)
print("Best alpha: ",best_lasso.best_estimator_.get_params()['alpha'])
print("Train set score: {:.2f}".format(lasso_grid.score(X,y)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ....alpha=10.0;, score=(train=0.978, test=0.820) total time=   0.8s
[CV 2/5] END ....alpha=10.0;, score=(train=0.979, test=0.514) total time=   0.7s
[CV 3/5] END ....alpha=10.0;, score=(train=0.981, test=0.753) total time=   0.9s
[CV 4/5] END ....alpha=10.0;, score=(train=0.980, test=0.831) total time=   0.8s
[CV 5/5] END ....alpha=10.0;, score=(train=0.978, test=0.697) total time=   0.8s
[CV 1/5] END alpha=14.38449888287663;, score=(train=0.977, test=0.835) total time=   0.8s
[CV 2/5] END alpha=14.38449888287663;, score=(train=0.979, test=0.529) total time=   0.7s
[CV 3/5] END alpha=14.38449888287663;, score=(train=0.979, test=0.809) total time=   0.9s
[CV 4/5] END alpha=14.38449888287663;, score=(train=0.979, test=0.841) total time=   0.8s
[CV 5/5] END alpha=14.38449888287663;, score=(train=0.977, test=0.680) total time=   0.6s
[CV 1/5] END alpha=20.6913808111479;, score=(train=0.976, test=0.849) total time=  

In [102]:
# Testing time!
_test_df = pd.read_csv('data/test.csv')
_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [103]:
test_df = _test_df.copy()
test_df = clean_data(test_df)
test_df, test_numeric = preprocess(test_df)

test_X = has_features(test_df)
test_X[list(set(X.columns) - set(test_X.columns))] = 0
test_X = interaction_terms(test_X, [n for n in test_numeric if n != 'SalePrice'])
test_X.fillna(0, inplace=True)
test_X.head(3)

Progress: 34/34

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,LotFrontage*2ndFlrSF,YearRemodAdd*GarageYrBlt,YrSold*HalfBath,OverallCond*TotalBsmtSF,YearBuilt*1stFlrSF,GarageArea*BsmtHalfBath,OverallQual*GarageArea,HalfBath*GarageArea,BsmtHalfBath*TotalBsmtSF,GarageYrBlt*OverallQual
0,0.684849,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.563316,0.063273,0.517171,-0.650396,...,-0.530931,-0.214317,-1.287211,-0.148557,0.223169,-0.306282,-0.890459,-0.890386,0.095765,-0.150039
1,0.715852,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.047057,1.063027,-0.2978,-0.339262,...,-0.554966,-0.234513,2.121211,0.25606,-0.190519,0.191426,0.040662,-0.917046,-0.165066,-0.010593
2,0.498831,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.563316,0.772989,-0.2978,-0.954667,...,0.444929,0.190388,2.121211,0.132703,-0.484629,-0.010991,-0.031956,0.052656,0.068923,-0.210685


In [104]:
assert (test_X.isnull().sum().sort_values(ascending=False).head(20)[0] == 0)

In [109]:
pred = best_lasso.predict(test_X)

In [108]:
SAMPLE = pd.read_csv('data/sample_submission.csv')
ids = SAMPLE.Id


In [110]:
pred = pd.DataFrame(pred, index=ids, columns=['SalePrice'])
pred.to_csv('submission.csv')

In [None]:
# That's all!