In [1]:
import pandas as pd

DATA_FILENAME = '../../data/train.csv'

# Read data

- Read data

In [3]:
df_raw = pd.read_csv(DATA_FILENAME)

- Filter data

In [5]:
# Target column the model learns to predict.
label_col = 'SalePrice'
# Kitchen quality column; named once so later cells can refer to it.
kitchen_quality_column = 'KitchenQual'
# Input columns kept from the raw data.
useful_features = [
    'Foundation',
    kitchen_quality_column,
    'TotRmsAbvGrd',
    'WoodDeckSF',
    'YrSold',
    '1stFlrSF',
]

In [6]:
# Keep only the selected feature columns plus the label column.
selected_columns = [*useful_features, label_col]
df = df_raw[selected_columns]
df.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
0,PConc,Gd,8,0,2008,856,208500
1,CBlock,TA,6,298,2007,1262,181500
2,PConc,Gd,6,0,2008,920,223500
3,BrkTil,Gd,7,0,2006,961,140000
4,PConc,Gd,9,192,2008,1145,250000


# Preprocess data

In [12]:
# Numeric columns (these are the ones passed to the scaler).
continuous_features = [
    'TotRmsAbvGrd',
    'YrSold',
    '1stFlrSF',
    'WoodDeckSF',
]
# Categorical columns, and how each of them is encoded.
discrete_features = [
    'Foundation',
    kitchen_quality_column,
]
features_for_one_hot_encoding = ['Foundation']          # no natural order
features_for_ordinal_encoding = [kitchen_quality_column]  # ordered quality levels

- check the values and the count of a given feature

In [8]:
df['Foundation'].value_counts()

Foundation
PConc     647
CBlock    634
BrkTil    146
Slab       24
Stone       6
Wood        3
Name: count, dtype: int64

In [63]:
df['TotRmsAbvGrd'].value_counts()

TotRmsAbvGrd
6     402
7     329
5     275
8     187
4      97
9      75
10     47
11     18
3      17
12     11
2       1
14      1
Name: count, dtype: int64

In [64]:
df[kitchen_quality_column].value_counts()

KitchenQual
TA    735
Gd    586
Ex    100
Fa     39
Name: count, dtype: int64

In [10]:
df['1stFlrSF'].value_counts()

1stFlrSF
864     25
1040    16
912     14
894     12
848     12
        ..
1509     1
2515     1
605      1
3138     1
1256     1
Name: count, Length: 753, dtype: int64

In [11]:
df['WoodDeckSF'].value_counts()

WoodDeckSF
0      761
192     38
100     36
144     33
120     31
      ... 
326      1
179      1
103      1
176      1
736      1
Name: count, Length: 274, dtype: int64

## Encoding

### One hot encoding

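- Encoding using `pd.get_dummies` (fine for exploration and modeling, but do not use it in production: it does not remember the categories it saw, so new data can produce a different set of columns)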

In [13]:
df_one_hot_code = pd.get_dummies(df[features_for_one_hot_encoding])
df_one_hot_code.head()

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood
0,False,False,True,False,False,False
1,False,True,False,False,False,False
2,False,False,True,False,False,False
3,True,False,False,False,False,False
4,False,False,True,False,False,False


- With a `OneHotEncoder` object (this is what should be used in production)

In [43]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse_output=False)

In [44]:
one_hot_encoder.fit(df[features_for_one_hot_encoding])

In [45]:
one_hot_encoder.categories_

[array(['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'], dtype=object)]

In [46]:
one_hot_encoder.transform(df[features_for_one_hot_encoding])

array([[0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

### Ordinal encoding

In [18]:
# KitchenQual codes, from best to worst:
#   Ex = excellent, Gd = good, TA = typical, Fa = fair
# These notes are comments rather than a trailing string literal so that
# value_counts() stays the last expression of the cell and is what gets displayed.
df[kitchen_quality_column].value_counts()


KitchenQual
TA    735
Gd    586
Ex    100
Fa     39
Name: count, dtype: int64

In [26]:
features_for_ordinal_encoding

['KitchenQual']

In [27]:
# KitchenQual is an ordered quality scale (best to worst: Ex = excellent,
# Gd = good, TA = typical, Fa = fair). Each code is mapped to an integer
# where a HIGHER number means BETTER quality.
kitchen_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}
# Series.map does the dictionary lookup for the whole column at once.
kitchen_quality_encoded = df[kitchen_quality_column].map(kitchen_quality_dict)
# .map() produces NaN for any value that is not a key of the dict; fail loudly
# instead of letting an unknown or missing code slip into the features.
assert kitchen_quality_encoded.notna().all(), 'unmapped KitchenQual value'
kitchen_quality_encoded.head()

0    3
1    2
2    3
3    3
4    3
Name: KitchenQual, dtype: int64

### Examples of using apply

In [35]:
kitchen_quality_dict['Ex']

4

In [32]:
kitchen_quality_encoded.head(15)

0     3
1     2
2     3
3     3
4     3
5     2
6     3
7     2
8     2
9     2
10    2
11    4
12    2
13    3
14    2
Name: KitchenQual, dtype: int64

In [34]:
kitchen_quality_encoded.apply(lambda x: x * 2).head(15)

0     6
1     4
2     6
3     6
4     6
5     4
6     6
7     4
8     4
9     4
10    4
11    8
12    4
13    6
14    4
Name: KitchenQual, dtype: int64

In [23]:
df[kitchen_quality_column]

0       Gd
1       TA
2       Gd
3       Gd
4       Gd
        ..
1455    TA
1456    TA
1457    Gd
1458    Gd
1459    TA
Name: KitchenQual, Length: 1460, dtype: object

In [37]:
df[kitchen_quality_column].apply(lambda x: x.lower()).head(15)

0     gd
1     ta
2     gd
3     gd
4     gd
5     ta
6     gd
7     ta
8     ta
9     ta
10    ta
11    ex
12    ta
13    gd
14    ta
Name: KitchenQual, dtype: object

## Scaling

In [52]:
continuous_features

['TotRmsAbvGrd', 'YrSold', '1stFlrSF', 'WoodDeckSF']

In [51]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous columns with scikit-learn's StandardScaler.
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row of df, while train_test_split
# only happens later in the notebook, so the test rows contribute to the fitted
# statistics (data leakage). Consider fitting on the training rows only.
scaler.fit(df[continuous_features])

In [55]:
scaled_data = scaler.transform(df[continuous_features])
scaled_data

array([[ 0.91220977,  0.13877749, -0.79343379, -0.75217584],
       [-0.31868327, -0.61443862,  0.25714043,  1.62619479],
       [-0.31868327,  0.13877749, -0.62782603, -0.75217584],
       ...,
       [ 1.52765629,  1.64520971,  0.06565646, -0.75217584],
       [-0.93412978,  1.64520971, -0.21898188,  2.16891024],
       [-0.31868327,  0.13877749,  0.2416147 ,  5.12192075]])

In [56]:
type(scaled_data)

numpy.ndarray

In [54]:
df[continuous_features].head()

Unnamed: 0,TotRmsAbvGrd,YrSold,1stFlrSF,WoodDeckSF
0,8,2008,856,0
1,6,2007,1262,298
2,6,2008,920,0
3,7,2006,961,0
4,9,2008,1145,192


- Convert to a dataframe so that we can merge it with the others (`df_one_hot_code`, `kitchen_quality_encoded`)

In [58]:
# scaler.transform returns a plain NumPy array, so the row labels are lost.
# Reuse df's index so that the later column-wise pd.concat with the other
# encoded pieces lines up row by row even if df does not have a default
# 0..n-1 index (e.g. after filtering rows).
scaled_df = pd.DataFrame(data=scaled_data, columns=continuous_features, index=df.index)
scaled_df.head()

Unnamed: 0,TotRmsAbvGrd,YrSold,1stFlrSF,WoodDeckSF
0,0.91221,0.138777,-0.793434,-0.752176
1,-0.318683,-0.614439,0.25714,1.626195
2,-0.318683,0.138777,-0.627826,-0.752176
3,0.296763,-1.367655,-0.521734,-0.752176
4,1.527656,0.138777,-0.045611,0.780197


## Concat all the dataframes

In [63]:
# Put the encoded and scaled pieces side by side; rows are matched on the index.
feature_frames = [df_one_hot_code, kitchen_quality_encoded, scaled_df]
X = pd.concat(feature_frames, axis='columns')
X

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual,TotRmsAbvGrd,YrSold,1stFlrSF,WoodDeckSF
0,False,False,True,False,False,False,3,0.912210,0.138777,-0.793434,-0.752176
1,False,True,False,False,False,False,2,-0.318683,-0.614439,0.257140,1.626195
2,False,False,True,False,False,False,3,-0.318683,0.138777,-0.627826,-0.752176
3,True,False,False,False,False,False,3,0.296763,-1.367655,-0.521734,-0.752176
4,False,False,True,False,False,False,3,1.527656,0.138777,-0.045611,0.780197
...,...,...,...,...,...,...,...,...,...,...,...
1455,False,False,True,False,False,False,2,0.296763,-0.614439,-0.542435,-0.752176
1456,False,True,False,False,False,False,2,0.296763,1.645210,2.355701,2.033231
1457,False,False,False,False,True,False,3,1.527656,1.645210,0.065656,-0.752176
1458,False,True,False,False,False,False,3,-0.934130,1.645210,-0.218982,2.168910


In [64]:
X.shape[0]

1460

In [61]:
print(df_one_hot_code.shape, kitchen_quality_encoded.shape, scaled_df.shape)

(1460, 6) (1460,) (1460, 4)


In [67]:
assert X.shape[0] == df_one_hot_code.shape[0]

# Training

- Create X and y

In [68]:
# X was assembled in the concat step; only the label still has to be extracted.
y = df.loc[:, label_col]

In [70]:
display(y.head())
display(X.head())

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual,TotRmsAbvGrd,YrSold,1stFlrSF,WoodDeckSF
0,False,False,True,False,False,False,3,0.91221,0.138777,-0.793434,-0.752176
1,False,True,False,False,False,False,2,-0.318683,-0.614439,0.25714,1.626195
2,False,False,True,False,False,False,3,-0.318683,0.138777,-0.627826,-0.752176
3,True,False,False,False,False,False,3,0.296763,-1.367655,-0.521734,-0.752176
4,False,False,True,False,False,False,3,1.527656,0.138777,-0.045611,0.780197


- Split data in X_train, y_train, X_test, y_test https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [102]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.33
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [88]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(978, 11) (978,)
(482, 11) (482,)


In [89]:
X_train

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual,TotRmsAbvGrd,YrSold,1stFlrSF,WoodDeckSF
615,False,False,True,False,False,False,3,-0.318683,1.645210,-0.281085,0.205557
613,False,False,True,False,False,False,2,-0.318683,-0.614439,-0.110302,-0.752176
1303,False,False,True,False,False,False,3,0.296763,-1.367655,1.173158,0.907895
486,False,True,False,False,False,False,2,-0.318683,-0.614439,-0.231920,-0.752176
561,False,True,False,False,False,False,2,-0.318683,-1.367655,0.585768,1.163290
...,...,...,...,...,...,...,...,...,...,...,...
1095,False,False,True,False,False,False,3,-0.318683,-0.614439,0.391697,-0.752176
1130,True,False,False,False,False,False,3,0.296763,0.891994,0.427923,2.687682
1294,False,True,False,False,False,False,2,-0.934130,-1.367655,-0.772733,-0.752176
860,True,False,False,False,False,False,3,0.296763,-0.614439,-0.648527,-0.752176


In [91]:
X_test

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual,TotRmsAbvGrd,YrSold,1stFlrSF,WoodDeckSF
892,False,True,False,False,False,False,2,-0.318683,-1.367655,-0.244858,0.780197
1105,False,False,True,False,False,False,3,1.527656,1.645210,0.872994,0.732311
413,False,True,False,False,False,False,2,-0.934130,1.645210,-0.348363,-0.752176
522,False,True,False,False,False,False,2,0.296763,-1.367655,-0.410466,-0.752176
1036,False,False,True,False,False,False,4,-0.318683,0.891994,1.183509,1.067517
...,...,...,...,...,...,...,...,...,...,...,...
1010,False,True,False,False,False,False,2,0.296763,0.138777,-1.145350,-0.752176
390,True,False,False,False,False,False,2,0.296763,0.138777,-0.521734,-0.752176
1409,False,True,False,False,False,False,2,0.296763,0.138777,0.189862,2.152948
847,False,True,False,False,False,False,2,-0.934130,0.891994,-0.772733,-0.752176


- Instantiate a `LinearRegression` model https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [92]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

- Train the model (model.fit ...)

In [93]:
model.fit(X_train, y_train)

# Evaluation

- make predictions (`y_pred = model.predict(X_test)`)

In [95]:
y_pred = model.predict(X_test)
y_pred[:5]

array([143355.9868003 , 272539.03665861, 119252.47546508, 141642.14917919,
       283321.03297773])

In [99]:
y_pred

array([143355.9868003 , 272539.03665861, 119252.47546508, 141642.14917919,
       283321.03297773,  77722.71793735, 202702.99663352, 136684.76149811,
        71018.96652635, 180881.48957994, 110857.56977846, 112343.19681511,
       123816.9423292 , 238511.9596971 , 170724.34579428, 169920.13197755,
       205178.04690791, 136444.68217435, 117363.23268544, 242407.05608748,
       197743.86422207, 184711.99844479, 200465.84473672, 101574.29800425,
       219766.00373534, 194063.5940105 , 219673.46810135,  97661.29276251,
       164260.02612991, 201963.11918855, 144527.4862615 , 239798.973457  ,
       327377.63410467, 101612.63280336, 255432.42993124, 148505.83737926,
       150062.82500428, 218236.7042917 , 276818.80620204, 123834.066193  ,
       151138.78038798, 234179.48267317, 113766.55357129, 318651.86260854,
       115831.75078221, 111418.12732043, 111285.74539974, 126716.51636503,
       348657.25292188, 134992.61178669, 128509.97710083, 229485.53549501,
       117392.5348062 , 3

In [97]:
print(len(y_pred), len(X_test))

482 482


- compute performance using https://scikit-learn.org/stable/modules/generated/sklearn.metrics.root_mean_squared_error.html#sklearn.metrics.root_mean_squared_error

In [98]:
from sklearn.metrics import root_mean_squared_error

root_mean_squared_error(y_test, y_pred)

46821.542160316065

In [None]:
def computeRmse():
    """Placeholder with no behaviour yet; calling it simply returns None."""
    return None



In [105]:
def compute_rmse():
    """Demo: defining this function works, but calling it raises NameError.

    The body refers to the name ``b``, which is not defined. Python only
    looks the name up when the line is executed, so the error appears at
    call time rather than at definition time.
    """
    result = 1 + 2
    result = b  # undefined name: this line raises NameError when it runs
    return result

In [106]:
# This call is expected to fail (the function body uses an undefined name).
# Catch the error and show it, so that "Restart & Run All" can continue past
# this cell instead of stopping here.
try:
    compute_rmse()
except NameError as error:
    print(f'{type(error).__name__}: {error}')

NameError: name 'b' is not defined

In [107]:
def compute_rmse_2():
    """Demo: an undefined name inside a branch that never runs is harmless.

    Returns:
        3, because the assignment from the undefined name ``b`` sits behind
        ``if False`` and is therefore never executed.
    """
    result = 1 + 2
    if False:
        # Never executed, so the undefined name `b` is never looked up.
        result = b
    return result

In [108]:
compute_rmse_2()

3