In [16]:
# import 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Load the training and test tables, indexed by the 'Id' column.
# Data source: https://www.kaggle.com/c/home-data-for-ml-course/data

X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# SalePrice is the target: a row without it cannot be used, so it is removed.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']

# Features only: every column except the target.
X = X_full.drop(columns=['SalePrice'])

# Working copy of the test features; X_test_full itself is left untouched.
# NOTE(review): the cleaning cells visible in this notebook modify X only,
# so X_test must receive the same column changes before it is used for
# prediction - confirm this happens somewhere.
X_test = X_test_full.copy()

# From here on, X holds the features and y holds the target.


Previously we got MAE_1: 17690 vs. MAE_0: 17739.
We want to do better now with our last model!
(First we clean the data so that the model is fed relevant information, then we choose a good model.)

# Parameters
These are some cleaning parameters that you can play with.

In [17]:
# Maximum cardinality allowed for a categorical column: columns with more
# distinct values than this are deleted.
delete_over = 10
# Null threshold, as a fraction of the rows. A column with more nulls than this
# is deleted and replaced by a 0/1 "<column>_is_null" column, on the idea that
# knowing whether the value was missing matters more than the value itself.
# Example: 0.1 means 10% of the rows; the value below (0.07) means 7%.

col_to_change_to_null = 0.07

# Cleaning the input

In [18]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [19]:
# (rows, columns) of the features before any column is removed.
X.shape

(1460, 79)

In [20]:
# Split the column names by dtype: 'object' columns are treated as categorical,
# every other dtype as numerical.
categorical_variables = [col for col, dtype in X.dtypes.items() if dtype == object]
numerical_variables = [col for col, dtype in X.dtypes.items() if dtype != object]

In [21]:
# Number of distinct values per categorical column. NaN counts as a value of
# its own, which matches what len(X[col].unique()) reports.
cardinalidad = {col: X[col].nunique(dropna=False) for col in categorical_variables}
print("cardinality of each of these categorical columns")
# Show the counts, highest first, so it is visible which columns exceed the
# cardinality limit (previously only the header line above was printed).
pd.Series(cardinalidad, name="cardinality", dtype="int64").sort_values(ascending=False)

cardinality of each of these categorical columns


In [22]:
# For now we delete the categorical columns that have more than `delete_over`
# distinct values (NaN counts as one value).
# Columns that are no longer in X are skipped, so re-running this cell does not
# raise a KeyError for columns that were already dropped.
columns_to_delete = [
    col for col in categorical_variables
    if col in X.columns and X[col].nunique(dropna=False) > delete_over
]
X = X.drop(columns=columns_to_delete)

In [23]:
# On the other hand, count the nulls per column and list only the columns
# that have at least one.
null_serie = X.isna().sum()
null_serie[null_serie > 0].to_frame(name='number_null')

Unnamed: 0,number_null
LotFrontage,259
Alley,1369
MasVnrType,8
MasVnrArea,8
BsmtQual,37
BsmtCond,37
BsmtExposure,38
BsmtFinType1,37
BsmtFinType2,38
Electrical,1


In [24]:
# Columns with too many nulls are not simply thrown away: each one is replaced
# by a 0/1 "<column>_is_null" column, so the model can still learn from the
# fact that the value was missing.
# A column qualifies when its null count is greater than
# int(number_of_rows * col_to_change_to_null).
null_counts = X.isnull().sum()
max_nulls_allowed = int(X.shape[0] * col_to_change_to_null)
columnas_modificar_por_1 = null_counts[null_counts > max_nulls_allowed].index.tolist()
new_columns_null = [f"{col}_is_null" for col in columnas_modificar_por_1]

# 1 where the original value is null, 0 otherwise.
for col, flag_col in zip(columnas_modificar_por_1, new_columns_null):
    X[flag_col] = X[col].isnull().astype("int64")

# The original columns are removed; only the indicators remain.
X = X.drop(columns=columnas_modificar_por_1)


In [25]:
# (rows, columns) of the features after the cleaning steps above.
X.shape

(1460, 76)

In [26]:
# Preview the 0/1 null-indicator columns listed in new_columns_null.
X[new_columns_null].head()

Unnamed: 0_level_0,LotFrontage_is_null,Alley_is_null,FireplaceQu_is_null,PoolQC_is_null,Fence_is_null,MiscFeature_is_null
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,1,1,1,1,1
2,0,1,0,1,1,1
3,0,1,0,1,1,1
4,0,1,0,1,1,1
5,0,1,0,1,1,1


In [27]:
# Nulls that remain per column; these are left for the imputers in the pipeline.
null_serie = X.isna().sum()
null_serie[null_serie > 0].to_frame(name='number_null')

Unnamed: 0,number_null
MasVnrType,8
MasVnrArea,8
BsmtQual,37
BsmtCond,37
BsmtExposure,38
BsmtFinType1,37
BsmtFinType2,38
Electrical,1
GarageType,81
GarageYrBlt,81


In [28]:
# Separate the data into training (80%) and validation (20%) sets.
# random_state is fixed so the same split is produced on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [29]:

# Column groups, derived from the training split only.
numerical_col = [col for col in X_train.columns if str(X_train[col].dtypes) != 'object']
numerical_col_imputed = [col for col in numerical_col if X_train[col].isnull().any()]
# Numerical columns with no nulls in the training split. They are still sent
# through an imputer (a no-op on the training data) so that a null showing up
# only in validation/test data is filled with the training mean instead of
# reaching the model as NaN.
numerical_col_complete = [col for col in numerical_col if col not in numerical_col_imputed]

categorical_col = [col for col in X_train.columns if str(X_train[col].dtypes) == 'object']
categorical_col_imputed = [col for col in categorical_col if X_train[col].isnull().any()]

# Numerical nulls are replaced by the column mean learned during fit.
numerical_transformer = SimpleImputer(strategy='mean')
numerical_guard_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill nulls with the most frequent value, then one-hot
# encode. Categories not seen during fit are encoded as all zeros.
# NOTE(review): missing_values=pd.NA is kept as in the original; confirm the
# installed scikit-learn version accepts it for object columns read by read_csv.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# The guard transformer is listed last, which is where the passthrough columns
# used to be placed, so the order of the features given to the model is kept.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_transformer", numerical_transformer, numerical_col_imputed),
        ("categorical_transformer", categorical_transformer, categorical_col),
        ("numerical_guard_transformer", numerical_guard_transformer, numerical_col_complete),
    ],
    remainder='passthrough',
)

# Define model (seeded so the result is reproducible)
model = RandomForestRegressor(n_estimators=50, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the training split only, then score on the held-out validation split.
pipe.fit(X_train, y_train)
preds = pipe.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 17356.802123287674


Notice that we lowered the MAE. It was only a small improvement, but can we do even better?
Up next, we are going to try different models, and this clean data will be our starting point.

In [31]:
#pipe.fit(X_train,y_train)