## Titanic: Predicting `NaN` values using Random Forest instead of imputing

- This method of imputing is time taking and requires more effort than using simple imputers or dropping them, but it also works out of the box.

In [1]:
import os
os.chdir("E:\Machine learning folder\Datasets")

os.listdir("E:\Machine learning folder\Datasets")

['CaliforniaHousing.csv',
 'Chapter 3 - wine.csv',
 'MelbourneTest.csv',
 'MelbourneTrain.csv']

### How ?

- Select the columns that has missing vlaues
- Seperate the rows without missing values as a training data and with missing values as test data
- Train your model on the training data then use it to infer missing values in test data
- In the last merge them all and finally use it to predict the final target

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split


X_full = pd.read_csv("MelbourneTrain.csv", index_col='Id')
X_test_full = pd.read_csv("MelbourneTest.csv", index_col='Id')



X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])


### Using Random Forest and GBM for missing values prediction in columns

In [None]:

cols_with_missing_value = [col for col in X.columns if X[col].isnull().any()]
print(cols_with_missing_value)
cols_with_all = [col for col in X.columns if not X[col].isnull().any()]
print(cols_with_all)

In [4]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


def nan_predictor_train(col):
    cols_with_all = [col for col in X.columns if not X[col].isnull().any()]
    cols_used = cols_with_all[:]
    cols_used.append(col)
    data = X[cols_used]
    print(f'before deleting nan values: {data.shape}')
    data_train = data.dropna(axis=0)
    print(f'after deleting nan values: {data_train.shape}')
    target_train = data_train[col]
    features_train = data_train.drop([col],axis=1)
    
    rf_model = RandomForestRegressor(n_estimators=500, random_state=0)
    rf_model.fit(features_train, target_train)
    gmb_model = GradientBoostingRegressor(n_estimators=500, random_state=0)
    gmb_model.fit(features_train, target_train)
    
    features_test = data[cols_with_all]
    pred1 = rf_model.predict(features_test)
    pred2 = gmb_model.predict(features_test)
    pred = (pred1 + pred2) / 2
    X[col] = pred


In [5]:
for col in cols_with_missing_value:
    nan_predictor_train(col)

before deleting nan values: (1460, 35)
after deleting nan values: (1201, 35)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1460, 36)
after deleting nan values: (1452, 36)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1460, 37)
after deleting nan values: (1379, 37)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
cols_with_nan = [col for col in X_test.columns if X_test[col].isnull().any()]
print(cols_with_nan)
cols_with_all = [col for col in X_test.columns if not X_test[col].isnull().any()]
print(cols_with_all)

['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [7]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


def nan_predictor_test(col):
    cols_with_all = [col for col in X_test.columns if not X_test[col].isnull().any()]
    cols_used = cols_with_all[:]
    cols_used.append(col)
    data = X_test[cols_used]
    print(f'before deleting nan values: {data.shape}')
    data_train = data.dropna(axis=0)
    print(f'after deleting nan values: {data_train.shape}')
    target_train = data_train[col]
    features_train = data_train.drop([col],axis=1)
    
    rf_model = RandomForestRegressor(n_estimators=500, random_state=0)
    rf_model.fit(features_train, target_train)
    gmb_model = GradientBoostingRegressor(n_estimators=500, random_state=0)
    gmb_model.fit(features_train, target_train)
    
    features_test = data[cols_with_all]
    pred1 = rf_model.predict(features_test)
    pred2 = gmb_model.predict(features_test)
    pred = (pred1 + pred2) / 2
    X_test[col] = pred


In [8]:
for col in ['LotFrontage','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','GarageYrBlt','GarageCars','GarageArea']:
    nan_predictor_test(col)

before deleting nan values: (1459, 26)
after deleting nan values: (1232, 26)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 27)
after deleting nan values: (1444, 27)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 28)
after deleting nan values: (1458, 28)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 29)
after deleting nan values: (1458, 29)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 30)
after deleting nan values: (1458, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 31)
after deleting nan values: (1458, 31)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 32)
after deleting nan values: (1457, 32)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 33)
after deleting nan values: (1457, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 34)
after deleting nan values: (1381, 34)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 35)
after deleting nan values: (1458, 35)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


before deleting nan values: (1459, 36)
after deleting nan values: (1458, 36)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
object_cols = [col for col in X_full.columns if X_full[col].dtype == 'object']

### Remove the object columns that have missing values

In [10]:
object_col_with_nan = [col for col in object_cols if X_full[col].isnull().any()]
object_col_with_nan

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [11]:
useful_object_col = list(set(object_cols) - set(object_col_with_nan))
len(useful_object_col)

27

In [12]:
good_cols = [col for col in useful_object_col if set(X_full[col]).issubset(set(X_test_full[col]))]
good_cols

['HeatingQC',
 'MSZoning',
 'LotConfig',
 'LotShape',
 'Functional',
 'KitchenQual',
 'LandContour',
 'Foundation',
 'Street',
 'ExterCond',
 'PavedDrive',
 'BldgType',
 'SaleCondition',
 'LandSlope',
 'CentralAir',
 'SaleType',
 'ExterQual',
 'RoofStyle',
 'Neighborhood',
 'Condition1']

In [13]:
data_object_train = X_full[good_cols]
data_object_train.head()

Unnamed: 0_level_0,HeatingQC,MSZoning,LotConfig,LotShape,Functional,KitchenQual,LandContour,Foundation,Street,ExterCond,PavedDrive,BldgType,SaleCondition,LandSlope,CentralAir,SaleType,ExterQual,RoofStyle,Neighborhood,Condition1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Ex,RL,Inside,Reg,Typ,Gd,Lvl,PConc,Pave,TA,Y,1Fam,Normal,Gtl,Y,WD,Gd,Gable,CollgCr,Norm
2,Ex,RL,FR2,Reg,Typ,TA,Lvl,CBlock,Pave,TA,Y,1Fam,Normal,Gtl,Y,WD,TA,Gable,Veenker,Feedr
3,Ex,RL,Inside,IR1,Typ,Gd,Lvl,PConc,Pave,TA,Y,1Fam,Normal,Gtl,Y,WD,Gd,Gable,CollgCr,Norm
4,Gd,RL,Corner,IR1,Typ,Gd,Lvl,BrkTil,Pave,TA,Y,1Fam,Abnorml,Gtl,Y,WD,TA,Gable,Crawfor,Norm
5,Ex,RL,FR2,IR1,Typ,Gd,Lvl,PConc,Pave,TA,Y,1Fam,Normal,Gtl,Y,WD,Gd,Gable,NoRidge,Norm


In [14]:
data_object_train = X_full[good_cols]
data_object_train.head()
data_object_test = X_test_full[good_cols]

In [15]:
train_target = X.SalePrice
train_features = X.drop(['SalePrice'], axis=1)

In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(train_features, train_target, train_size=0.8, test_size=0.2,random_state=0)

### using Random Forest and Gradient Boost model via stacking for final predictions

In [17]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

In [18]:
def final_pred(n):
    rf_model = RandomForestRegressor(random_state=1, n_estimators=n)
    rf_model.fit(X_train,y_train)
    rf_pred = rf_model.predict(X_valid)

    gbm_model = GradientBoostingRegressor(random_state=1, n_estimators=n)
    gbm_model.fit(X_train,y_train)
    gbm_pred = gbm_model.predict(X_valid)

    pred = (rf_pred + gbm_pred)/2
    return pred

In [19]:
for n in [500]:
    pred = final_pred(n)
    print(np.sqrt(mean_squared_error(y_valid,pred)))

29710.562824388515


In [20]:
rf_model = RandomForestRegressor(random_state=1, n_estimators=700)
rf_model.fit(X_train,y_train)
rf_pred = rf_model.predict(X_test)

gbm_model = GradientBoostingRegressor(random_state=1, n_estimators=700)
gbm_model.fit(X_train,y_train)
gbm_pred = gbm_model.predict(X_test)

pred = (rf_pred + gbm_pred)/2

In [21]:
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': pred})
output.to_csv('submission.csv', index=False)

---




*Have questions or comments? Visit the [Learn Discussion forum](https://www.kaggle.com/learn-forum/161289) to chat with other Learners.*