Housing Prices Competition for Kaggle Learn Users
Apply what you learned in the Machine Learning course on Kaggle Learn alongside others in the course.

https://www.kaggle.com/c/home-data-for-ml-course

### Import the libraries 

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.impute import SimpleImputer 
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt 
import math

# Show which pandas release this notebook is running against.
pandas_version = pd.__version__
print(pandas_version)

### Read the data

In [None]:
# Directory holding the competition CSV files (relative to this notebook).
datadir = '../../../data/Housing-Prices-Competition-for-Kaggle-Learn-Users/'

# Load the three competition files in one pass, in the order they are unpacked.
sample_sub, train1, test1 = (
    pd.read_csv(datadir + csv_name)
    for csv_name in ('sample_submission.csv', 'train.csv', 'test.csv')
)

# Stack train and test rows into one frame with a fresh 0..n-1 index; columns
# present in only one of the two frames are kept (outer join) and filled with NaN.
data1 = pd.concat(
    [train1, test1],
    join='outer',
    ignore_index=True,
)

In [None]:
# Report (rows, columns) for the train, test and combined frames, in that order.
for frame in (train1, test1, data1):
    print(frame.shape)

### Check for NaN values
Partially adapted from https://www.kaggle.com/drwilliamssteven/housing-data-random-forest 

In [None]:
# Boolean frame: True wherever the combined train+test data has a missing value.
missing_mask = data1.isnull()

# Plain heatmap of the mask: no annotations, no row labels, no colour bar.
heatmap_options = dict(
    square=False,
    annot=False,
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

plt.figure(figsize=(20, 20))
sns.heatmap(missing_mask, **heatmap_options)

# The trailing semicolon keeps the notebook from echoing the Text object.
plt.title('Features with missing values');

### Data preprocessing 

In [None]:
# Work on a copy so the raw combined frame `data1` stays untouched.
data = data1.copy()
data.drop('Id', axis=1, inplace=True)

# Columns whose NaN ratio exceeds this threshold are dropped instead of imputed.
max_na_ratio = 0.2

# Impute column by column. Iterate over a snapshot of the column labels because
# columns may be dropped from `data` inside the loop.
for col in list(data.columns):
    # NaN ratio in the given column
    na_ratio = data[col].isna().sum() / data[col].size

    # Nothing to impute. 'SalePrice' is skipped on purpose: its NaN rows are
    # what identifies the test set further down.
    if na_ratio == 0 or col == 'SalePrice':
        continue

    # Too many missing values: drop the column entirely.
    if na_ratio > max_na_ratio:
        data.drop(col, axis=1, inplace=True)
        continue

    column_2d = data[col].values.reshape(-1, 1)  # SimpleImputer expects 2-D input

    # use mean to impute numeric data
    if data[col].dtype in ['int64', 'float64']:
        int_imputer = SimpleImputer(strategy='mean')
        # Flatten the (n, 1) result back to 1-D before assigning it to the column.
        data[col] = int_imputer.fit_transform(column_2d).ravel()
        # Cast to int64; this truncates the fractional part of the imputed mean.
        data[col] = data[col].astype('int64')
    # use most_frequent to impute str data
    elif data[col].dtype == 'object':
        str_imputer = SimpleImputer(strategy='most_frequent')
        data[col] = str_imputer.fit_transform(column_2d).ravel()

# get dummies for categorical data
obj_cols = [col for col in data.columns if data[col].dtype == 'object']
obj_dummies = pd.get_dummies(data[obj_cols])
data.drop(obj_cols, axis=1, inplace=True)
data = pd.concat([data, obj_dummies], axis=1)

# Rows without a SalePrice came from test.csv; rows with one came from train.csv.
test_set = data[data.SalePrice.isna()]
train_set = data[data.SalePrice.notna()]

x = train_set.drop('SalePrice', axis=1)
y = train_set['SalePrice']
# Rebind instead of dropping in place: an in-place drop on a filtered selection
# of `data` triggers pandas' SettingWithCopyWarning.
test_set = test_set.drop('SalePrice', axis=1)

### Do the fitting

In [None]:
import sklearn

# When True, hold out 5% of the training rows for a quick validation check;
# when False, fit on every training row.
train_part = True

if train_part:
    train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.05)
else:
    train_x, train_y = x, y

# scikit-learn 1.0 renamed the 'mae' criterion to 'absolute_error' and 1.2
# removed the old spelling, so choose the name the installed release accepts.
sklearn_major = int(sklearn.__version__.split('.')[0])
mae_criterion = 'absolute_error' if sklearn_major >= 1 else 'mae'

train_model = RandomForestRegressor(
    n_estimators=600,
    criterion=mae_criterion,
    n_jobs=-1,             # use all available cores
    max_features=0.5,      # each split considers half of the features
    min_samples_leaf=2,
)
train_model.fit(train_x, train_y)

if train_part:
    pred_y = train_model.predict(val_x)

    # Print the mean training price next to the MAE to give the error some scale.
    print('average sale price:', train1.SalePrice.mean())
    print('MAE:', mean_absolute_error(val_y, pred_y))

    # Distribution of the validation residuals.
    # NOTE: distplot is deprecated in seaborn >= 0.11; kept so the cell still
    # runs on older seaborn releases.
    sns.distplot(val_y - pred_y)

### Generate output
Score: 15739, rank: ~445/3675, as of 03/06/2019

In [None]:
# Predict a sale price for every test row and store it in the submission frame.
# This relies on `test_set` rows being in the same order as `sample_sub` rows.
pred = train_model.predict(test_set)
sample_sub['SalePrice'] = pred

# Summary statistics of the predictions, then a preview of the first rows.
predicted_prices = sample_sub['SalePrice']
print(predicted_prices.describe())
sample_sub.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_sub.to_csv(path_or_buf='submission.csv', index=False)