Housing Prices Competition for Kaggle Learn Users
Apply what you learned in the Machine Learning course on Kaggle Learn alongside others in the course.

https://www.kaggle.com/c/home-data-for-ml-course

### Import the libraries 

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt 
import math

# Log the pandas version so the cell output records which release executed this notebook.
print(f"{pd.__version__}")

### Read the data

In [None]:
# Folder holding the competition CSVs, relative to this notebook. The trailing slash is
# required because file names are appended by plain string concatenation.
datadir = r'../../../data/Housing-Prices-Competition-for-Kaggle-Learn-Users/'

# Read the three competition files in one pass.
sample_sub, train1, test1 = (
    pd.read_csv(datadir + csv_name)
    for csv_name in ('sample_submission.csv', 'train.csv', 'test.csv')
)

# Stack train on top of test under a fresh 0..n-1 index so both go through the same
# preprocessing. join='outer' keeps columns that exist in only one frame and fills the
# other frame's rows with NaN for them.
data = pd.concat([train1, test1], ignore_index=True, join='outer')

In [None]:
# (rows, columns) of the train, test and combined frames, one tuple per line.
print(train1.shape, test1.shape, data.shape, sep='\n')

### Check NaN: heatmap of missing values
Partially adapted from https://www.kaggle.com/drwilliamssteven/housing-data-random-forest

In [None]:
# One pixel row per record and one column per feature: cells where the combined frame
# is NaN are drawn in the contrasting colour of the 'viridis' map.
plt.figure(figsize=(20, 20))

sns.heatmap(
    data.isna(),
    cmap='viridis',
    cbar=False,
    annot=False,
    square=False,
    yticklabels=False,   # row labels would be unreadable at this density
)

# Trailing semicolon suppresses the Text(...) repr in the cell output.
plt.title('Features with missing values');

### Check NaN: number of columns with missing values

In [None]:
# Shapes of train/test restricted to the columns that contain at least one NaN; the
# second element of each tuple is therefore the number of affected columns.
print(
    train1.loc[:, train1.isna().any()].shape,
    test1.loc[:, test1.isna().any()].shape,
)

### Fill or drop columns with missing values

In [None]:
# Decide, for every column of the combined frame that contains NaN, whether to impute it
# with its most frequent value or to drop it, then turn integral float columns into ints.

# A column is imputed only when fewer than this fraction of its rows are NaN ...
MAX_NA_RATIO = 0.1
# ... and its most frequent value occurs more than this many times as often as the runner-up.
MIN_MAJORITY_RATIO = 1.4
# The target is NaN for the rows that have no label; it must be neither imputed nor dropped.
TARGET_COL = 'SalePrice'

# The column list is materialised before the loop, so dropping columns inside it is safe.
for col in data.columns[data.isna().any()]:
    if col == TARGET_COL:
        continue

    # Fraction of NaN rows, computed over train + test together.
    na_ratio = data[col].isna().sum() / data[col].size

    # value_counts() ignores NaN and is sorted by descending frequency, so position 0 is
    # the majority value and position 1 the runner-up.
    val_cnt = data[col].value_counts()
    if val_cnt.size >= 2:
        maj_second_maj_ratio = val_cnt.iloc[0] / val_cnt.iloc[1]
    else:
        # Zero or one distinct value: there is no runner-up to compare against.
        maj_second_maj_ratio = float('inf')

    if na_ratio < MAX_NA_RATIO and maj_second_maj_ratio > MIN_MAJORITY_RATIO:
        # Few gaps and a clearly dominant value: impute with that value. The lookup
        # happens only here, so an all-NaN column (empty value_counts) never reaches it
        # and is dropped below instead of raising.
        maj_val = val_cnt.idxmax()
        data[col] = data[col].fillna(maj_val)
        print("%s filled at %s, maj_ratio = %.3f" % (col, maj_val, maj_second_maj_ratio))
    elif na_ratio > MAX_NA_RATIO or maj_second_maj_ratio < MIN_MAJORITY_RATIO:
        # Too many gaps, or no dominant value to impute with: discard the column.
        data = data.drop(col, axis=1)
        print("%s dropped at NaN ratio %.3f" % (col, na_ratio))
    else:
        # Only reachable when a ratio sits exactly on a threshold; the column keeps its
        # NaN and is reported so it can be dealt with by hand.
        print("To be handled: %s\t%.3f\t%.3f" % (col, na_ratio, maj_second_maj_ratio))

# Only the target may still contain NaN at this point. Check by name rather than by
# count, so that a single unhandled feature column is not mistaken for the target.
nan_cols = data.columns[data.isna().any()]
unhandled_cols = nan_cols.difference([TARGET_COL])
if unhandled_cols.empty:
    print(nan_cols)
    print('No more NaN!')
else:
    print('Columns still containing NaN:', list(unhandled_cols))

# Convert float columns to int64 only when that is lossless: every value must be present,
# finite and integral. A column that still holds NaN/inf would make astype raise, and a
# fractional column would be silently truncated, so such columns stay float64.
for col in data.columns:
    if col == TARGET_COL or data[col].dtype != 'float64':
        continue
    values = data[col]
    if values.notna().all() and np.isfinite(values).all() and (values % 1 == 0).all():
        data[col] = values.astype('int64')

### Plot the KDE of SalePrice, and a feature vs SalePrice

In [None]:
# Kernel density estimate of the training target, drawn as a filled curve.
plt.figure(figsize=(10, 6.18))
sns.kdeplot(train1.SalePrice, shade=True)

In [None]:
# Feature to inspect against the target.
feat = 'MSZoning'

# Number of distinct non-NaN values the feature takes in the training data.
print(train1[feat].nunique(dropna=True))

# Pair plot of the feature and SalePrice, coloured by the feature's value.
sns.pairplot(train1.loc[:, [feat, 'SalePrice']], hue=feat, height=7)

### GrLivArea:
'Above grade (ground) living area square feet' is highly correlated with SalePrice

In [None]:
# Second feature to inspect: pair plot of it against SalePrice.
feat1 = 'GrLivArea'
sns.pairplot(train1.loc[:, [feat1, 'SalePrice']], height=5)

### Correlations
Partially adapted from https://www.kaggle.com/drwilliamssteven/housing-data-random-forest

In [None]:
# Pairwise correlation matrix of the training features, annotated with the coefficients.
#
# Select the numeric/bool columns explicitly before calling corr(): on pandas >= 2.0,
# DataFrame.corr() raises when the frame has non-numeric columns, whereas older releases
# dropped them silently. The explicit selection yields the same matrix on every version.
plt.figure(figsize=(20, 20))
sns.heatmap(
    train1.select_dtypes(include=['number', 'bool']).corr(),
    vmax=0.8,          # anchor the top of the colour scale at 0.8
    linewidths=0.01,
    square=True,
    annot=True,
    cmap='YlGnBu',
    linecolor="white"
)

In [None]:
# Correlation of each numeric column with SalePrice, strongest positive first.
# Numeric/bool columns are selected explicitly because DataFrame.corr() raises on
# pandas >= 2.0 when non-numeric columns are present (older releases ignored them).
train1.select_dtypes(include=['number', 'bool']).corr()['SalePrice'].sort_values(ascending=False)

### Data preprocessing 

In [None]:
# Names of the object-dtype (categorical text) columns of the combined frame.
data_objs_column = [name for name in data.columns if data[name].dtype == 'object']

# One-hot encode those columns and append the indicator columns after the remaining
# (non-object) ones. Encoding train and test together gives both the same dummy columns.
data_dummies = pd.get_dummies(data[data_objs_column])
data1 = pd.concat([data.drop(columns=data_objs_column), data_dummies], axis=1)

# Rows with a SalePrice are the labelled training rows; rows without one are the rows
# to predict for the submission.
is_labelled = data1['SalePrice'].notna()
train_set = data1[is_labelled]

# Despite its name, val_set is the unlabelled prediction set, not a validation split.
# Any NaN left in its features is replaced by 0.
val_set = data1[~is_labelled].drop(columns='SalePrice').fillna(0)

# Feature matrix and target vector for fitting.
y = train_set['SalePrice']
x = train_set.drop(columns='SalePrice')

### Do the fitting

In [None]:
# Fixed seed shared by the hold-out split and the forest. Both are randomised, so without
# it the reported MAE and the generated submission change on every run.
RANDOM_STATE = 42

# True  -> hold out 5% of the labelled rows to estimate the error; the model is then fit
#          on the remaining 95% only.
# False -> fit on every labelled row; no validation metrics are produced.
train_part = True

if train_part:
    train_x, val_x, train_y, val_y = train_test_split(
        x, y, test_size=0.05, random_state=RANDOM_STATE
    )
else:
    train_x, train_y = x, y

# NOTE: criterion='mae' is the spelling accepted by scikit-learn < 1.2. Release 1.0
# renamed it to 'absolute_error' and 1.2 removed the old name; switch the string when
# running on a newer scikit-learn.
train_model = RandomForestRegressor(
    n_estimators=200,
    criterion='mae',
    n_jobs=-1,              # use all available cores
    max_features=0.5,       # each split considers half of the features
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
)
train_model.fit(train_x, train_y)

if train_part:
    pred_y = train_model.predict(val_x)

    # The mean price gives a scale against which to read the absolute error.
    print('average sale price:', train1.SalePrice.mean())
    print('our MAE:', mean_absolute_error(val_y, pred_y))

    # Distribution of the hold-out residuals (actual - predicted).
    sns.distplot(val_y - pred_y)

### Generate output
Score: 16414, rank: 621/3617, as of 03/04/2019

In [None]:
# Predict a price for every unlabelled row and place the predictions in the submission
# template. This relies on val_set and sample_sub listing the rows in the same order.
pred = train_model.predict(val_set)
sample_sub = sample_sub.assign(SalePrice=pred)

# Summary statistics of the submission as a quick sanity check.
sample_sub.describe()

In [None]:
# Write the submission next to the notebook, without the DataFrame index column.
sample_sub.to_csv(path_or_buf='submission.csv', index=False)