# Project 12: Predicting House Sale Prices
## Machine Learning

In this project, we will use a machine learning model, specifically linear regression, to predict house sale prices. The workflow consists of three steps: feature engineering, feature selection, and modelling.

The dataset used in this project comes from:
* [Housing Data](https://www.tandfonline.com/doi/abs/10.1080/10691898.2011.11889627)

We will first import the dataset and remove the recommended outliers (rows whose 'Gr Liv Area' is 4000 or more):

In [1]:
import pandas as pd
import numpy as np
# Load the tab-separated Ames housing file into a DataFrame.
df = pd.read_csv(
    'AmesHousing.tsv',
    delimiter='\t',
)

In [2]:
# Keep only the rows whose 'Gr Liv Area' is below 4000; the larger ones are
# the outliers recommended for removal.
keep_rows = df['Gr Liv Area'] < 4000
df = df.loc[keep_rows]
df.head(5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


Feature Engineering:

In [3]:
def feature_engineering(df):
    """Clean the raw housing frame and derive age-based features.

    Steps, in order:

    1. Drop every text (object) column that has at least one missing value.
    2. Drop every numeric column with more than 5% missing values.
    3. Fill the remaining numeric gaps with each column's most common value.
    4. Add ``years_until_sold`` (``Yr Sold - Year Built``) and
       ``years_since_remod`` (``Yr Sold - Year Remod/Add``), then discard
       rows where either value is negative.
    5. Drop the raw year columns, the identifier columns and the columns
       that leak information about the sale.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Yr Sold', 'Year Built', 'Year Remod/Add',
        'Order', 'PID', 'Mo Sold', 'Sale Type' and 'Sale Condition'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Remove text columns with any missing data
    text_mv_counts = df.select_dtypes(include=['object']).isnull().sum()
    text_mv_cols = text_mv_counts[text_mv_counts > 0].index
    df = df.drop(text_mv_cols, axis=1)

    # Remove numerical columns with >5% missing data
    cutoff = df.shape[0] / 20
    num_mv_counts = df.select_dtypes(include=['integer', 'float']).isnull().sum()
    num_drop_cols = num_mv_counts[num_mv_counts > cutoff].index
    df = df.drop(num_drop_cols, axis=1)

    # Fill the remaining numeric gaps with each column's most common value.
    num_mv_counts = df.select_dtypes(include=['integer', 'float']).isnull().sum()
    num_mv_cols = num_mv_counts[num_mv_counts > 0].index
    # Guard: with nothing to fill, mode() yields no rows and indexing the
    # first record would raise IndexError.
    if len(num_mv_cols) > 0:
        fill_values = df[num_mv_cols].mode().to_dict(orient='records')[0]
        df = df.fillna(fill_values)

    # Add year-based features
    df['years_until_sold'] = df['Yr Sold'] - df['Year Built']
    df['years_since_remod'] = df['Yr Sold'] - df['Year Remod/Add']

    # A house cannot be sold before it was built or remodelled, so rows with
    # a negative age are invalid. Filter by value rather than by a
    # hard-coded index label, and keep the filtered result.
    negative_age = (df['years_until_sold'] < 0) | (df['years_since_remod'] < 0)
    df = df[~negative_age]

    # Drop year columns
    df = df.drop(['Year Remod/Add', 'Year Built'], axis=1)
    # Remove non-useful columns or columns that leak sale data
    nu_cols = ['Order', 'PID']
    dl_cols = ['Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition']
    df = df.drop(nu_cols + dl_cols, axis=1)
    return df

In [4]:
# Run the cleaning / feature-engineering step and preview the first rows.
df = feature_engineering(df)
df.head(5)

Unnamed: 0,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,SalePrice,years_until_sold,years_since_remod
0,20,RL,31770,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,...,210,62,0,0,0,0,0,215000,50,50
1,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,140,0,0,0,120,0,0,105000,49,49
2,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,...,393,36,0,0,0,0,12500,172000,52,52
3,20,RL,11160,Pave,Reg,Lvl,AllPub,Corner,Gtl,NAmes,...,0,0,0,0,0,0,0,244000,42,42
4,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,212,34,0,0,0,0,0,189900,13,12


Feature Selection:

In [5]:
def feature_selection(df,correlation_threshold=0.4,uniqueness_threshold=10):
    # Drop features that are highly correlated with other features
    df = df.drop(['Garage Cars', 'Total Bsmt SF', 'TotRms AbvGrd'],axis=1)
    
    num_df = df.select_dtypes(include=['float','integer'])
    correlations = num_df.corr()['SalePrice'].abs().sort_values()
    # Drop numerical columns with "SalePrice" correlation < our threshold.
    df = df.drop(correlations[correlations < correlation_threshold].index,axis=1)
    
    # Select categorical columns.
    nominal_cols = ["PID","MS SubClass","MS Zoning","Street","Alley","Land Contour","Lot Config","Neighborhood","Condition 1","Condition 2","Bldg Type","House Style","Roof Style","Roof Matl","Exterior 1st","Exterior 2nd","Mas Vnr Type","Foundation","Heating","Central Air","Garage Type","Misc Feature","Sale Type"]
    ordinal_cols = ["Lot Shape","Utilities","Land Slope","Exter Qual","Exter Cond","Bsmt Qual","Bsmt Cond","Bsmt Exposure","BsmtFin Type 1","BsmtFin Type 2","Heating QC","Electrical","Kitchen Qual","Functional","Fireplace Qu","Garage Finish","Garage Qual","Garage Cond","Paved Drive","Pool QC","Fence","Sale Condition"]
    cat_cols = nominal_cols + ordinal_cols
    
    # Transform categorical columns.
    transform_cat_cols = []
    for column in cat_cols:
        if column in df.columns:
            transform_cat_cols.append(column)
    cat_stats = df[transform_cat_cols].describe()
    unique_stats = cat_stats.loc['unique']

    # Drop categorical columns with more unique values than our threshold
    df = df.drop(unique_stats[unique_stats > uniqueness_threshold].index,axis=1)

    # Get rid of any categorical columns where the most frequent value is more than 95% of the total
    top_percent_of_total = cat_stats.loc['freq']/cat_stats.loc['count']
    df = df.drop(top_percent_of_total[top_percent_of_total > .95].index,axis=1)
    
    # Convert to category columns and create dummy columns
    text_cols = df.select_dtypes(include=['object'])
    for col in text_cols:
        df[col] = df[col].astype('category')
    df = pd.concat([df,pd.get_dummies(df.select_dtypes(include=['category']))],axis=1)
    
    
    return df

In [6]:
# Run feature selection with its default thresholds and preview the result.
df = feature_selection(df, correlation_threshold=0.4, uniqueness_threshold=10)
df.head(5)

Unnamed: 0,MS Zoning,Lot Shape,Land Contour,Lot Config,Condition 1,Bldg Type,House Style,Overall Qual,Roof Style,Mas Vnr Area,...,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sal,Functional_Sev,Functional_Typ,Paved Drive_N,Paved Drive_P,Paved Drive_Y
0,RL,IR1,Lvl,Corner,Norm,1Fam,1Story,6,Hip,112.0,...,0,0,0,0,0,0,1,0,1,0
1,RH,Reg,Lvl,Inside,Feedr,1Fam,1Story,5,Gable,0.0,...,0,0,0,0,0,0,1,0,0,1
2,RL,IR1,Lvl,Corner,Norm,1Fam,1Story,6,Hip,108.0,...,0,0,0,0,0,0,1,0,0,1
3,RL,Reg,Lvl,Corner,Norm,1Fam,1Story,7,Hip,0.0,...,0,0,0,0,0,0,1,0,0,1
4,RL,IR1,Lvl,Inside,Norm,1Fam,2Story,5,Gable,0.0,...,0,0,0,0,0,0,1,0,0,1


Finally, we fit a linear regression model, evaluate it with K-fold cross-validation, and report the mean root-mean-square error (RMSE) across the folds:

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def train_and_test(df,folds=10,random_state=None):
    """Cross-validate a linear regression on the numeric columns of ``df``.

    Every integer, float and boolean column except 'SalePrice' is used as a
    predictor; 'SalePrice' is the target. The data is split with a shuffled
    K-fold and the per-fold RMSE values are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric 'SalePrice' column plus predictors.
    folds : int, default 10
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed for the fold shuffling. Pass an integer to make the returned
        score reproducible; ``None`` shuffles differently on each call.

    Returns
    -------
    float
        Mean of the per-fold root-mean-square errors.
    """
    # 'bool' is included so one-hot columns are used as predictors even when
    # pandas produces them as booleans rather than integers.
    numeric_df = df.select_dtypes(include=['integer','float','bool'])
    train_cols = numeric_df.columns.drop('SalePrice')
    lr = LinearRegression()
    kf = KFold(n_splits=folds,shuffle=True,random_state=random_state)
    # The scorer returns negated MSE values (higher is better), hence the
    # absolute value before taking the square root.
    neg_mses = cross_val_score(lr, df[train_cols], df['SalePrice'], scoring="neg_mean_squared_error", cv=kf)
    return np.mean(np.sqrt(np.absolute(neg_mses)))

In [8]:
# Evaluate the model with 10-fold cross-validation; the cell shows the mean RMSE.
train_and_test(df, folds=10)

25797.788135602976

End. Thank you!