# Task for Today  

***

## Perth House Price Prediction  

Given *data about houses in Perth*, let's try to predict the **price** of a given house.

We will use three different linear regression models to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the Perth house-price CSV into a DataFrame. The path is relative to the
# notebook's working directory, so it only resolves where that input folder exists.
data = pd.read_csv('../input/perth-house-prices/all_perth_310121.csv')

In [None]:
# Notebook display: render the loaded DataFrame as this cell's output.
data

In [None]:
# Print column names, dtypes and non-null counts; the missing-value handling in
# preprocess_inputs below is based on this summary.
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Turn the raw listings frame into standardized train/test splits.

    Steps, in order: drop unusable columns, derive year/month from
    DATE_SOLD, one-hot encode the nominal columns, split 70/30 (shuffled,
    random_state=1), impute BUILD_YEAR, and standardize the features.

    Both the BUILD_YEAR median and the scaler statistics are computed from
    the training rows only, so nothing learned from the held-out rows leaks
    into the features the models are trained on.

    Args:
        df: Raw DataFrame containing at least PRICE, ADDRESS,
            NEAREST_SCH_RANK, BUILD_YEAR, DATE_SOLD, SUBURB, NEAREST_STN,
            NEAREST_SCH, POSTCODE and GARAGE. It is not modified.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X frames are
        standardized DataFrames that keep their original row index and
        column names; the y values are the matching PRICE Series.

    Raises:
        KeyError: If any of the columns named above is missing from ``df``.
    """
    df = df.copy()

    # ADDRESS is dropped as high-cardinality; NEAREST_SCH_RANK is dropped
    # because more than 25% of its values are missing.
    df = df.drop(['ADDRESS', 'NEAREST_SCH_RANK'], axis=1)

    # Extract date features with the vectorized .dt accessor.
    sold = pd.to_datetime(df['DATE_SOLD'])
    df['DATE_YEAR'] = sold.dt.year
    df['DATE_MONTH'] = sold.dt.month
    df = df.drop('DATE_SOLD', axis=1)

    # One-hot encode nominal features. Each listed column is replaced by
    # "<COLUMN>_<value>" indicator columns appended after the remaining ones.
    df = pd.get_dummies(
        df,
        columns=['SUBURB', 'NEAREST_STN', 'NEAREST_SCH', 'POSTCODE', 'GARAGE'],
    )

    # Split df into X and y
    y = df['PRICE']
    X = df.drop('PRICE', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Fill missing BUILD_YEAR values *after* the split, using the training
    # median for both splits, so the imputed value is not influenced by
    # test rows (same policy as the scaler below).
    build_year_median = X_train['BUILD_YEAR'].median()
    X_train = X_train.fillna({'BUILD_YEAR': build_year_median})
    X_test = X_test.fillna({'BUILD_YEAR': build_year_median})

    # Scale X: fit on the training split, apply the same transform to both.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled feature splits and the matching PRICE targets from the raw data.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Notebook display: inspect the training features returned by preprocess_inputs.
X_train

In [None]:
# Notebook display: inspect the training targets (the PRICE column).
y_train

# Training

In [None]:
# Estimators to compare, keyed by the label printed next to their results.
# The first label is left-padded with spaces so all three labels have the
# same width and the printed lines align.
models = {
    "                Linear Regression": LinearRegression(),
    "Ridge (L2-Regularized) Regression": Ridge(),
    "Lasso (L1-Regularized) Regression": Lasso(),
}

# Fit every estimator in place on the training split, reporting progress.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained.")

# Results

In [None]:
# Report each fitted model's R^2 on the held-out test split, to 5 decimals.
for name, model in models.items():
    print(f"{name}: R^2 Score: {model.score(X_test, y_test):.5f}")

In [None]:
# Refit ridge regression with an explicit regularization strength (alpha=10.0)
# and report its R^2 on the test split for comparison with the models above.
ridge_model = Ridge(alpha=10.0).fit(X_train, y_train)
print(f"R^2 Score: {ridge_model.score(X_test, y_test):.5f}")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/R89DgPzHDIk