# Task for Today  

***

## NYC Property Price Prediction  

Given *data about property sales in New York City*, let's try to predict the **sale price** of a given property.  
  
We will use XGBoost to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import xgboost as xgb

from sklearn.metrics import r2_score

In [None]:
# Read the sales CSV into a DataFrame; the relative path assumes the dataset
# sits in an ../input/nyc-property-sales/ directory next to the working directory.
data = pd.read_csv('../input/nyc-property-sales/nyc-rolling-sales.csv')

In [None]:
# Display the loaded frame as the cell's output
data

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame
data.info()

# Preprocessing

In [None]:
# List the distinct raw SALE PRICE values to see how they are encoded before cleaning
data['SALE PRICE'].unique()

In [None]:
def onehot_encode(df, columns, prefixes):
    """Replace categorical columns with one-hot indicator columns.

    Args:
        df: Input DataFrame; it is copied, never modified in place.
        columns: Names of the columns to encode.
        prefixes: Prefix for the indicator columns of each entry in
            ``columns``, paired positionally.

    Returns:
        A new DataFrame in which every listed column has been removed and
        its ``<prefix>_<value>`` indicator columns appended on the right,
        in the order the columns were listed.
    """
    encoded = df.copy()

    for source_name, tag in zip(columns, prefixes):
        indicators = pd.get_dummies(encoded[source_name], prefix=tag)
        # Append the indicators first, then discard the column they came from
        encoded = pd.concat([encoded, indicators], axis=1).drop(source_name, axis=1)

    return encoded

In [None]:
def preprocess_inputs(df):
    """Clean the raw sales frame and return scaled features plus the price target.

    Steps, in order: parse ``SALE PRICE`` and drop rows without one, drop
    identifier-like columns, mean-impute the two square-footage columns,
    expand ``SALE DATE`` into ``YEAR``/``MONTH``/``DAY``, one-hot encode the
    categorical columns, and standardise every feature column.

    Args:
        df: Raw DataFrame as loaded from the sales CSV. It is copied, not
            modified in place.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a DataFrame of standardised
        features and ``y`` is the ``SALE PRICE`` Series as floats. Both
        share the same 0..n-1 index.
    """
    df = df.copy()

    # The raw data uses the placeholder ' -  ' for a missing value. Convert it
    # to NaN and cast with the builtin float: the np.float / np.NaN aliases
    # were removed from NumPy (1.24 and 2.0 respectively).
    df['SALE PRICE'] = df['SALE PRICE'].replace(' -  ', np.nan).astype(float)

    # Remove only the records that lack a sale price; a bare dropna() would
    # also discard rows that merely have a NaN in some unrelated column.
    df = df.dropna(axis=0, subset=['SALE PRICE']).reset_index(drop=True)

    # Remove unnecessary/difficult feature columns
    df = df.drop(
        ['Unnamed: 0', 'BLOCK', 'LOT', 'EASE-MENT', 'ADDRESS', 'APARTMENT NUMBER'],
        axis=1,
    )

    # Turn the remaining ' -  ' placeholders into NaN
    df = df.replace(' -  ', np.nan)

    # Fill missing square-footage values with the column mean
    for column in ['LAND SQUARE FEET', 'GROSS SQUARE FEET']:
        df[column] = df[column].astype(float)
        df[column] = df[column].fillna(df[column].mean())

    # Get year, month, and day features from the SALE DATE column using the
    # vectorised .dt accessor instead of a per-row Python lambda
    sale_date = pd.to_datetime(df['SALE DATE'])
    df['YEAR'] = sale_date.dt.year
    df['MONTH'] = sale_date.dt.month
    df['DAY'] = sale_date.dt.day

    df = df.drop('SALE DATE', axis=1)

    # BOROUGH and ZIP CODE are numeric codes, not quantities; cast them to
    # strings so they are treated as categories
    for column in ['BOROUGH', 'ZIP CODE']:
        df[column] = df[column].astype(str)

    # One-hot encode the categorical features
    df = onehot_encode(
        df,
        columns=[
            'BOROUGH', 'ZIP CODE', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY',
            'TAX CLASS AT PRESENT', 'BUILDING CLASS AT PRESENT', 'BUILDING CLASS AT TIME OF SALE'
        ],
        prefixes=['BO', 'ZC', 'NE', 'BC', 'TX', 'BP', 'BS']
    )

    # Split df into X and y
    y = df['SALE PRICE'].copy()
    X = df.drop('SALE PRICE', axis=1).copy()

    # Scale X with a standard scaler.
    # NOTE(review): the scaler is fit on every row passed in, so a train/test
    # split made afterwards sees statistics computed from held-out rows.
    scaler = StandardScaler()

    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return X, y

In [None]:
# Build the feature matrix X and the SALE PRICE target y from the raw frame
X, y = preprocess_inputs(data)

In [None]:
# Display the preprocessed feature matrix
X

In [None]:
# Display the target values
y

# Training

In [None]:
# First keep 70% of the rows for training and hold the rest out as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=123,
)
# Then set aside 20% of the training rows as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=123,
)

# Wrap each split in an xgboost DMatrix (train, validation, test — in that order)
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=target)
    for features, target in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [None]:
# Booster hyperparameters passed straight through to xgboost
params = {
    'learning_rate': 0.001,
    'max_depth': 6,
    'lambda': 0.01,
}

# Train for at most 10000 rounds, evaluating on the validation matrix each
# round with early stopping set to 10 rounds
model = xgb.train(
    params,
    dtrain,
    num_boost_round=10000,
    evals=[(dval, 'eval')],
    early_stopping_rounds=10,
)

In [None]:
# Predict on the held-out test matrix, then collect the matching true prices
y_pred = model.predict(dtest)
y_true = np.array(y_test)

In [None]:
# Report the R^2 score on the test set, formatted to four decimal places
print("Model R^2 Score: {:.4f}".format(r2_score(y_true, y_pred)))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/_T-tk_2b9pY