# Outline

## 1. Packages
- Packages pulled from various weeks for classifiers
### 1.1 Decision Trees
### 1.2 Linear Regression
### 1.3 Neural Nets
### 1.4 SVM

## 2. Import Data
- Currently only 2016 data
### 2.1 Assign parcelid to dataframe indicies and subset data
Contains three lines of code that can only be run once

## 3. Check the data
- Describe the data and check that everything makes sense
### 3.1 Data shape
### 3.2 Descriptive stats of train data
### 3.3 Variables types

## 4. Initial data prep
- First run of data prep, includes subsetting columns, shuffling and a first run training set
### 4.1 Test Subsetting (subsetting columns of the data)
### 4.2 Shuffle Data
### 4.4 First run testing data

## 5. Linear Regression
- Fit a linear regression
- Issues so far: 
    - not all data was int/float (see 4.1 for subsetting out non-int and float objects)
    - NaN values

# Considerations
- Should we add the transaction date from the train labels to the train data?

In [3]:
"""
1. Packages
All the packages we'll need for the various algorithms we can use
"""

# Global imports
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

np.random.seed(0)

"""
1.1 Decision Trees
Source: Week 4 Notebook


"""

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# For producing decision tree diagrams.
from IPython.core.display import Image, display
from sklearn.externals.six import StringIO
import pydot

"""
1.2 Linear Regression
Source: Week 6 Notebook


"""

import seaborn as sns
from numpy.linalg import inv
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

np.set_printoptions(precision=4, suppress=True)

"""
1.3 Neural Network
Source: Week 7 Notebook and Week 10 notebook

- Jason recommends we use Keras, but I have included the Week 7 imports as well


from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

import theano 
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
print(theano.config.device) # We're using CPUs (for now)
print(theano.config.floatX) # Should be 64 bit for CPUs

"""

import keras
keras.__version__

"""
1.4 SVM
Source: Week 10 Notebook

from sklearn.linear_model import LinearRegression
"""

from scipy import stats
import pylab as pl

sns.set()

Using TensorFlow backend.


In [5]:
"""
2. Import data
Import the Zillow Home Value Prediction data from the Kaggle Competition

Try to run this a few times as possible, the data set takes a little while to import
"""

# Import the data dictionary for a summary of the features we have
data_dictionary = pd.read_csv('..\Data\zillow_data_dictionary\zillow_data_dictionary.csv', sep=',')
print("Data dictionary done")

# Import the 2016 data
data_2016 = pd.read_csv('..\Data\properties_2016\properties_2016.csv', sep=',', engine='python')
print("Data 2016 done")

# Import 2016 labels
train_2016 = pd.read_csv('../Data/train_2016_v2/train_2016_v2.csv', sep=",", engine="python")
print("Train 2016 done")

# Check the shape
print('data shape:', data_2016.shape)

Data dictionary done
Data 2016 done
Train 2016 done
data shape: (2985217, 58)


In [21]:
"""
2.1 Assign parcelid to dataframe indicies and subset data
"""

# Only run these once
"""
data_2016 = data_2016.set_index('parcelid') # can only run once, function removes 'parcelid' from the df
train_data_2016 = data_2016.loc[train_2016.parcelid] # also only run once to avoid unnecesary processing
train_2016 = train_2016.set_index('parcelid') # also only run once
"""

print(data_2016.airconditioningtypeid.head(5))
print(train_2016.head(5))
print(train_data_2016.airconditioningtypeid.head(5))

parcelid
10754147   NaN
10759547   NaN
10843547   NaN
10859147   NaN
10879947   NaN
Name: airconditioningtypeid, dtype: float64
          logerror transactiondate
parcelid                          
11016594    0.0276      2016-01-01
14366692   -0.1684      2016-01-01
12098116   -0.0040      2016-01-01
12643413    0.0218      2016-01-02
14432541   -0.0050      2016-01-02
parcelid
11016594    1.0
14366692    NaN
12098116    1.0
12643413    1.0
14432541    NaN
Name: airconditioningtypeid, dtype: float64


In [13]:
# 3. Check the data

# 3.1 Data shape
# Check the shape of the train data
print("Train data shape:", train_2016.shape, "Subsetted train data shape:", train_data_2016.shape)

# 3.2 Descriptive stats of train data
# Descriptive statistics of train data
train_data_2016.describe()

Train data shape: (90275, 3) Subsetted train data shape: (90275, 57)


Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,yardbuildingsqft26,yearbuilt,numberofstories,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock
count,28781.0,261.0,43.0,90275.0,90275.0,16.0,57364.0,89093.0,658.0,6856.0,...,95.0,89519.0,20570.0,89895.0,90274.0,90275.0,90274.0,90269.0,1783.0,89670.0
mean,1.816372,7.229885,713.581395,2.279474,3.031869,4.0,5.565407,2.309216,66.0,1347.974037,...,311.694737,1968.53287,1.440739,180093.4,457672.6,2015.0,278335.3,5983.975927,13.402692,60491510000000.0
std,2.974168,2.716196,437.434198,1.004271,1.156436,0.0,1.900602,0.976172,0.0,652.399026,...,346.35485,23.763475,0.544498,209129.9,554884.4,0.0,400495.5,6838.876956,2.715966,204660500000.0
min,1.0,2.0,100.0,0.0,0.0,4.0,1.0,1.0,66.0,44.0,...,18.0,1885.0,1.0,100.0,22.0,2015.0,22.0,49.08,6.0,60371010000000.0
25%,1.0,7.0,407.5,2.0,2.0,4.0,4.0,2.0,66.0,938.0,...,100.0,1953.0,1.0,81245.0,199023.2,2015.0,82228.0,2872.83,13.0,60373200000000.0
50%,1.0,7.0,616.0,2.0,3.0,4.0,7.0,2.0,66.0,1244.0,...,159.0,1970.0,1.0,132000.0,342872.0,2015.0,192970.0,4542.75,14.0,60376150000000.0
75%,1.0,7.0,872.0,3.0,4.0,4.0,7.0,3.0,66.0,1614.0,...,361.0,1987.0,2.0,210534.5,540589.0,2015.0,345419.5,6901.09,15.0,60590420000000.0
max,13.0,21.0,1555.0,20.0,16.0,4.0,12.0,20.0,66.0,7625.0,...,1366.0,2015.0,4.0,9948100.0,27750000.0,2015.0,24500000.0,321936.09,99.0,61110090000000.0


In [35]:
# 3.3 Variables types
# Print the type of each variable

print("ALL DATA TYPES")
print(train_data_2016.dtypes)

ALL DATA TYPES
airconditioningtypeid           float64
architecturalstyletypeid        float64
basementsqft                    float64
bathroomcnt                     float64
bedroomcnt                      float64
buildingclasstypeid             float64
buildingqualitytypeid           float64
calculatedbathnbr               float64
decktypeid                      float64
finishedfloor1squarefeet        float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
finishedsquarefeet13            float64
finishedsquarefeet15            float64
finishedsquarefeet50            float64
finishedsquarefeet6             float64
fips                            float64
fireplacecnt                    float64
fullbathcnt                     float64
garagecarcnt                    float64
garagetotalsqft                 float64
hashottuborspa                   object
heatingorsystemtypeid           float64
latitude                        float64
longitude                

In [36]:
# 4. Prep the data

# 4.1 Test Subsetting
# Testing - keep only int or float columns, to just initially test algorithms before doing any additional data manipulation
temp_data = train_data_2016.select_dtypes(['float64','int64'])

# Check Variable types
print("\nINT AND FLOAT ONLY DATA")
print(temp_data.dtypes)

In [None]:
# 4.2 Shuffle Data
# Shuffle the data
# Shuffling the subsetted 2016 training data
np_data_2016, np_train_2016 = np.asarray(temp_data), np.asarray(train_2016['logerror'])
shuffle = np.random.permutation(np.arange(np_data_2016.shape[0]))
np_data_2016, np_train_2016 = np_data_2016[shuffle], np_train_2016[shuffle]

In [42]:
# 4.4 First run testing data
mini_train_data_2016, mini_train_labels_2016 = np_data_2016[:10000], np_train_2016[:10000]

In [43]:
# 5. Linear Regression
linr =  LinearRegression()
linr.fit(mini_train_data_2016, mini_train_labels_2016)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').