In [12]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Standard library
import pickle

# Necessary imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
# NOTE(review): recent seaborn releases no longer re-export `plt`; on those
# versions use `import matplotlib.pyplot as plt` instead.
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

%matplotlib inline

In [64]:
# Open the joined data (a pickled DataFrame under ../data/processed).
# `pickle` is imported in the notebook's import cell at the top.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were generated by this project.
with open("../data/processed/0202_all_data.pkl", 'rb') as picklefile:
    df = pickle.load(picklefile)

df.head()

Unnamed: 0,energy_star_score,building_floor_area,parking_floor_area,total_ghg_emissions,site_eui,weather_normalized_site_eui,source_eui,weather_normalized_source_eui,water_use,property_id,USE1_DESC,EMV_TOTAL,TAX_CAPAC,TOTAL_TAX,FIN_SQ_FT,YEAR_BUILT,est_val_per_sqft
0,36,62400,0,971,115,118,251,252,708,2302924430394,Commercial,10115000.0,201550.0,404610.0,0,2012,162.099359
2,76,60640,62000,268,50,54,77,81,674,2202824340118,Commercial,22000.0,330.0,4073.0,0,1959,0.362797
3,89,60528,0,504,74,77,136,140,414,2402924240052,Cooperative,1542000.0,9252.0,16054.0,0,1960,25.475813
4,89,60528,0,504,74,77,136,140,414,2402924240176,Cooperative,55500.0,333.0,578.0,0,1962,0.916931
5,89,60528,0,504,74,77,136,140,414,2402924240177,Cooperative,55500.0,333.0,578.0,0,1962,0.916931


In [65]:
# Summarize df: index range, column dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 390 entries, 0 to 743
Data columns (total 17 columns):
energy_star_score                390 non-null int64
building_floor_area              390 non-null int64
parking_floor_area               390 non-null int64
total_ghg_emissions              390 non-null int64
site_eui                         390 non-null int64
weather_normalized_site_eui      390 non-null int64
source_eui                       390 non-null int64
weather_normalized_source_eui    390 non-null int64
water_use                        390 non-null int64
property_id                      390 non-null int64
USE1_DESC                        390 non-null object
EMV_TOTAL                        390 non-null float64
TAX_CAPAC                        390 non-null float64
TOTAL_TAX                        390 non-null float64
FIN_SQ_FT                        390 non-null int64
YEAR_BUILT                       390 non-null int64
est_val_per_sqft                 390 non-null float64
dt

In [66]:
# Frequency of each property-use category (the column dummy-coded below).
df['USE1_DESC'].value_counts()

Condo Garage/Miscellaneous    194
Commercial                    159
Cooperative                    25
Other                           8
Industrial                      4
Name: USE1_DESC, dtype: int64

In [67]:
# Dummy-code USE1_DESC with patsy. The resulting frame has an Intercept
# column plus one indicator column per category except the reference level
# (Commercial), which is represented by all indicators being 0.
X = patsy.dmatrix(
    'USE1_DESC',
    data=df,
    return_type='dataframe',
)
X.head()

Unnamed: 0,Intercept,USE1_DESC[T.Condo Garage/Miscellaneous],USE1_DESC[T.Cooperative],USE1_DESC[T.Industrial],USE1_DESC[T.Other]
0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0
5,1.0,0.0,1.0,0.0,0.0


In [68]:
# Append the dummy-variable matrix X to df as extra columns.

df2 = df.join(X)

# DataFrame.join aligns rows on the index (left join by default). X was built
# from df, so both frames share the same index and the row count is unchanged.
# Note that patsy's constant 'Intercept' column is carried into df2 as well.
df2.head()

Unnamed: 0,energy_star_score,building_floor_area,parking_floor_area,total_ghg_emissions,site_eui,weather_normalized_site_eui,source_eui,weather_normalized_source_eui,water_use,property_id,...,TAX_CAPAC,TOTAL_TAX,FIN_SQ_FT,YEAR_BUILT,est_val_per_sqft,Intercept,USE1_DESC[T.Condo Garage/Miscellaneous],USE1_DESC[T.Cooperative],USE1_DESC[T.Industrial],USE1_DESC[T.Other]
0,36,62400,0,971,115,118,251,252,708,2302924430394,...,201550.0,404610.0,0,2012,162.099359,1.0,0.0,0.0,0.0,0.0
2,76,60640,62000,268,50,54,77,81,674,2202824340118,...,330.0,4073.0,0,1959,0.362797,1.0,0.0,0.0,0.0,0.0
3,89,60528,0,504,74,77,136,140,414,2402924240052,...,9252.0,16054.0,0,1960,25.475813,1.0,0.0,1.0,0.0,0.0
4,89,60528,0,504,74,77,136,140,414,2402924240176,...,333.0,578.0,0,1962,0.916931,1.0,0.0,1.0,0.0,0.0
5,89,60528,0,504,74,77,136,140,414,2402924240177,...,333.0,578.0,0,1962,0.916931,1.0,0.0,1.0,0.0,0.0


In [70]:
# Remove the original categorical column now that its dummy columns are in df2.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer fails because the column
# has already been dropped.
df2 = df2.drop('USE1_DESC', axis=1, errors='ignore')

In [71]:
# Verify the result: USE1_DESC should be gone and the patsy columns present.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 390 entries, 0 to 743
Data columns (total 21 columns):
energy_star_score                          390 non-null int64
building_floor_area                        390 non-null int64
parking_floor_area                         390 non-null int64
total_ghg_emissions                        390 non-null int64
site_eui                                   390 non-null int64
weather_normalized_site_eui                390 non-null int64
source_eui                                 390 non-null int64
weather_normalized_source_eui              390 non-null int64
water_use                                  390 non-null int64
property_id                                390 non-null int64
EMV_TOTAL                                  390 non-null float64
TAX_CAPAC                                  390 non-null float64
TOTAL_TAX                                  390 non-null float64
FIN_SQ_FT                                  390 non-null int64
YEAR_BUILT           

In [74]:
# Persist the dummy-encoded frame alongside the other processed data files.
out_path = '../data/processed/0202_all_data_wdummy.pkl'
with open(out_path, 'wb') as picklefile:
    pickle.dump(df2, picklefile)