In [23]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline


In [24]:


from sklearn.base import TransformerMixin

class CategoricalTransformer(TransformerMixin):
    "Converts a set of columns in a DataFrame to categoricals"
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        'Records the categorical information'
        self.cat_map_ = {col: X[col].astype('category').cat
                         for col in self.columns}
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for col in self.columns:
            X[col] = pd.Categorical(X[col],
            categories=self.cat_map_[col].categories,
            ordered=self.cat_map_[col].ordered)
        return X

    def inverse_transform(self, trn, y=None):
        trn = trn.copy()
        trn[self.columns] = trn[self.columns].apply(lambda x: x.astype(object))
        return trn
    
class DummyEncoder(TransformerMixin):

    def __init__(self, columns= None, drop_first=False):
        self.columns = columns
        self.drop_first = drop_first

        self.columns_ = None
        self.cat_columns_ = None  # type: pd.Index
        self.non_cat_columns_ = None  # type: pd.Index
        self.categories_map_ = None
        self.ordered_map_ = None
        self.cat_blocks_ = None

    def fit(self, X, y=None):
        self.columns_ = X.columns
        if self.columns is None:
            self.cat_columns_ = X.select_dtypes(include=['category']).columns
        else:
            self.cat_columns_ = self.columns
        self.non_cat_columns_ = X.columns.drop(self.cat_columns_)

        self.categories_map_ = {col: X[col].cat.categories
                                for col in self.cat_columns_}
        self.ordered_map_ = {col: X[col].cat.ordered
                             for col in self.cat_columns_}

        left = len(self.non_cat_columns_)
        self.cat_blocks_ = {}
        for col in self.cat_columns_:
            right = left + len(X[col].cat.categories)
            self.cat_blocks_[col] = slice(left, right)
            left = right
        return self

    def transform(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            return pd.get_dummies(X, drop_first=self.drop_first)
        elif isinstance(X, dd.DataFrame):
            return X.map_partitions(pd.get_dummies, drop_first=self.drop_first)
        else:
            raise TypeError

    def inverse_transform(self, X):
        print len(self.non_cat_columns_)
        non_cat = pd.DataFrame(X.iloc[:,:len(self.non_cat_columns_)], columns=self.non_cat_columns_)
        cats = []
        for col in self.cat_columns_:
            slice_ = self.cat_blocks_[col]
            categories = self.categories_map_[col]
            ordered = self.ordered_map_[col]
            print slice

            codes = X.iloc[:, slice_].as_matrix().argmax(1)
            series = pd.Series(pd.Categorical.from_codes(
                codes, categories, ordered=ordered
            ), name=col)
            cats.append(series)
        df = pd.concat([non_cat] + cats, axis=1)[self.columns_]
        return df


In [25]:
kcdata = pd.read_csv("dataset/kc_house_data_10.csv")
kcdata

Unnamed: 0,price,sqft_living,zipcode
0,221900.0,1180,98178
1,538000.0,2570,98178
2,180000.0,770,98178
3,604000.0,1960,98136
4,510000.0,1680,98136
5,1230000.0,5420,98136
6,257500.0,1715,98136
7,291850.0,1060,98198
8,229500.0,1780,98198
9,257500.0,1780,98198


In [26]:
y = kcdata['price']
X = kcdata.drop('price', axis=1)


In [27]:
columns = ['zipcode']
pipe = make_pipeline(CategoricalTransformer(columns), DummyEncoder())
pipe.fit(X, y)
t = pipe.transform(X)
t.head(5)


Unnamed: 0,sqft_living,zipcode_98136,zipcode_98178,zipcode_98198
0,1180,0,1,0
1,2570,0,1,0
2,770,0,1,0
3,1960,1,0,0
4,1680,1,0,0


In [28]:
tt = pipe.inverse_transform(t)
tt.head(5)

1
<type 'slice'>


Unnamed: 0,sqft_living,zipcode
0,1180,98178
1,2570,98178
2,770,98178
3,1960,98136
4,1680,98136


In [29]:
pipe.transform(X.head(1))

Unnamed: 0,sqft_living,zipcode_98136,zipcode_98178,zipcode_98198
0,1180,0,1,0


In [30]:
kcdata2 = pd.read_csv("dataset/kc_house_data_2.csv")
kcdata2


Unnamed: 0,price,sqft_living,zipcode
0,221900.0,1180,99999
1,538000.0,2570,99999
2,180000.0,770,99999
3,604000.0,1960,98136
4,510000.0,1680,98136
5,1230000.0,5420,98136
6,257500.0,1715,98136
7,291850.0,1060,98198
8,229500.0,1780,98198
9,257500.0,1780,98198


In [31]:
print len(kcdata2)
y = kcdata2['price']
X = kcdata2.drop('price', axis=1)
res = pipe.transform(X)
res


11


Unnamed: 0,sqft_living,zipcode_98136,zipcode_98178,zipcode_98198
0,1180,0,0,0
1,2570,0,0,0
2,770,0,0,0
3,1960,1,0,0
4,1680,1,0,0
5,5420,1,0,0
6,1715,1,0,0
7,1060,0,0,1
8,1780,0,0,1
9,1780,0,0,1


In [37]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
sales = pd.read_csv('dataset/kc_house_data.csv', dtype= dtype_dict)
sales = sales.sort_values(by=['sqft_living', 'price'])
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
19452,3980300371,20140926T000000,142000.0,0.0,0.0,290.0,20875,1,0,0,...,1,290,0,1963,0,98024,47.5308,-121.888,1620.0,22850.0
15381,2856101479,20140701T000000,276000.0,1.0,0.75,370.0,1801,1,0,0,...,5,370,0,1923,0,98117,47.6778,-122.389,1340.0,5000.0
860,1723049033,20140620T000000,245000.0,1.0,0.75,380.0,15000,1,0,0,...,5,380,0,1963,0,98168,47.481,-122.323,1170.0,15000.0
18379,1222029077,20141029T000000,265000.0,0.0,0.75,384.0,213444,1,0,0,...,4,384,0,2003,0,98070,47.4177,-122.491,1920.0,224341.0
4868,6896300380,20141002T000000,228000.0,0.0,1.0,390.0,5900,1,0,0,...,4,390,0,1953,0,98118,47.526,-122.261,2170.0,6000.0


In [38]:
y = sales['price']
X = sales[["sqft_living", "zipcode"]]
X.head(5)

Unnamed: 0,sqft_living,zipcode
19452,290.0,98024
15381,370.0,98117
860,380.0,98168
18379,384.0,98070
4868,390.0,98118


In [43]:
pipe = make_pipeline(CategoricalTransformer(columns), DummyEncoder(), Lasso())

In [44]:
pipe.fit(X,y)



Pipeline(memory=None,
     steps=[('categoricaltransformer', <__main__.CategoricalTransformer object at 0x000000000B50DA20>), ('dummyencoder', <__main__.DummyEncoder object at 0x000000000B50D9E8>), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [45]:
pipe.score(X,y)

0.7300863219888627