In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score, adjusted_rand_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
data = pd.read_csv("housing.csv")
data['rooms_per_household'] = data['total_rooms']/data['households']
data['population_per_household'] = data['population']/data['households']
data['bedrooms_per_rooms'] = data['total_bedrooms']/data['total_rooms']

In [4]:
data.drop(['latitude', 'longitude'], axis=1, inplace=True)

In [6]:
features = data.drop("median_house_value", axis=1)
target = data['median_house_value']

In [11]:
list(features.columns[:6]) + list(features.columns[7:])

['housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'population_per_household',
 'bedrooms_per_rooms']

In [12]:
len(features.columns)

10

In [14]:
features.columns[6]

'ocean_proximity'

In [7]:
num_attr = np.array(list(features.columns[:6]) + list(features.columns[7:]))
cat_attr = [features.columns[6]]

In [8]:
num_pipe = Pipeline([
    ('simple', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])
pipe = ColumnTransformer([
    ('num', num_pipe, num_attr),
    ('cat', OneHotEncoder(), cat_attr)  # ['ocean']
])

In [56]:
num_attr

array(['housing_median_age', 'total_rooms', 'total_bedrooms',
       'population', 'households', 'median_income', 'rooms_per_household',
       'population_per_household', 'bedrooms_per_rooms'], dtype='<U24')

In [57]:
num_pipe.fit_transform(features.loc[:, num_attr])

array([[ 0.98214266, -0.8048191 , -0.97247648, ...,  0.62855945,
        -0.04959654, -1.14993031],
       [-0.60701891,  2.0458901 ,  1.35714343, ...,  0.32704136,
        -0.09251223, -0.99038135],
       [ 1.85618152, -0.53574589, -0.82702426, ...,  1.15562047,
        -0.02584253, -1.44586501],
       ...,
       [-0.92485123, -0.17499526, -0.12360781, ..., -0.09031802,
        -0.0717345 ,  0.03870567],
       [-0.84539315, -0.35559977, -0.30482697, ..., -0.04021111,
        -0.09122515,  0.12050112],
       [-1.00430931,  0.06840827,  0.18875678, ..., -0.07044252,
        -0.04368215,  0.14290124]])

In [58]:
features_trans = pipe.fit_transform(features)

In [59]:
features_trans.shape

(20640, 14)

In [60]:
features.shape

(20640, 10)

In [9]:
pipe = ColumnTransformer([
    ('num', num_pipe, ['housing_median_age', 'total_rooms', 'total_bedrooms',
       'population', 'households', 'median_income', 'rooms_per_household',
       'population_per_household', 'bedrooms_per_rooms']),
    ('cat', OneHotEncoder(), ['ocean_proximity'])
])
final_feat = pipe.fit_transform(features)

X_train, X_test, Y_train, Y_test = train_test_split(final_feat, target, random_state=100)


model = LinearRegression()
model.fit(X_train, Y_train)
pred = model.predict(X_test)
mean_absolute_error(Y_test, pred)


48850.31800601671

In [81]:
target.max() - target.min()

485002.0

In [82]:
r2_score(Y_test, pred)

0.6590061766540446

In [71]:
X_train.shape

(15480, 14)

In [76]:
X_train[:5]

array([[-0.84539315,  0.41815761, -0.1498369 ,  0.07110631, -0.13219192,
         1.60773138,  0.99955196,  0.02729353, -1.37514572,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.34647803, -0.33863945, -0.3787453 , -0.47462052, -0.29959074,
        -0.92619934, -0.20279065, -0.07357756, -0.23705756,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ],
       [ 1.85618152, -0.40648073, -0.27382895, -0.51965623, -0.2838971 ,
        -0.8136071 , -0.38634205, -0.08954441,  0.49119329,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ],
       [-0.2891866 , -0.45782008, -0.3405939 , -0.68567022, -0.31528438,
         0.59703311, -0.44853992, -0.13077995,  0.48085408,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.10810379, -0.17178655, -0.22375524, -0.43841534, -0.30482196,
         0.2176199 ,  0.19174325, -0.06211056, -0.29478884,  0.        ,
         0.        ,  0.  

In [83]:
model

LinearRegression()

In [20]:
import pickle
with open("linmodel.pkl", 'wb') as fp:
    pickle.dump(model, fp)

In [16]:
import pickle
with open("pipe.pkl", 'wb') as fp:
    pickle.dump(pipe, fp)

In [11]:
with open("linmodel.pkl", 'rb') as fp:
    print(fp.read())

b'\x80\x04\x95\xa0\x02\x00\x00\x00\x00\x00\x00\x8c\x1asklearn.linear_model._base\x94\x8c\x10LinearRegression\x94\x93\x94)\x81\x94}\x94(\x8c\rfit_intercept\x94\x88\x8c\tnormalize\x94\x89\x8c\x06copy_X\x94\x88\x8c\x06n_jobs\x94N\x8c\x08positive\x94\x89\x8c\x0en_features_in_\x94K\x0e\x8c\x05coef_\x94\x8c\x15numpy.core.multiarray\x94\x8c\x0c_reconstruct\x94\x93\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94K\x00\x85\x94C\x01b\x94\x87\x94R\x94(K\x01K\x0e\x85\x94h\x0f\x8c\x05dtype\x94\x93\x94\x8c\x02f8\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89Cp_5\x04\x8b\xaa;\xcd@v\xe0\xe0\xd9\xd2\xec\xb0@\xf5>-\r\x7f\xcd\x9e\xc0Sj\r\x176.\xe5\xc0FM\xdc\xffq\xd6\xe6@DV\x87#\x7f\x96\xf3@\x01x\x16X\x10"\xb3@\x969\xe2\x91\x01\xf6\x80\xc0F\\M\x8d\x1d.\xce@\xcf\xeb\xcb%h\xdb\xd9\xc0\xd9LN\x02O\xea\xf5\xc0\xdcT\xcc}\x99\x16\x02A\xb6\xa33~\xc2l\xd3\xc0&\xc8S\x82\xca\x86\xc7\xc0\x94t\x94b\x8c\t_residues\x94h\x0eh\x11K\x00\x85\x94h\x13\x87\x94R\x94(K\x01K\x00\x8

In [21]:
with open("linmodel.pkl", 'rb') as fp:
    print(pickle.load(fp))

LinearRegression()


In [88]:
import joblib

In [89]:
print(dir(joblib))

['Logger', 'MemorizedResult', 'Memory', 'Parallel', 'PrintTime', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_deprecated_my_exceptions', '_memmapping_reducer', '_multiprocessing_helpers', '_parallel_backends', '_store_backends', 'backports', 'compressor', 'cpu_count', 'delayed', 'disk', 'dump', 'effective_n_jobs', 'executor', 'externals', 'func_inspect', 'hash', 'hashing', 'load', 'logger', 'memory', 'my_exceptions', 'numpy_pickle', 'numpy_pickle_compat', 'numpy_pickle_utils', 'os', 'parallel', 'parallel_backend', 'pool', 'register_compressor', 'register_parallel_backend', 'register_store_backend', 'wrap_non_picklable_objects']


In [90]:
help(joblib)

Help on package joblib:

NAME
    joblib

DESCRIPTION
    Joblib is a set of tools to provide **lightweight pipelining in
    Python**. In particular:
    
    1. transparent disk-caching of functions and lazy re-evaluation
       (memoize pattern)
    
    2. easy simple parallel computing
    
    Joblib is optimized to be **fast** and **robust** on large
    data in particular and has specific optimizations for `numpy` arrays. It is
    **BSD-licensed**.
    
    
        **Documentation:**       https://joblib.readthedocs.io
    
        **Download:**            https://pypi.python.org/pypi/joblib#downloads
    
        **Source code:**         https://github.com/joblib/joblib
    
        **Report issues:**       https://github.com/joblib/joblib/issues
    
    
    Vision
    --------
    
    The vision is to provide tools to easily achieve better performance and
    reproducibility when working with long running jobs.
    
     *  **Avoid computing the same thing twice**: code 

In [91]:
with open("linmodel.pkl", 'wb') as fp:
    joblib.dump(model, fp)

In [92]:
with open("linmodel.pkl", 'rb') as fp:
    print(fp.read())

b'\x80\x04\x95<\x01\x00\x00\x00\x00\x00\x00\x8c\x1asklearn.linear_model._base\x94\x8c\x10LinearRegression\x94\x93\x94)\x81\x94}\x94(\x8c\rfit_intercept\x94\x88\x8c\tnormalize\x94\x89\x8c\x06copy_X\x94\x88\x8c\x06n_jobs\x94N\x8c\x08positive\x94\x89\x8c\x0en_features_in_\x94K\x0e\x8c\x05coef_\x94\x8c\x13joblib.numpy_pickle\x94\x8c\x11NumpyArrayWrapper\x94\x93\x94)\x81\x94}\x94(\x8c\x08subclass\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94\x8c\x05shape\x94K\x0e\x85\x94\x8c\x05order\x94\x8c\x01C\x94\x8c\x05dtype\x94h\x12h\x19\x93\x94\x8c\x02f8\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x8c\nallow_mmap\x94\x88ub_5\x04\x8b\xaa;\xcd@v\xe0\xe0\xd9\xd2\xec\xb0@\xf5>-\r\x7f\xcd\x9e\xc0Sj\r\x176.\xe5\xc0FM\xdc\xffq\xd6\xe6@DV\x87#\x7f\x96\xf3@\x01x\x16X\x10"\xb3@\x969\xe2\x91\x01\xf6\x80\xc0F\\M\x8d\x1d.\xce@\xcf\xeb\xcb%h\xdb\xd9\xc0\xd9LN\x02O\xea\xf5\xc0\xdcT\xcc}\x99\x16\x02A\xb6\xa33~\xc2l\xd3\xc0&\xc8S\x82\xca\x86\xc7\xc0\x95+\x00\x00\x00\x

In [93]:
with open("linmodel.pkl", 'rb') as fp:
    print(joblib.load(fp))

LinearRegression()


In [15]:
with open("pipe.pkl", 'wb') as fp:
    joblib.dump(pipe, fp)

NameError: name 'joblib' is not defined

In [14]:
pipe = ColumnTransformer([
    ('num', num_pipe, ['housing_median_age', 'total_rooms', 'total_bedrooms',
       'population', 'households', 'median_income', 'rooms_per_household',
       'population_per_household', 'bedrooms_per_rooms']),
    ('cat', OneHotEncoder(), ['ocean_proximity'])
])

X_train, X_test, Y_train, Y_test = train_test_split(features, target, random_state=100)

X_train = pipe.fit_transform(X_train)

model = LinearRegression()
model.fit(X_train, Y_train)

X_test = pipe.transform(X_test)
pred = model.predict(X_test)
mean_absolute_error(Y_test, pred)

48850.20770609444

In [19]:
data.iloc[0]

housing_median_age              41.0
total_rooms                    880.0
total_bedrooms                 129.0
population                     322.0
households                     126.0
median_income                 8.3252
median_house_value          452600.0
ocean_proximity             NEAR BAY
rooms_per_household         6.984127
population_per_household    2.555556
bedrooms_per_rooms          0.146591
Name: 0, dtype: object

In [22]:
data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [23]:
features[:5]

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_rooms
0,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,6.984127,2.555556,0.146591
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,6.238137,2.109842,0.155797
2,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,8.288136,2.80226,0.129516
3,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,5.817352,2.547945,0.184458
4,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,6.281853,2.181467,0.172096


In [28]:
values = [[42, 130, 130, 400, 130, 9, 'NEAR BAY', 1.0, 3.076923076923077, 1.0]]
df = pd.DataFrame(values, columns=features.columns)

In [30]:
model.predict(pipe.transform(df))

array([656673.41618051])

In [31]:
target

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64

In [32]:
df

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household,bedrooms_per_rooms
0,42,130,130,400,130,9,NEAR BAY,1.0,3.076923,1.0
