In [1]:
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import os
%matplotlib inline 

In [2]:
HOUSING_PATH = os.path.join("zestaw danych","mieszkania")
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)

In [3]:
df = load_housing_data()

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.3, random_state = 42)

In [10]:
housing = train_set.drop('median_house_value', axis=1)
housing_labels = train_set['median_house_value'].copy()
# pomocnicze ramki
housing_num = housing.select_dtypes(include=[np.number])
housing_cat = housing.select_dtypes(include=[np.object])

In [12]:
housing_cat.head()

Unnamed: 0,ocean_proximity
7061,<1H OCEAN
14689,NEAR OCEAN
17323,NEAR OCEAN
10056,INLAND
15750,NEAR BAY


### Niestandardowe transformacje 

np. czyszczenie danych lub łączenie atrybutów

Transformacje możesz wykonywać bezpośrednio na danych ale ... 

Możesz też napisać je zgodnie z sklearn. Aby to uczynić stwórz klasę z trzema metodami: 

`fit()` zwracająca `self`

`transform()` 

`fit_transform()`




In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3,4,5,6 

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        Pokoje_na_rodzine = X[:, rooms_ix]/X[:, household_ix]
        Populacja_na_rodzine = X[:, population_ix] / X[:, household_ix] 
        if self.add_bedrooms_per_room:
            Sypialnie_na_pokoje = X[:, bedrooms_ix] / X[:, rooms_ix] 
            return np.c_[X, Pokoje_na_rodzine, Populacja_na_rodzine, Sypialnie_na_pokoje]
        else:
            return np.c_[X, Pokoje_na_rodzine, Populacja_na_rodzine]

In [None]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
housing_extra_attribs

### W jaki sposób skalują dane numeryczne metody MinMaxScaler oraz StandardScaler 

### Potoki transformacji

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])

housing_num_tr_pip = num_pipeline.fit_transform(housing_num)

housing_num_tr_pip



array([[ 0.78093406, -0.80568191,  0.50935748, ...,  0.18106017,
        -0.01082519, -0.80919934],
       [ 1.24526986, -1.33947268, -0.67987313, ..., -0.42262953,
        -0.08931585,  0.5409245 ],
       [-0.27755183, -0.49664515, -0.36274497, ...,  0.07312833,
        -0.04480037, -0.63257554],
       ...,
       [ 0.60119118, -0.75885816,  0.58863952, ..., -0.59156984,
         0.01720102,  0.99001519],
       [-1.18625198,  0.90338501, -1.07628333, ...,  0.39014889,
         0.00482125, -0.78932602],
       [-1.41592345,  0.99235014,  1.85715216, ..., -0.82965604,
        -0.0816717 ,  1.68141644]])

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin): 
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names 
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [16]:
from sklearn.preprocessing import LabelBinarizer
num_attribs = list(housing_num) 
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)), 
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('cat_encoder', LabelBinarizer()),
])



In [17]:
num_pipeline

Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(attribute_names=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'])), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)), ('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])

In [18]:
cat_pipeline

Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(attribute_names=['ocean_proximity'])), ('cat_encoder', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False))])

In [19]:
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[ 
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [20]:
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape

TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [21]:
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        self.enc = LabelBinarizer(sparse_output=self.sparse_output)
        self.enc.fit(X)
        return self
    def transform(self, X, y=None):
        return self.enc.transform(X)

In [22]:
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)), 
('imputer', Imputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('cat_encoder', CustomLabelBinarizer()),
])



In [23]:
full_pipeline = FeatureUnion(transformer_list=[ 
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [24]:
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape

(14448, 16)

## MODEL

In [25]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [26]:
some_data = housing.iloc[:5]
some_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
7061,-118.02,33.93,35.0,2400.0,398.0,1218.0,408.0,4.1312,<1H OCEAN
14689,-117.09,32.79,20.0,2183.0,534.0,999.0,496.0,2.8631,NEAR OCEAN
17323,-120.14,34.59,24.0,1601.0,282.0,731.0,285.0,4.2026,NEAR OCEAN
10056,-121.0,39.26,14.0,810.0,151.0,302.0,138.0,3.1094,INLAND
15750,-122.45,37.77,52.0,3188.0,708.0,1526.0,664.0,3.3068,NEAR BAY


In [27]:
some_labels = housing_labels.iloc[:5]
some_labels

7061     193800.0
14689    169700.0
17323    259800.0
10056    136100.0
15750    500001.0
Name: median_house_value, dtype: float64

In [28]:
some_data_prepared = full_pipeline.transform(some_data)

In [29]:
print("Prognozy:", lin_reg.predict(some_data_prepared))

Prognozy: [221608.71694073 196160.20621852 261964.70699839  70926.09270578
 256891.00870896]


In [30]:
print("Rzeczywistość:", list(some_labels))

Rzeczywistość: [193800.0, 169700.0, 259800.0, 136100.0, 500001.0]


In [31]:
from sklearn.metrics import mean_squared_error

In [32]:
housing_predictions = lin_reg.predict(housing_prepared)

In [33]:
lin_mse = mean_squared_error(housing_labels, housing_predictions)

In [34]:
lin_rmse = np.sqrt(lin_mse)

In [35]:
lin_rmse

67881.61208813675

## A inny model ? 

In [36]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [37]:
housing_predictions = tree_reg.predict(housing_prepared)

In [38]:
tree_mse = mean_squared_error(housing_labels, housing_predictions)

In [39]:
tree_rmse = np.sqrt(tree_mse)

In [40]:
tree_rmse

0.0

In [41]:
from sklearn.model_selection import cross_val_score

In [42]:
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)

In [43]:
tree_rmse_scores = np.sqrt(-scores)
tree_rmse_scores

array([74272.35182664, 71147.67067251, 72238.45588147, 74194.89450467,
       69650.85895072, 71777.82226548, 68047.64642193, 69556.10837261,
       69189.19153987, 70638.81850328])

In [44]:
def wyniki(scores):
    print("Wyniki", scores)
    print("Średnio", scores.mean())
    print("Odchylenie std", scores.std())
    

In [45]:
wyniki(tree_rmse_scores)

Wyniki [74272.35182664 71147.67067251 72238.45588147 74194.89450467
 69650.85895072 71777.82226548 68047.64642193 69556.10837261
 69189.19153987 70638.81850328]
Średnio 71071.38189391863
Odchylenie std 1976.2627938964577


In [46]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)

In [47]:
lin_rmse_scores = np.sqrt(-lin_scores)

In [48]:
wyniki(lin_rmse_scores)

Wyniki [70880.18689495 67409.11219521 66718.50669456 66644.24144927
 67870.5024128  65300.94909237 65376.95506465 68531.90978125
 74995.3725312  68145.24576002]
Średnio 68187.29818762788
Odchylenie std 2742.1779032896425


## Sprawdz model lasów losowych

In [49]:
from sklearn.ensemble import RandomForestRegressor

## Zapisz model 

In [None]:
from sklearn.externals import joblib
# zapisz
joblib.dump(my_model, "mój_model.pkl")
# zaladuj
my_model_loaded = joblib.load("mój_model.pkl")


## Regulacja modelu

In [50]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators':[2,10,30], 'max_features':[2,4,6,8]},
    {'bootstrap':[False], 'n_estimators':[2,30], 'max_features':[2,3,4] }
]

In [51]:
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared,housing_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [2, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [2, 30], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [52]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 4, 'n_estimators': 30}

In [53]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [54]:
grid_search.cv_results_



{'mean_fit_time': array([0.0407887 , 0.18483953, 0.5677866 , 0.06170063, 0.30105782,
        0.89692249, 0.08468666, 0.42075229, 1.25261974, 0.10929818,
        0.53992395, 1.61706657, 0.06133695, 0.88517041, 0.0803791 ,
        1.17366314, 0.09969711, 1.47763076]),
 'std_fit_time': array([0.00399102, 0.00175882, 0.02160364, 0.0009068 , 0.00458634,
        0.00482843, 0.00188308, 0.00247763, 0.01004914, 0.00290248,
        0.00542832, 0.01629006, 0.00147333, 0.01587487, 0.002213  ,
        0.01225558, 0.00221822, 0.00627976]),
 'mean_score_time': array([0.002738  , 0.00883498, 0.02401724, 0.00261669, 0.00886836,
        0.0234705 , 0.00265536, 0.00871949, 0.02362342, 0.00237193,
        0.00870605, 0.02428913, 0.00275097, 0.02842999, 0.00262904,
        0.02768245, 0.00273423, 0.02800117]),
 'std_score_time': array([5.95106165e-04, 2.49263339e-04, 1.11103703e-03, 7.60283018e-05,
        1.60507779e-04, 1.37701672e-04, 6.10322877e-05, 3.30663978e-04,
        1.14680615e-04, 3.91295954e-

In [55]:
grid_search.best_estimator_.feature_importances_

array([8.91669976e-02, 7.88418852e-02, 4.05563008e-02, 2.01828103e-02,
       1.92590494e-02, 1.98614345e-02, 1.86428348e-02, 2.78774608e-01,
       4.98816349e-02, 1.06140351e-01, 1.14732521e-01, 2.12136284e-02,
       1.32189423e-01, 1.95286580e-04, 5.04312082e-03, 5.31811417e-03])