In [1]:
from tools import *
# Load the housing dataset. load_housing_data is not defined in this notebook;
# presumably it is provided by the star import from `tools` above — verify.
housing = load_housing_data()

ModuleNotFoundError: No module named 'tools'

In [None]:
import numpy as np

# Toy example: a 4x4 matrix of 0/1 values (one row per sample) and one label per row.
X = np.array([[0, 1, 0, 1],
              [1, 0, 1, 1],
              [0, 1, 0, 1],
              [1, 0, 1, 0]])
y = np.array([0, 1, 0, 1])

# Show every second column (columns 0 and 2).
print(X[:, ::2])

# For each distinct label, sum the columns over the rows carrying that label.
counts = {label: X[y == label].sum(axis=0) for label in np.unique(y)}
print(str(counts))



In [None]:
# Preview the first rows of the loaded frame.
housing.head()

In [None]:
# Print the frame summary produced by DataFrame.info().
housing.info()

In [None]:
# Count how many rows fall into each distinct ocean_proximity value.
housing['ocean_proximity'].value_counts()

In [None]:
# Show the descriptive statistics produced by DataFrame.describe().
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Draw a 50-bin histogram per column on one large figure.
housing.hist(bins=50, figsize=(20, 15))

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; the fixed random_state makes the split repeatable.
train_set, test_set = train_test_split(housing, test_size=.2, random_state=42)

In [None]:
import numpy as np
# Derive an income category: divide median_income by 1.5 and round up.
housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)
# Print the scaled (unrounded) values for inspection.
print(housing['median_income'] / 1.5)
housing['income_cat'].hist()

In [None]:
# Cap the income category at 5: values below 5 are kept, everything else becomes 5.0.
# The result is assigned back to the column rather than using inplace=True on the
# selected Series, because an in-place change on a column selection is not
# guaranteed to reach the parent frame (pandas Copy-on-Write).
housing['income_cat'] = housing['income_cat'].where(housing['income_cat'] < 5, 5.0)
housing['income_cat'].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split keyed on income_cat, seeded so it is repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row numbers, so rows are picked with .iloc.
    # .loc would interpret them as index labels, which only coincides with
    # positions when the frame has the default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# Share of each income category in the full dataset.
housing['income_cat'].value_counts() / len(housing)


In [None]:
# Share of each income category in the stratified training set.
strat_train_set['income_cat'].value_counts() / len(strat_train_set)

In [None]:
# Share of each income category in the stratified test set.
strat_test_set['income_cat'].value_counts() / len(strat_test_set)

In [None]:
# Remove the helper income_cat column from both stratified subsets, in place.
# NOTE: running this cell a second time raises, because the column is already gone.
for set_ in (strat_test_set, strat_train_set):
    set_.drop('income_cat', axis=1, inplace=True)

In [None]:
# From here on `housing` is a copy of the training subset, so exploration
# below does not modify strat_train_set.
housing = strat_train_set.copy()

In [None]:
# Longitude/latitude scatter; the low alpha makes overlapping points show up darker.
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=.1)

In [None]:
# Same geographic scatter, with marker size driven by population / 100 and
# marker colour driven by median_house_value (jet colormap, colorbar shown).
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=.4,
    s=housing['population']/100,
    label='population',
    figsize=(15,10),
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True
)

In [None]:
# Pairwise correlations between the numeric columns. The numeric columns are
# selected explicitly because the frame still carries the non-numeric
# ocean_proximity column: recent pandas releases raise on it in corr(), while
# older ones silently skipped it. select_dtypes gives the same result on both.
corr_matrix = housing.select_dtypes(include='number').corr()
# Rank every numeric column by its correlation with the target.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Plot every pairwise scatter (and per-column distribution) for a handful of columns.
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(18,12))

In [None]:
# Closer look at median_income against median_house_value.
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=.3)

In [None]:
# Add three ratio columns derived from the raw totals to the exploration copy.
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_household'] = housing['population'] / housing['households']

In [None]:
# Recompute the correlations so the derived ratio columns are included. The
# numeric columns are selected explicitly because the frame still carries the
# non-numeric ocean_proximity column: recent pandas releases raise on it in
# corr(), while older ones silently skipped it.
corr_matrix = housing.select_dtypes(include='number').corr()
# Rank every numeric column by its correlation with the target.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Start again from the untouched training subset: `housing` becomes the predictors
# (target column removed) and `housing_labels` a separate copy of the target.
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

In [None]:
# Fill missing total_bedrooms values with the column median.
mediana = housing['total_bedrooms'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected Series: an in-place change on a column selection is not guaranteed
# to reach the parent frame (pandas Copy-on-Write).
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(mediana)

In [None]:
# pandas is used below under the alias `pd`; import it explicitly so this cell
# does not depend on the name leaking in from elsewhere.
import pandas as pd

# scikit-learn 0.20 introduced sklearn.impute.SimpleImputer and later releases
# removed sklearn.preprocessing.Imputer. Binding whichever is available to the
# name `Imputer` keeps the later cells that call Imputer(...) working.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Median imputation works on numeric data only, so the text column is dropped first.
imputer = Imputer(strategy='median')
housing_num = housing.drop('ocean_proximity', axis=1)
imputer.fit(housing_num)

# transform() returns a plain array; rebuild a DataFrame that keeps both the
# column names and the original row index so rows still line up with `housing`.
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# from future_encoders import OrdinalEncoder

# ordinal_encoder = OrdinalEncoder()
# housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

# ordinal_encoder.categories_

In [None]:
from future_encoders import OneHotEncoder

# One-hot encode the single categorical column. OneHotEncoder comes from the local
# future_encoders module; sparse=False presumably requests a dense array — verify.
cat_encoder = OneHotEncoder(sparse=False)
# Double brackets keep a one-column DataFrame rather than a Series.
housing_cat = housing[['ocean_proximity']]
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

print(housing_cat_1hot)
print(cat_encoder.categories_)

In [None]:
from transformer import CombinedAttributesAdder


# Apply the project-local CombinedAttributesAdder to the raw value array.
# add_bedrooms_per_room=False presumably skips the bedrooms_per_room column — verify
# against transformer.py.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
print(housing_extra_attribs)

In [None]:
# Wrap the transformed array in a DataFrame. The two appended column names assume
# the adder added rooms_per_household and population_per_household, in that
# order, after the original columns — verify against transformer.py.
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"])
housing_extra_attribs.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing chain: median imputation, then the project-local attribute
# adder, then standard scaling. Fitted and applied to the numeric columns only.
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr

In [None]:
from transformer import DataFrameSelector

# Column names handled by each branch: every numeric column, and the one text column.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Redefine the numeric pipeline with a leading selector step so that it can be
# fed the whole frame. DataFrameSelector is project-local; presumably it picks
# the listed columns — verify against transformer.py.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

# Categorical branch: select the text column, then one-hot encode it.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False))
])

In [None]:
from sklearn.pipeline import FeatureUnion

# Run the numeric and categorical pipelines on the same input and join their outputs.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)

# Rebuild column labels for the prepared matrix: numeric columns, then the
# derived ratio columns, then one column per category of the fitted encoder.
# The ratio names and their order are assumed to match what
# CombinedAttributesAdder appends — verify against transformer.py.
add_attribs = ["rooms_per_household", "population_per_household", "bedrooms_per_room"]
fitted_encoder = cat_pipeline.named_steps['cat_encoder']
ocean_proximity_attribs = list(fitted_encoder.categories_[0])
prepared_columns = num_attribs + add_attribs + ocean_proximity_attribs

pd.DataFrame(housing_prepared, columns=prepared_columns).head(10)

In [None]:
# Take the first five rows and their labels as a small sample, and prepare the
# rows with transform() only (no refitting of the pipeline).
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a plain linear regression on the prepared training data and show what it learned.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
print(lin_reg.coef_, lin_reg.intercept_)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the prepared training data. The estimator is seeded
# with the same random_state used for the data splits so that re-running the
# notebook rebuilds the same tree.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the prepared training data. The estimator is seeded
# with the same random_state used for the data splits so that re-running the
# notebook rebuilds the same forest.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
from sklearn.svm import SVR

# svr_reg = SVR()
# svr_reg.fit(housing_prepared, housing_labels)

In [None]:
from sklearn.model_selection import cross_val_score

def rmse_calcutale(model, data, labels, scoring="neg_mean_squared_error", cv=10):
    """Cross-validate `model` and return the square root of the negated fold scores.

    All arguments are forwarded to cross_val_score. With the default scoring
    (negated mean squared error) the returned array holds one RMSE per fold;
    with any other scoring the same sqrt(-score) formula is still applied.
    """
    fold_scores = cross_val_score(
        estimator=model,
        X=data,
        y=labels,
        scoring=scoring,
        cv=cv,
    )
    return np.sqrt(np.negative(fold_scores))

In [None]:
# Cross-validated RMSE per fold. Only the linear model is evaluated here; the
# tree and forest evaluations are left disabled.
# tree_rmse_scores = rmse_calcutale(tree_reg, housing_prepared, housing_labels)
lin_rmse_scores = rmse_calcutale(lin_reg, housing_prepared, housing_labels)
# forest_rmse_scores = rmse_calcutale(forest_reg, housing_prepared, housing_labels)

In [None]:
def display_scores(scores):
    """Print the scores, then their mean, then their standard deviation.

    `scores` must provide mean() and std() methods (for example a NumPy array).
    Nothing is returned.
    """
    # Each row pairs a caption with the method that yields its value;
    # None means the scores object itself is printed.
    layout = (
        ("Суммы оценок: ", None),
        ("Среднее: ", "mean"),
        ("Стандартное отклонение: ", "std"),
    )
    for caption, method_name in layout:
        value = scores if method_name is None else getattr(scores, method_name)()
        print(caption, value)

In [None]:
# Report the cross-validation scores of the linear model.
print("Linear")
display_scores(lin_rmse_scores)
# print("")
# print("Tree")
# display_scores(tree_rmse_scores)
# print("")
# print("Forest")
# display_scores(forest_rmse_scores)



In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched: the first varies forest size and features per
# split with default bootstrapping, the second repeats a smaller sweep with
# bootstrapping turned off.
params_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8]
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4]
    }
]

# Seed the forest with the same random_state used for the data splits so that
# the search picks the same winner on every re-run.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validation scored by negated MSE (higher is better).
grid_search = GridSearchCV(forest_reg, params_grid, cv=5, scoring="neg_mean_squared_error")

grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Parameter combination that scored best in the search.
grid_search.best_params_

In [None]:
# Estimator configured with the best parameter combination.
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_

# The mean test scores are negated MSE values; flip the sign and take the
# square root to report an RMSE next to each parameter combination tried.
rmse_per_candidate = np.sqrt(-cvres['mean_test_score'])
for candidate_rmse, candidate_params in zip(rmse_per_candidate, cvres['params']):
    print(candidate_rmse, candidate_params)

In [None]:
# Per-feature importance scores reported by the best forest from the search.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
def indices_of_top_k(arr, k):
    """Return the positions of the k largest values of `arr`, in ascending order.

    Parameters
    ----------
    arr : one-dimensional sequence or array of comparable values.
    k : number of top entries wanted; must be non-negative.

    Returns
    -------
    numpy.ndarray of integer positions, sorted ascending. Empty when k is 0;
    every position when k is at least the number of elements.

    Raises
    ------
    ValueError if k is negative.
    """
    values = np.asarray(arr)
    if k < 0:
        raise ValueError("k must be non-negative, got {}".format(k))
    if k == 0:
        # A [-0:] slice is the whole array, so zero has to be handled
        # explicitly to get an empty selection.
        return np.empty(0, dtype=np.intp)
    if k >= values.size:
        # argpartition rejects a pivot outside the array; asking for at
        # least as many entries as exist simply selects all of them.
        return np.arange(values.size)
    # argpartition puts the positions of the k largest values in the last k
    # slots (in no particular order); sort them for a stable, readable result.
    return np.sort(np.argpartition(values, -k)[-k:])

In [None]:
# Labels for the prepared matrix columns: numeric columns, then the derived
# ratio columns, then one label per category known to the one-hot encoder.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]

# Uses the standalone encoder fitted earlier; the pipeline's own encoder is the alternative:
# cat_encoder = cat_pipeline.named_steps["cat_encoder"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs

# Pair each importance with its label and list the most important first.
ranked_importances = list(zip(feature_importances, attributes))
ranked_importances.sort(reverse=True)
ranked_importances

In [None]:
from transformer import TopFeatureSelector

# Chain the full preparation with the project-local TopFeatureSelector, which is
# given the importance scores and the number 5 (presumably "keep the 5 most
# important columns" — verify against transformer.py).
top_feature_pipeline = Pipeline([
    ("preparation", full_pipeline),
    ("top_feature", TopFeatureSelector(feature_importances, 5))
])

# fit_transform refits full_pipeline as part of the chain.
housing_prepared_top_features = top_feature_pipeline.fit_transform(housing)
# Print the reduced matrix next to the full prepared matrix for comparison.
print(housing_prepared_top_features)
print(housing_prepared)

In [None]:
# Seed the forest with the same random_state used for the data splits so that
# the search picks the same winner on every re-run.
forest_reg_top_features = RandomForestRegressor(random_state=42)

# Only the forest size (and, in the second sub-grid, bootstrapping off) is
# varied; max_features is left at its default here.
params_grid = [
    {
        'n_estimators': [3, 10, 30]
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10]
    }
]

# 5-fold cross-validation scored by negated MSE (higher is better).
# Note that this rebinds grid_search, replacing the earlier search object.
grid_search = GridSearchCV(forest_reg_top_features, params_grid, cv=5, scoring="neg_mean_squared_error")

grid_search.fit(housing_prepared_top_features, housing_labels)

In [None]:
# Best parameter combination of the most recently fitted search.
grid_search.best_params_

In [None]:
# 48283.511966631399
from sklearn.metrics import mean_squared_error

# Best estimator of the most recently fitted search.
final_model = grid_search.best_estimator_

# Split the held-out test subset into predictors and target.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# Prepare the test predictors with transform() only, so nothing is refitted on
# test data. The top-feature pipeline is used because the last search was
# fitted on housing_prepared_top_features; a model fitted on the full prepared
# matrix would need full_pipeline.transform(X_test) here instead. The extra
# full-pipeline transform that used to precede this line was overwritten
# immediately and has been dropped.
X_test_prepared = top_feature_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

# Root mean squared error on the test set.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

In [None]:
# Number of leading test rows used by the disabled preview code below.
s = 50

# print("Метки:", list(y_test.iloc[:s]))

# print("lin_reg:", lin_reg.predict(housing_prepared)[:3])
# print("tree_reg:", tree_reg.predict(housing_prepared)[:3])
# print("forest_reg:", final_model.predict(X_test_prepared)[:3])
# for x in list(zip(list(y_test.iloc[:s]), final_model.predict(X_test_prepared)[:s])):
#     print(x)
