In [None]:
import numpy as np
import pandas as pd
import tarfile
import urllib.request
from pathlib import Path


def load_housing_data():
    tarball_path = Path("numpyex/housing.tgz")
    if not tarball_path.is_file():
        Path("numpyex").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="numpyex")
    return pd.read_csv(Path("numpyex/housing/housing.csv"))
house=load_housing_data()
house.head()




In [None]:

import matplotlib.pyplot as plt
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
house.hist(bins=46,figsize=(12,10))
plt.show()










    


In [None]:
house["med_inc_cat"] = pd.cut(house["median_income"], bins=[0, 1, 3, 4, 6, np.inf], labels=[1, 2, 3, 4, 5])

In [None]:
house["med_inc_cat"].value_counts().sort_index().plot.bar(rot=0, figsize=(12,10),grid=True)
plt.title("Median Income")
plt.xlabel("income")
plt.ylabel("count")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
strat_train_set, strat_test_set = train_test_split(
house, test_size=0.2, stratify=house["med_inc_cat"], random_state=42)
strat_test_set["med_inc_cat"].value_counts() / len(strat_test_set)


    
    
    
    
    
    
    
    


In [None]:
house=strat_train_set.copy()
print(strat_train_set.head())


In [None]:
house.plot(kind="scatter",x="longitude",y="latitude",grid="True",alpha=0.2)
plt.show()

In [None]:
house.plot(kind="scatter",x="longitude",y="latitude",grid="True",s=(house["population"])/1000,c=house["median_house_value"],cmap="jet",colorbar=True,legend=True
,figsize=(12,10))
plt.show()

In [None]:
from pandas.plotting import scatter_matrix
attributes=["median_house_value","population","median_income","total_rooms","latitude","longitude"]
scatter_matrix(house[attributes],figsize=(12,10))
                                               



In [None]:
house.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.2,grid=True)
plt.show()

In [None]:
house.plot(kind="scatter",x="housing_median_age",y="median_house_value",alpha=0.2,grid=True)
plt.show()

In [None]:
house.plot(kind="scatter",x="housing_median_age",y="population",alpha=0.2,grid=True)
plt.show()

In [None]:

house=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()

In [123]:
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder


from sklearn.compose import make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer



cat_pipeline=make_pipeline(SimpleImputer(strategy="most_frequent"),OneHotEncoder(handle_unknown="ignore"))






def ratio(x):
    return x[:,[0]]/x[:,[1]]
def ratio_name(feature_transformer,features_names_in):
    return ['ratio']

def ratio_pipeline():
    return make_pipeline(SimpleImputer(strategy="median"),FunctionTransformer(ratio,feature_names_out=ratio_name),StandardScaler())
log_pipeline=make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log,feature_names_out="one-to-one")
)
Cluster_Simil=KMeans(n_clusters=10,n_init=10,random_state=100)
default_num_pipeline=make_pipeline(SimpleImputer(strategy="median"),StandardScaler())
custom_trans=ColumnTransformer([
    ("pop_per_house",ratio_pipeline(),["population","households"]),
    ("rooms_per_house",ratio_pipeline(),["total_rooms","households"]),
    ("bedroom_per_room",ratio_pipeline(),["total_bedrooms","total_rooms"]),
    ("log",log_pipeline,["total_rooms","total_bedrooms","median_income","population"]),
    ("geo",Cluster_Simil,["longitude","latitude"]),
    ("cat",cat_pipeline,make_column_selector(dtype_include="object")),
    
    
    
    
],
   remainder=default_num_pipeline)
house_prepared=custom_trans.fit_transform(house)
print(house_prepared.shape)
    


    


        


(16512, 24)


In [None]:
from sklearn.linear_model import LinearRegression
lin_reg=make_pipeline(custom_trans,LinearRegression())



lin_reg.fit(house, housing_labels)

# Predict and print results
predictions = lin_reg.predict(house)
print(predictions[:5].round(-2))
print(housing_labels.iloc[:5].values)
from sklearn.metrics import mean_squared_error
lin_rmse=mean_squared_error(housing_labels,predictions)

print(lin_rmse**1/2)


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline

tree_pipeline=make_pipeline(custom_trans,DecisionTreeRegressor(random_state=34))
tree_pipeline.fit(house,housing_labels)
predic_tree=tree_pipeline.predict(house)
print(predic_tree)
error=mean_squared_error(housing_labels,predic_tree)
print(error)




In [None]:
from sklearn.model_selection import cross_val_score
import pandas as pd
val_score=-cross_val_score(tree_pipeline,house,housing_labels,cv=10,scoring="neg_mean_squared_error")

print(pd.Series(val_score).describe())


In [None]:
from sklearn.ensemble import RandomForestRegressor
rtree_reg=make_pipeline(custom_trans,RandomForestRegressor(random_state=22))
rtree_reg.fit(house,housing_labels)
predict_rtree=rtree_reg.predict(house)
val_rscore=-cross_val_score(rtree_reg,house,housing_labels,cv=10,scoring="neg_mean_squared_error")
print(pd.Series(val_rscore).describe())
      
      

In [125]:

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline





full_pipe=Pipeline([
    ("custom_trans",ColumnTransformer(custom_trans)),
    ("random_forest",RandomForestRegressor(random_state=23)),
            
    
    
])

param_grid=[
    {"custom_trans__geo__n_clusters":[10,2,3],
     "random_forest__max_features":[10,2,4]},
    {"custom_trans__geo__n_clusters":[3,5,4],
     "random_forest__max_features":[8,3,2]},
]
grid_search=GridSearchCV(full_pipe,param_grid,scoring="neg_root_mean_squared_error",cv=3)
grid_search.fit(house,housing_labels)




print(grid_search.best_params_)
    
    
    
    



ValueError: Invalid parameter 'geo' for estimator ColumnTransformer(transformers=ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                                            SimpleImputer(strategy='median')),
                                                                           ('standardscaler',
                                                                            StandardScaler())]),
                                                 transformers=[('pop_per_house',
                                                                Pipeline(steps=[('simpleimputer',
                                                                                 SimpleImputer(strategy='median')),
                                                                                ('functiontransformer',
                                                                                 FunctionTransformer(feature_names_out=<function ratio_name at 0x0000...
                                                                 'total_bedrooms',
                                                                 'median_income',
                                                                 'population']),
                                                               ('geo',
                                                                KMeans(n_clusters=10,
                                                                       n_init=10,
                                                                       random_state=100),
                                                                ['longitude',
                                                                 'latitude']),
                                                               ('cat',
                                                                Pipeline(steps=[('simpleimputer',
                                                                                 SimpleImputer(strategy='most_frequent')),
                                                                                ('onehotencoder',
                                                                                 OneHotEncoder(handle_unknown='ignore'))]),
                                                                <sklearn.compose._column_transformer.make_column_selector object at 0x000001BA905E27B0>)])). Valid parameters are: ['force_int_remainder_cols', 'n_jobs', 'remainder', 'sparse_threshold', 'transformer_weights', 'transformers', 'verbose', 'verbose_feature_names_out'].