In [None]:
import numpy as np
import pandas as pd
import tarfile
import urllib.request
from pathlib import Path


def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it only if needed.

    The CSV is read from ``numpyex/housing/housing.csv``. When that file is
    missing, the tarball ``numpyex/housing.tgz`` is extracted into ``numpyex``;
    the tarball itself is downloaded first only when it is not already on disk.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.
    """
    data_dir = Path("numpyex")
    tarball_path = data_dir / "housing.tgz"
    csv_path = data_dir / "housing" / "housing.csv"
    # Check for the CSV itself: a tarball left behind by an interrupted run
    # must still be extracted, and an existing CSV needs no download at all.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            data_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # NOTE(review): extractall trusts the member paths inside the
            # archive; acceptable only while the URL above is a trusted source.
            housing_tarball.extractall(path=data_dir)
    return pd.read_csv(csv_path)
# Load the housing CSV into the `house` DataFrame and preview its first rows.
house=load_housing_data()
house.head()




In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
house.info()

In [None]:

import matplotlib.pyplot as plt
# Default text sizes for every figure drawn after this cell.
plt.rcParams.update({
    "font.size": 14,
    "axes.labelsize": 14,
    "axes.titlesize": 14,
    "legend.fontsize": 14,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# One histogram per numeric column of `house`.
house.hist(bins=46, figsize=(12, 10))
plt.show()










    


In [None]:

from sklearn.model_selection import train_test_split
# Purely random 80/20 split. The seed is fixed so that Restart & Run All
# reproduces the same rows in each set.
train, test = train_test_split(house, test_size=0.2, random_state=42)
train.head()

In [None]:
# Bucket median_income into five labelled categories (1-5); the column is
# used below to stratify the train/test split.
income_bin_edges = [0, 1, 3, 4, 6, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
house["med_inc_cat"] = pd.cut(
    house["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [None]:
# Bar chart of how many rows fall into each income category, in category order.
income_cat_counts = house["med_inc_cat"].value_counts().sort_index()
income_ax = income_cat_counts.plot.bar(rot=0, figsize=(12, 10), grid=True)
income_ax.set_title("Median Income")
income_ax.set_xlabel("income")
income_ax.set_ylabel("count")
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# 80/20 split that preserves the income-category proportions of `house`.
strat_train_set, strat_test_set = train_test_split(
    house,
    test_size=0.2,
    stratify=house["med_inc_cat"],
    random_state=42,
)

# Share of each income category in the stratified test set.
test_cat_counts = strat_test_set["med_inc_cat"].value_counts()
test_cat_counts / len(strat_test_set)

    
    
    
    
    
    
    
    


In [None]:
# Bind the copy to a name so it can actually be used; the bare expression
# only displayed the frame and then discarded the copy.
housing_train = strat_train_set.copy()
housing_train.head()


In [None]:
# Geographic scatter of all districts; low alpha reveals dense areas.
# `grid` takes a boolean (the original passed the string "True").
house.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2)
plt.show()

In [None]:
# Geographic scatter: marker size follows population, colour follows
# median_house_value. `grid` takes a boolean (the original passed "True").
house.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    grid=True,
    s=house["population"] / 1000,
    c=house["median_house_value"],
    cmap="jet",
    colorbar=True,
    legend=True,
    figsize=(12, 10),
)
plt.show()

In [None]:
# Bucket median_income into five labelled categories, stored as "med_inc_c".
# NOTE(review): same bins and labels as the "med_inc_cat" column created in an
# earlier cell, under a different name; no visible cell reads "med_inc_c" —
# confirm whether this column is needed.
house["med_inc_c"] = pd.cut(house["median_income"], bins=[0, 1, 3, 4, 6, np.inf], labels=[1, 2, 3, 4, 5])


In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots (histograms on the diagonal) for a handful of columns.
attributes = [
    "median_house_value",
    "population",
    "median_income",
    "total_rooms",
    "latitude",
    "longitude",
]
scatter_matrix(house[attributes], figsize=(12, 10))
                                               



In [None]:
# Median income against median house value.
house.plot.scatter(
    x="median_income",
    y="median_house_value",
    alpha=0.2,
    grid=True,
)
plt.show()

In [None]:
# Housing median age against median house value.
house.plot.scatter(
    x="housing_median_age",
    y="median_house_value",
    alpha=0.2,
    grid=True,
)
plt.show()

In [None]:
# Housing median age against population.
house.plot.scatter(
    x="housing_median_age",
    y="population",
    alpha=0.2,
    grid=True,
)
plt.show()

In [None]:
# Labels: an independent copy of the median_house_value column of the
# stratified training set, kept separately from the predictors.
housing_labels=strat_train_set["median_house_value"].copy()

















In [None]:
from sklearn.impute import SimpleImputer
# Only the numeric columns can be imputed with a median.
housing_num = house.select_dtypes(include=[np.number])

# Learn one median per numeric column, then show the learned values.
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
imputer.statistics_





In [None]:
# Apply the fitted imputer to the numeric columns; the result is stored in X.
X=imputer.transform(housing_num)



In [None]:
# Imputation returns a bare array, so wrap it back into a DataFrame that
# carries the original column names and row index.
house_df = pd.DataFrame(
    data=X,
    index=housing_num.index,
    columns=housing_num.columns,
)



In [None]:
from sklearn.preprocessing import OrdinalEncoder

# The single categorical column, kept as a one-column DataFrame.
housing_head = house[["ocean_proximity"]]
# A bare expression in the middle of a cell is silently discarded, so print
# the preview explicitly.
print(housing_head.head(10))

# Map each category to an integer code and show the learned category order.
encoder = OrdinalEncoder()
encoded_op = encoder.fit_transform(housing_head)
print(encoder.categories_)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical column; the encoder output is converted to
# a dense array only for printing.
encoder_oh = OneHotEncoder()
encoded_oh = encoder_oh.fit_transform(housing_head)
dense_oh = encoded_oh.toarray()
print(dense_oh)


In [None]:
# Two-row frame used to show what pandas.get_dummies produces on its own.
df_test = pd.DataFrame(data={"ocean_proximity": ["INLAND", "NEAR BAY"]})
dummies_test = pd.get_dummies(df_test)
print(dummies_test)

In [None]:
# Second two-row frame with different categories; get_dummies only creates
# columns for the values it sees in the frame it is given.
df_unknown_test = pd.DataFrame(data={"ocean_proximity": ["ISLAND", "NEAR OCEAN"]})
dummies_unknown = pd.get_dummies(df_unknown_test)
print(dummies_unknown)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Configure the option through the constructor rather than mutating the
# attribute afterwards, so the setting is visible where the encoder is built.
encoder = OneHotEncoder(handle_unknown="ignore")

# NOTE(review): the encoder is fitted on the same two-row frame it encodes,
# so no category is ever "unknown" here — confirm whether it was meant to be
# fitted on housing_head and only applied to df_unknown_test.
encoded_oh = encoder.fit_transform(df_unknown_test)
print(encoded_oh.toarray())




In [None]:
print(encoder.feature_names_in_)
print(encoder.get_feature_names_out())
# The encoder returns a sparse matrix, which pd.DataFrame cannot lay out
# against the requested columns; densify first (as the previous cell does
# before printing).
df_output = pd.DataFrame(
    encoder.transform(df_unknown_test).toarray(),
    columns=encoder.get_feature_names_out(),
    index=df_unknown_test.index,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every numeric column into the range [-1, 1].
target_range = (-1, 1)
scaler = MinMaxScaler(feature_range=target_range)
housing_num_scaled = scaler.fit_transform(housing_num)


In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every numeric column; note that `scaler` is rebound here.
scaler = StandardScaler()
housing_num_scaled_st = scaler.fit_transform(
    housing_num
)

In [None]:
from sklearn.metrics.pairwise import rbf_kernel
# Gaussian RBF similarity between each district's housing_median_age and 35.
# rbf_kernel expects two 2-D inputs, hence the double brackets; the column is
# named "housing_median_age" (as used by the scatter plots above), not
# "median_house_age".
age_simil_35 = rbf_kernel(house[["housing_median_age"]], [[35]], gamma=0.1)



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Standardise the target; the scaler needs a 2-D input, hence to_frame().
target_scaler = StandardScaler()
scaled = target_scaler.fit_transform(housing_labels.to_frame())

# housing_labels comes from strat_train_set, so the predictors must come from
# the same rows, and must be 2-D (a one-column DataFrame, not a Series).
model = LinearRegression()
model.fit(strat_train_set[["median_income"]], scaled)

# Predict in the scaled space, then map back to the original units.
new = house[["median_income"]].iloc[:5]
prediction = model.predict(new)
inverse = target_scaler.inverse_transform(prediction)







In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Same model as the manual version above, with target scaling and its
# inversion handled by the wrapper. Without a transformer the wrapper would
# leave the target unscaled.
model = TransformedTargetRegressor(LinearRegression(), transformer=StandardScaler())
# Predictors must be 2-D and come from the same rows as housing_labels
# (strat_train_set).
model.fit(strat_train_set[["median_income"]], housing_labels)
new = house[["median_income"]].iloc[:5]
prediction = model.predict(new)


In [None]:
from sklearn.preprocessing import FunctionTransformer
# Log-transform the age column, with exp registered as the inverse.
# The column is named "housing_median_age"; "median_house_age" does not exist.
transformer = FunctionTransformer(np.log, inverse_func=np.exp)
log_age = transformer.transform(house[["housing_median_age"]])



In [None]:
from sklearn.preprocessing import FunctionTransformer
# RBF similarity to age 35, wrapped as a transformer.
# The column is named "housing_median_age"; "median_house_age" does not exist.
transformer = FunctionTransformer(rbf_kernel, kw_args=dict(Y=[[35.]], gamma=0.1))
age_simil35 = transformer.fit_transform(house[["housing_median_age"]])




In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics.pairwise import rbf_kernel
sf_coords = 37.7749, -122.41
# FunctionTransformer takes the function itself; its extra arguments go in
# kw_args. Calling rbf_kernel here would run it immediately on a 1-D tuple.
# Y must be 2-D, hence the list around the coordinate pair.
transformer = FunctionTransformer(rbf_kernel, kw_args=dict(Y=[sf_coords], gamma=0.1))
transformer.transform(house[["latitude", "longitude"]])


In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics.pairwise import pairwise_distances
# Define the name that is actually used below (the original assigned to the
# misspelled `sf_cords` and relied on `sf_coords` from an earlier cell).
sf_coords = 37.7749, -122.41
# Pass the function itself plus kw_args; Y must be 2-D, hence the list.
transformer_pairdis = FunctionTransformer(pairwise_distances, kw_args=dict(Y=[sf_coords]))
transformer_pairdis.transform(house[["latitude", "longitude"]])



In [None]:
# BaseEstimator and TransformerMixin live in sklearn.base, not the package root.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
sf_coords=37.7749, -122.41



class ClusterEstimator(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to RBF similarities to k-means centres.

    Parameters:
        n_clusters: number of k-means clusters (one output column each).
        gamma: gamma passed to rbf_kernel in transform.
        random_state: seed forwarded to KMeans.
    """

    def __init__(self, n_clusters=5, gamma=1.0, random_state=222):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on X (optionally weighted) and return self.

        `y` is unused; it defaults to None so that fit_transform and
        pipelines can call fit without labels.
        """
        self.kmeans = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of each row of X to every cluster centre."""
        # Use the configured gamma (previously a hard-coded 0.1) and the
        # correctly spelled fitted attribute `cluster_centers_`.
        return rbf_kernel(X, self.kmeans.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output feature name per cluster."""
        return [f"Cluster Similarity{i}" for i in range(self.n_clusters)]

    # Backward-compatible alias for the original, misspelled method name.
    get_features_name_out = get_feature_names_out
     
    
        
        
        
        
        

In [1]:

geo_simil = ClusterEstimator(n_clusters=10, gamma=1., random_state=222)
# housing_labels is taken from strat_train_set, so the coordinates must come
# from the same rows; `house` has a different number of rows and would not
# match the sample weights.
similarities = geo_simil.fit_transform(
    strat_train_set[["longitude", "latitude"]],
    sample_weight=housing_labels,
)

NameError: name 'ClusterEstimator' is not defined

In [None]:
# BaseEstimator and TransformerMixin live in sklearn.base, not the package root.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel


# NOTE(review): this redefines ClusterEstimator from an earlier cell with
# different defaults; the later definition shadows the earlier one.
class ClusterEstimator(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to RBF similarities to k-means centres.

    Parameters:
        n_clusters: number of k-means clusters (one output column each).
        gamma: gamma passed to rbf_kernel in transform.
        random_state: seed forwarded to KMeans.
    """

    def __init__(self, n_clusters=10, gamma=0.1, random_state=22):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None, sample_weights=None):
        """Fit k-means on X (optionally weighted) and return self.

        `sample_weight` is the keyword scikit-learn uses; `sample_weights` is
        kept as an alias for the original spelling of this parameter.
        `y` is unused.
        """
        if sample_weight is None:
            sample_weight = sample_weights
        self.kmeans = KMeans(self.n_clusters, random_state=self.random_state)
        # Forward the weights (the original always passed None here).
        self.kmeans.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of each row of X to every cluster centre."""
        # Compare against the fitted centres, not the KMeans object itself,
        # and use the configured gamma (previously a hard-coded 2.).
        return rbf_kernel(X, self.kmeans.cluster_centers_, gamma=self.gamma)
    
        

In [None]:

# BaseEstimator and TransformerMixin live in sklearn.base, not the package root.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel



class RBF(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns the RBF kernel of X with itself.

    Parameters:
        gamma: gamma passed to rbf_kernel.
    """

    def __init__(self, gamma=0.1):
        self.gamma = gamma

    def fit(self, X, y=None):
        """Nothing to learn; `y` is accepted so pipelines can call fit(X, y)."""
        return self

    def transform(self, X):
        """Return rbf_kernel(X, X) — one row and one column per sample."""
        # gamma must be passed by keyword: the second positional argument of
        # rbf_kernel is Y, not gamma.
        return rbf_kernel(X, gamma=self.gamma)
    
        
        

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

# Each step is a (name, estimator) tuple. The strategy string is lowercase
# "median". rbf_kernel is a plain function, so it is wrapped in a
# FunctionTransformer instead of being called while the pipeline is built.
# NOTE(review): with no Y given, transforming n rows yields an n-by-n kernel
# matrix — confirm that is intended before calling my_pipe.transform on the
# full dataset.
my_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("rbf", FunctionTransformer(rbf_kernel, kw_args=dict(gamma=0.1))),
])
my_pipe.fit(X)
    

