**Libraries:**

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import  SimpleImputer
from sklearn.preprocessing import FunctionTransformer,StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

**Data Visualization**

In [None]:
file=pd.read_csv("file.csv")
# Check for duplicates and unique values
print(f"Unique Values in Feature1: {file.nunique()}")
print(f"To check duplicate values: {file.duplicated().sum()}")


**Data Cleaning:**

In [None]:
imputer = SimpleImputer(strategy='median')
imputer.fit(file.iloc[:, [4]])
# Select only numeric columns
numeric = file.select_dtypes(include=['number'])
# Apply imputer only on numeric columns
file[numeric.columns] = imputer.fit_transform(numeric)
file.info()
print("For precise analysis:")
file.isnull().sum()

#### Task 1: 

In [None]:
log_t = FunctionTransformer(np.log,inverse_func=np.exp)
ratio_t = FunctionTransformer(lambda X: X[:, [0]] / X[:, [1]])
log_t.transform(file[["population"]])
#to normalize data and remove outliers
ratio_t.transform(np.array([[1., 2.], [3., 4.]]))
X = file.drop(columns=["ocean_proximity"])
y = file.drop(columns=X)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
scal = StandardScaler()
scal.fit(X_train)
x_t_s = scal.transform(X_train)
x_tes_s = scal.transform(X_test)
scaled_df = pd.DataFrame(np.vstack((x_t_s,x_tes_s)),columns=X_test.columns)  
scaled_df.head()

#### Task 2: Building  Transformer 

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.std_ = None

    def fit(self, X, y=None):
        X = check_array(X)

        if self.with_mean:
            self.mean_ = np.mean(X, axis=0)
        if self.with_std:
            self.std_ = np.std(X, axis=0, ddof=0)  # Population standard deviation

        return self

    def transform(self, X):
        check_is_fitted(self, ['mean_', 'std_'])
        X = check_array(X)

        # Center and scale the data based on parameters
        X_scaled = X.copy()  # Avoid modifying the original data
        if self.with_mean:
            X_scaled -= self.mean_
        if self.with_std:
            X_scaled /= self.std_

        return X_scaled

    def fit_transform(self, X, y=None):

        return self.fit(X, y).transform(X)



In [None]:
# Make an object of that class
ssc = StandardScalerClone()
ssc.fit(X_train)

In [None]:
X_train_scaled = ssc.transform(X_train)
X_test_scaled = ssc.transform(X_test)

X_train_scaled,x_tes_s

#### Task 3: **Clustering  Transformer**

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=None, gamma=None, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None):
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X)
        return self

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]


In [None]:
X = file.iloc[:,[0,1]].values     # fitting longitude and latitude column
cs = ClusterSimilarity(n_clusters=10,gamma=1.0, random_state=42)
cs.fit(X)
value = cs.transform(X)
value.round(2)

#### Task 4: Pipelines 

In [40]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numerical Pipeline
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

housing_num_prepared = numerical_pipeline.fit_transform(file.iloc[:, :-1])
housing_num_prepared = housing_num_prepared.round(2)

# Categorical Pipeline
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder()
)

values = categorical_pipeline.fit_transform(file.iloc[:, -1].values.reshape(-1, 1)).toarray()
# Geo Pipeline

geo_pipeline = Pipeline([('simpleimputer', SimpleImputer(strategy='median')),
                ('clustersimilarity', ClusterSimilarity(n_clusters=10,gamma=1.,random_state=42))])

# Fit the KMeans part separately first to check if it's working:
geo_pipeline.fit(file[['longitude', 'latitude']])
print(geo_pipeline[1].kmeans_.cluster_centers_)
# Column Transformer

categorical_columns = ["ocean_proximity"]
numerical_columns = ["longitude", "latitude", "housing_median_age","total_rooms","total_bedrooms", "population", "households","median_income"]
geo_columns = ['longitude', 'latitude']

preprocessing = ColumnTransformer([
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns),
    ('geo', geo_pipeline, geo_columns)
])

preprocessing.fit_transform(file)

[[-117.37004959   33.99946006]
 [-122.3233122    37.94114105]
 [-119.45273676   36.32941413]
 [-118.22767889   34.00620281]
 [-116.95778575   32.89627278]
 [-121.2802038    38.92207201]
 [-123.04501057   40.48585624]
 [-120.06765138   34.76642202]
 [-120.96289655   37.77087685]
 [-121.92993224   37.24164314]]


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


array([[-1.32783522e+00,  1.05254828e+00,  9.82142658e-01, ...,
         5.74296847e-07,  1.98402197e-01,  6.08024476e-01],
       [-1.32284391e+00,  1.04318455e+00, -6.07018913e-01, ...,
         6.78874656e-07,  2.04282382e-01,  6.27190779e-01],
       [-1.33282653e+00,  1.03850269e+00,  1.85618152e+00, ...,
         6.62297019e-07,  1.94513504e-01,  6.27356647e-01],
       ...,
       [-8.23713197e-01,  1.77823747e+00, -9.24851228e-01, ...,
         9.50263455e-11,  5.96780010e-02,  5.02711239e-03],
       [-8.73626269e-01,  1.77823747e+00, -8.45393149e-01, ...,
         7.47152670e-11,  5.61228350e-02,  5.73640203e-03],
       [-8.33695812e-01,  1.75014627e+00, -1.00430931e+00, ...,
         1.58173034e-10,  7.17920755e-02,  6.69824703e-03]])

**TASK**

In [41]:
print(file.head())


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


**Linear Regressor Model**

In [42]:
from sklearn.linear_model import LinearRegression

housing = file.drop(columns="median_house_value")
housing_labels = file["median_house_value"]

lin_reg = make_pipeline(
    preprocessing, 
    LinearRegression()
)

In [43]:
lin_reg.fit(housing, housing_labels)

  super()._check_params_vs_input(X, default_n_init=10)


In [45]:
housing_predictions = lin_reg.predict(file)
housing_predictions[:5].round(-2)
lin_rmse = mean_squared_error(housing_labels,housing_predictions)
lin_rmse

4451898794.070488

**DecisionTree**

In [46]:
tree_reg = make_pipeline(
    preprocessing,
    DecisionTreeRegressor(random_state=42)
)
tree_reg.fit(housing, housing_labels)
housing_predictions = tree_reg.predict(housing)
tree_rmse = mean_squared_error(housing_labels,housing_predictions)
tree_rmse
tree_rmses = -cross_val_score(tree_reg, housing, housing_labels,scoring="neg_root_mean_squared_error",cv=10)
tree_rmses
pd.Series(tree_rmses).describe()

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


count       10.000000
mean     64261.334691
std       3252.631465
min      58575.986639
25%      62351.337667
50%      64578.086737
75%      65510.992262
max      70665.255393
dtype: float64

**RandomForest**

In [48]:
forest_reg = make_pipeline(preprocessing,RandomForestRegressor(random_state=42))
forest_rmses = -cross_val_score(forest_reg, housing, housing_labels,scoring="neg_root_mean_squared_error", cv=3)
pd.Series(forest_rmses).describe()

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


count        3.000000
mean     47211.560463
std       1466.559613
min      46187.622300
25%      46371.525395
50%      46555.428489
75%      47723.529545
max      48891.630600
dtype: float64

**Grid Search**

In [49]:
from sklearn.model_selection import GridSearchCV

full_pipeline = Pipeline([
    ('preprocessing',preprocessing),
    ('randomforestregressor',RandomForestRegressor(random_state=42))
])

param_grid = [
    {
        'preprocessing__geo__clustersimilarity__n_clusters': [5, 8, 10],  
        'randomforestregressor__max_features': [4, 6, 8]  
    },
    {
        'preprocessing__geo__clustersimilarity__n_clusters': [10, 15],  
        'randomforestregressor__max_features': [6, 8, 10] 
    }
]
g_search = GridSearchCV(full_pipeline, param_grid, cv=3, scoring='neg_mean_squared_error')
g_search
g_search.fit(file.drop('median_house_value', axis=1), file['median_house_value'])
g_search.best_params_
counter_value_res = pd.DataFrame(g_search.counter_value_results_)
counter_value_res.sort_values(by="mean_test_score", ascending=False,inplace=True)

counter_value_res.head()

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

KeyboardInterrupt: 

**Randomized Search**

In [50]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'preprocessing__geo__clustersimilarity__n_clusters': randint(low=3,high=50),
    'randomforestregressor__max_features': randint(low=2,high=20)
}
random_Search = RandomizedSearchCV(full_pipeline, param_distributions=param_distribs, n_iter=10,cv=3,scoring='neg_root_mean_squared_error', random_state=42)
random_Search.fit(file.drop('median_house_value', axis=1), file['median_house_value'])

  super()._check_params_vs_input(X, default_n_init=10)


KeyboardInterrupt: 

**Ensemble Methods**

In [None]:
main_model = random_Search.best_estimator_ # includes preprocessing
main_model
feature_importances = main_model["randomforestregressor"].feature_importances_.round(2)
feature_importances
sorted(zip(feature_importances,main_model["preprocessing"].get_feature_names_out()),reverse=True)

**Evalution:**

In [None]:
X_test = file.drop("median_house_value", axis=1)
y_test = file["median_house_value"].copy()
final_predictions = main_model.predict(X_test)
final_rmse = root_mean_squared_error(y_test, final_predictions)
print(final_rmse)

In [None]:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,loc=squared_errors.mean(),scale=stats.sem(squared_errors)))