<a href="https://colab.research.google.com/github/rifat01-rahman/Project-of-Python/blob/main/Hands_on_ML_Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import tarfile
import urllib.request
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# IMPORTANT: order matters
fetch_housing_data()
housing = load_housing_data()


  housing_tgz.extractall(path=housing_path)


In [2]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
housing .info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
housing ['ocean_proximity']. value_counts ()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,9136
INLAND,6551
NEAR OCEAN,2658
NEAR BAY,2290
ISLAND,5


In [5]:
import numpy as np

In [6]:
housing["income_cat"] = pd.cut(housing["median_income"],
bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
labels=[1, 2, 3, 4, 5])

In [7]:
# Here we are using different samplimg technique, because target variable is more effectetd by median income variable then other variable
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [8]:
for set_ in (strat_train_set, strat_test_set): # just remove income_cat data
    set_.drop("income_cat", axis=1, inplace=True)

In [26]:
housing_train = strat_train_set.copy()
housing_test = strat_test_set.copy()

In [10]:
X_train = housing_train.drop("median_house_value", axis=1)
y_train = housing_train["median_house_value"].copy()

We are now having OneHot-Encoding technique for train data set

In [11]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

# column indices in numerical data
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

def add_extra_features(X, add_bedrooms_per_room=True):
    rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
    population_per_household = X[:, population_ix] / X[:, households_ix]

    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X,
                     rooms_per_household,
                     population_per_household,
                     bedrooms_per_room]

    return np.c_[X,
                 rooms_per_household,
                 population_per_household]


In [12]:
num_attribs = X_train.drop("ocean_proximity", axis=1).columns
cat_attribs = ["ocean_proximity"]

In [13]:
preprocessor = ColumnTransformer(transformers=[
    (
        "num",
        Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("attribs_adder",
             FunctionTransformer(
                 add_extra_features,
                 kw_args={"add_bedrooms_per_room": True}

             )),
            ("scaler", StandardScaler())
        ]),
        num_attribs
    ),
    (
        "cat",
        Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]),
        cat_attribs
    )
])


In [14]:
X_train_prepared = preprocessor.fit_transform(X_train)

In [29]:
num_features = list(num_attribs)

num_features.extend([
    "rooms_per_household",
    "population_per_household",
    "bedrooms_per_room"
])


In [16]:
cat_features = preprocessor.named_transformers_["cat"] \
    .named_steps["onehot"] \
    .get_feature_names_out(cat_attribs)


In [17]:
all_features = num_features + list(cat_features)

In [18]:
X_train_prepared_df = pd.DataFrame(
    X_train_prepared.toarray()
    if hasattr(X_train_prepared, "toarray")
    else X_train_prepared,
    columns=all_features,
    index=housing_train.index
)

In [19]:
X_train_prepared_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,-0.94135,1.347438,0.027564,0.584777,0.640371,0.732602,0.556286,-0.893647,0.017395,0.006223,-0.121122,0.0,1.0,0.0,0.0,0.0
15502,1.171782,-1.19244,-1.722018,1.261467,0.781561,0.533612,0.721318,1.292168,0.569256,-0.040811,-0.810867,0.0,0.0,0.0,0.0,1.0
2908,0.267581,-0.125972,1.22046,-0.469773,-0.545138,-0.674675,-0.524407,-0.525434,-0.018024,-0.075371,-0.338273,0.0,1.0,0.0,0.0,0.0
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.036367,-0.467617,-0.037297,-0.865929,-0.59514,-0.106803,0.961205,0.0,0.0,0.0,0.0,1.0
20496,0.437431,-0.635818,-0.131489,0.427179,0.27279,0.37406,0.220898,0.325752,0.251241,0.006109,-0.474513,1.0,0.0,0.0,0.0,0.0


In [20]:
y_train.head()

Unnamed: 0,median_house_value
12655,72100.0
15502,279600.0
2908,82700.0
14053,112500.0
20496,238300.0


In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
lin_reg = LinearRegression()

In [23]:
lin_reg.fit(X_train_prepared_df, y_train)

In [27]:
# Now we are going to prepare my test data set
X_test = housing_test.drop("median_house_value", axis=1)
y_test = housing_test["median_house_value"].copy()


In [28]:
X_test_prepared = preprocessor.fit_transform(X_test)

In [30]:
X_test_prepared_df = pd.DataFrame(
    X_test_prepared.toarray()
    if hasattr(X_test_prepared, "toarray")
    else X_test_prepared,
    columns=all_features,
    index=housing_test.index
)

In [31]:
X_test_prepared_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
5241,0.57507,-0.696573,0.032956,1.602218,1.025754,0.61433,1.104803,2.358455,0.731012,-0.436991,-0.884249,1.0,0.0,0.0,0.0,0.0
17352,-0.434801,-0.334668,-0.362981,-0.285055,-0.534263,-0.495927,-0.556024,0.617359,0.962667,0.033956,-0.961419,1.0,0.0,0.0,0.0,0.0
3505,0.545222,-0.635472,0.587268,-0.526772,-0.616136,-0.535191,-0.57565,0.284222,-0.054727,-0.01844,-0.438265,1.0,0.0,0.0,0.0,0.0
7777,0.719338,-0.795274,0.508081,-0.44151,-0.494433,-0.314643,-0.511866,-0.308135,0.059435,0.374929,-0.27147,1.0,0.0,0.0,0.0,0.0
14155,1.231735,-1.331081,0.745643,0.464824,0.145064,0.038734,0.25599,0.267724,0.423243,-0.336103,-0.799156,0.0,0.0,0.0,0.0,1.0


In [32]:
y_pred = lin_reg.predict(X_test_prepared_df)

In [34]:
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

np.float64(66975.77789893825)