In [None]:
import pandas as pd
import numpy as np
# Load the raw dataset; the path is relative to the notebook's working directory.
housing = pd.read_csv("housing.csv")

In [None]:
# Preview the first rows to see which columns are available.
housing.head()

In [None]:
# Column dtypes and non-null counts: the quickest way to spot columns with gaps.
housing.info()

In [None]:
# How many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics of the numeric columns.
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# One 50-bin histogram per numeric column, drawn on a single large canvas.
histogram_options = {"bins": 50, "figsize": (20, 15)}
housing.hist(**histogram_options)
plt.show()

In [None]:
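# NOTE: reference only. This purely random split is kept commented out;
# the notebook uses the hash-based split defined in the next cell instead.
# import numpy as np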
# def split_train_test(data, test_ratio):
#     shuffled_indices = np.random.permutation(len(data))
    
#     test_set_size = int(len(data) * test_ratio)
#     test_indices = shuffled_indices[:test_set_size]
#     train_indices = shuffled_indices[test_set_size:]
#     return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
import hashlib
def test_set_check(identifier, test_ratio, hash):
 return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
 ## np.int64(identifier) --> first we standardize the identifier format into 64-bit representation. Because Python’s int type can behave slightly differently across versions and systems.
 ## then we pass it to the hash function through hash, the output is a hexadecimal string, for example: 202cb962ac59075b964b07152d234b70
 ## digest sections it into bytes by transforming it into raw binary form: \x2c\xb9b\xacY\x07[\x96K\x07\x15-#Kp
 ## [-1] takes the last byte
 ## < 256* test_ratio returns true if it's less than 51
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split ``data`` into (train, test) using a hash of each row's id.

    ``hash`` defaults to ``hashlib.md5`` when the caller does not supply one.
    Every value of ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns True form the test set and the remaining rows form
    the training set.

    Returns a ``(train_rows, test_rows)`` tuple.
    """
    def _belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio, hash)

    test_mask = data[id_column].apply(_belongs_to_test)
    # `~` negates the boolean mask: rows NOT flagged for the test set.
    train_rows = data.loc[~test_mask]
    test_rows = data.loc[test_mask]
    return train_rows, test_rows

In [None]:
# reset_index() exposes the row index as an `index` column, used here as the row id.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(
    housing_with_id, test_ratio=0.2, id_column="index"
)

In [None]:
# Central tendency and spread of median income.
income = housing["median_income"]

mean = income.mean()
median = income.median()
mode = income.mode()[0]  # mode() can return several values; keep the first
iqr = income.quantile(0.75) - income.quantile(0.25)

for stat_label, stat_value in (
    ("Mean:", mean),
    ("Median:", median),
    ("Mode:", mode),
    ("IQR (middle 50% range):", iqr),
):
    print(stat_label, stat_value)

# 🧠 Income Distribution Summary

## 📌 Measures of Central Tendency

- **Mean = 3.87** → The average income  
  > لو جمعنا الفلوس دي كلها وقسمناها على العائلات بالتساوي، كل عيلة هتاخد المبلغ ده  

- **Median = 3.53** → Half the data is below, half is above  
  > دة بيقسم الداتا بالنص، نص الداتا على يمينه والنص التاني على شماله  

- **Mode = 3.13** → Most common value  
  > أطول عمود في الـ histogram  
  > لو الداتا دي symmetric, الmean, mode, median هيكونوا على نفس الخط

---

## 📊 IQR (Interquartile Range)

- **IQR = Q3 - Q1 = 4.75 - 2.57 = 2.18**
- Represents the middle 50% of the data (where "most values live")
- If you shade the region from **2.57 to 4.75**, that’s the IQR band  
  > الحبة اللي فيهم أطول عواميد في الداتا

---

## 📈 Visual Insight

- The **mode** at ~3.13 suggests the **peak of the histogram** is near 3.
- The **mean > median > mode** implies the distribution is **right-skewed**, probably due to a few people with very high incomes.

> This matches the author's eyeball estimate of **2–5** as the range where most values are concentrated.


In [None]:
import matplotlib.pyplot as plt

# Histogram of median income with the mean, median and mode marked and the
# Q1-Q3 band shaded.
income_q1 = housing["median_income"].quantile(0.25)
income_q3 = housing["median_income"].quantile(0.75)

fig, ax = plt.subplots()
ax.hist(housing["median_income"], bins=50, edgecolor='black')
for position, line_color, line_style, line_label in (
    (mean, 'red', '--', 'Mean'),
    (median, 'blue', '-', 'Median'),
    (mode, 'green', '-.', 'Mode'),
):
    ax.axvline(position, color=line_color, linestyle=line_style, label=line_label)
ax.axvspan(income_q1, income_q3, color='yellow', alpha=0.3, label='IQR Range (Q1–Q3)')
ax.legend()
ax.set_title("Distribution of Median Income")
ax.set_xlabel("Median Income")
ax.set_ylabel("Number of Houses")
plt.show()


# 📐 What is Skewness?

**Skewness** measures the **asymmetry** of a distribution.

- **Symmetric** → Left side ≈ Right side  
- **Positive (Right) Skewed** → Long tail on the right  
- **Negative (Left) Skewed** → Long tail on the left  

---

## 🧠 Rule of Thumb: Order of Mean, Median, Mode

| Skewness Type     | Order (from lowest to highest)     | Shape Insight                   |
|-------------------|-------------------------------------|----------------------------------|
| **Left Skew (−)** | Mean < Median < Mode               | Long tail on the **left**       |
| **Symmetric (0)** | Mean ≈ Median ≈ Mode               | Bell-shaped (like normal dist.) |
| **Right Skew (+)**| Mode < Median < Mean               | Long tail on the **right**      |

---

🔁 **Tip**: Think of the **mean** as the value that's most sensitive to outliers.

- In a **right-skewed** distribution (e.g., incomes), **extreme high values** will **pull the mean upward**.
- In a **left-skewed** distribution (e.g., test scores with penalties), **extreme low values** will **pull the mean downward**.


In [None]:
# Skewness of median income; its sign tells which side has the longer tail.
housing["median_income"].skew()

# Interpretation:

> **≈ 0** → Symmetric

> **Greater than 0** → Right-skewed

> **Less than 0** → Left-skewed

In [None]:
# Bucket median income into categories: one category per 1.5 units of income,
# with every category of 5 or more merged into 5.0.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the capped result back explicitly. Calling where(..., inplace=True)
# on housing["income_cat"] is chained assignment on an intermediate object and
# is not guaranteed to write through to `housing` (it does not under pandas
# Copy-on-Write).
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# A single stratified split with 20% of the rows in the test set;
# random_state fixes the shuffle so reruns give the same split.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split.split() is a generator of (train, test) index arrays; with n_splits=1
# the loop body runs exactly once.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # The yielded indices are row positions, so select with .iloc. Using .loc
    # only gives the same rows while `housing` keeps its default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# Percentage of rows in each income category across the whole dataset.
housing["income_cat"].value_counts() / len(housing) * 100 

In [None]:
import matplotlib.pyplot as plt

# Horizontal bars of the row count per income category, in category order.
income_cat_counts = housing["income_cat"].value_counts().sort_index()
income_cat_counts.plot.barh()
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np

# Step 1: Create the income_cat column (if not already present)
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# Step 2: Do random sampling (normal train_test_split)
from sklearn.model_selection import train_test_split
train_set_rand, test_set_rand = train_test_split(housing, test_size=0.2, random_state=42)

# Step 3: Do stratified sampling
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields row positions, so .iloc is the matching indexer; .loc
    # would only agree while `housing` has its default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# Step 4: Compute proportions
def income_cat_proportions(data):
    """Return the share of rows in each ``income_cat`` value, ordered by category."""
    shares = data["income_cat"].value_counts(normalize=True)
    return shares.sort_index()

overall_props = income_cat_proportions(housing)
rand_props = income_cat_proportions(test_set_rand)
strat_props = income_cat_proportions(strat_test_set)

# Step 5: Compare and calculate %error
# Relative deviation of each test set's category shares from the full dataset.
rand_pct_error = 100 * (rand_props - overall_props) / overall_props
strat_pct_error = 100 * (strat_props - overall_props) / overall_props

comparison_columns = {
    "Overall": overall_props,
    "Random": rand_props,
    "Stratified": strat_props,
    "Rand. %error": rand_pct_error,
    "Strat. %error": strat_pct_error,
}
comparison_df = pd.DataFrame(comparison_columns)

print(comparison_df)


In [None]:
# income_cat was only needed for stratification, so remove it from both sets.
# Each frame is rebound explicitly instead of looping with `for set in ...`,
# which left the built-in `set` shadowed by a DataFrame for the rest of the
# notebook. errors="ignore" keeps the cell safe to re-run.
strat_train_set = strat_train_set.drop(columns=["income_cat"], errors="ignore")
strat_test_set = strat_test_set.drop(columns=["income_cat"], errors="ignore")

In [None]:
# Explore on a copy so that changes made during exploration leave strat_train_set untouched.
housing2 = strat_train_set.copy()

In [None]:
import folium
from folium.plugins import HeatMap

# Center the map around California
m = folium.Map(location=[37, -119], zoom_start=6)

# Extract [lat, lon] pairs in one vectorized step instead of building them
# row by row with iterrows(), which is slow on large frames.
heat_data = housing2[["latitude", "longitude"]].values.tolist()

HeatMap(heat_data).add_to(m)

m  # if you're in Jupyter, this will display the map


In [None]:
# Geographic scatter of the training copy (housing2), consistent with the
# other exploration cells, so test-set rows stay out of the exploration.
# Low alpha makes high-density areas stand out.
housing2.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
plt.show()


In [None]:
# Same map on the training copy (housing2), now encoding two more variables:
#   s (marker size)  -> population of the row, scaled down by 100
#   c (marker color) -> median house value, mapped through the "jet" colormap
housing2.plot(kind="scatter", x="longitude", y="latitude", alpha=0.2,
              s=housing2["population"]/100, label="population",
              c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
              )
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Note: this measures linear correlation only; other kinds of relationship
# can go undetected.
corr_matrix = housing2.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap of Housing Features")
plt.show()


In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for the target and three candidate predictors.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing2.loc[:, attributes], figsize=(12, 8))
plt.show()

# 📊 Exploratory Data Analysis: California Housing Dataset

This scatter-matrix visualization (every pairwise scatter plot, with each variable's histogram on the diagonal) presents relationships among four key variables:

- `median_house_value`
- `median_income`
- `total_rooms`
- `housing_median_age`

---

## 🔍 Key Insights

### 1. 💰 Income vs. House Value
- There is a clear **positive correlation** between `median_income` and `median_house_value`.
- As income increases, house value tends to increase.
- The relationship appears nonlinear and capped around the \$500,000 mark, possibly due to data truncation or a cap in the dataset.

### 2. 🏠 Total Rooms vs. House Value
- `total_rooms` has a **very weak or no clear correlation** with `median_house_value`.
- High room counts do not necessarily equate to high house values.
- There’s significant variance and clustering at lower values.

### 3. 🏡 Housing Age vs. House Value
- `housing_median_age` shows **no clear correlation** with `median_house_value`.
- Houses of all ages have a wide spread of values.
- Possible explanation: location and condition may matter more than age.

### 4. 🧮 Income vs. Total Rooms
- Slight positive trend: higher-income areas tend to have more rooms.
- But again, high dispersion suggests this isn't a strong linear relationship.

### 5. 🕰 Age vs. Total Rooms & Income
- No significant patterns between `housing_median_age` and other variables.
- Suggests that older and newer houses can exist in areas with varying income levels and room counts.

---

## 🧠 Summary

- **Median income** is the strongest predictor of **house value** among these variables.
- **Total rooms** and **housing age** show limited predictive power on their own.
- The distribution plots indicate skewness in `total_rooms` and `median_income`, which might require normalization or transformation for modeling.

In [None]:
# Low alpha lets dense regions and horizontal bands of repeated values stand out.
housing2.plot.scatter(x="median_income", y="median_house_value", alpha=0.1)
plt.show()

## 📈 Why Are There Horizontal Lines in the Scatter Plot?

This scatter plot shows the relationship between `median_income` and `median_house_value`.

### 🧠 So... what do those horizontal lines mean?

The horizontal lines at the top of the plot — especially the **thick one at \$500,000** — indicate that:

- **House values have been capped** at \$500,000 in the dataset.
- Multiple entries have exactly the same `median_house_value`, creating a "pile-up" at that price point.
- This is why you see a *solid horizontal bar* — many data points overlap at that capped value.

### 🔎 Why does this matter?

- This **data truncation** can distort analysis and modeling.
- It might:
  - Hide true relationships in higher-income areas.
  - Mislead regression models (e.g., underestimating the value in affluent areas).
- Consider treating this cap carefully:
  - Add a flag for "capped" data.
  - Impute or model these separately if appropriate.

> 💡 TL;DR: The horizontal lines = **lots of homes priced exactly at \$500,000**, likely because it's the upper limit recorded in the dataset.


In [None]:
# Count the rows whose house value sits exactly on the assumed ceiling.
# NOTE(review): an exact-match count depends on how the ceiling is stored
# (e.g. 500000 vs 500001); check it against the value counts near the cap.
cap_value = 500000
capped_count = housing2['median_house_value'].eq(cap_value).sum()
print(f"Number of capped house values at ${cap_value}: {capped_count}")

In [None]:
# Tally every distinct house value from 490000 upward, in ascending order.
near_cap_mask = housing2['median_house_value'] >= 490000
close_to_cap = housing2.loc[near_cap_mask]
print(close_to_cap['median_house_value'].value_counts().sort_index())

In [None]:
# Derive per-household and per-room ratios, then drop the raw totals they
# were built from.
housing2 = housing2.assign(
    rooms_per_household=housing2["total_rooms"] / housing2["households"],
    bedrooms_per_room=housing2["total_bedrooms"] / housing2["total_rooms"],
    population_per_household=housing2["population"] / housing2["households"],
).drop(columns=["total_rooms", "total_bedrooms"])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Recompute the correlations now that the ratio features replaced the raw totals.
corr_matrix = housing2.corr(numeric_only=True)

heatmap_style = dict(annot=True, fmt=".2f", cmap="coolwarm", square=True, linewidths=0.5)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_style)
plt.title("Correlation Heatmap of Housing Features")
plt.show()

# Correlation Comparison After Feature Engineering
![](plot.png)

## Data Cleaning

In [None]:
# Split the training set into predictors (`housing`) and the target (`housing_labels`).
# From here on `housing` refers to the training predictors, not the full dataset.
# NOTE(review): about 158 values are reportedly missing; presumably in
# total_bedrooms (bedrooms_per_room only exists on housing2) — confirm with housing.info().
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [None]:
from sklearn.impute import SimpleImputer
# Imputer configured to fill missing values with each column's median.
imputer = SimpleImputer(strategy="median")

In [None]:
# Numeric predictors only: the text column ocean_proximity is left out before
# the median imputer is fitted.
housing_num = housing.drop(columns="ocean_proximity")

In [None]:
# Learn the per-column fill values from the numeric training predictors.
imputer.fit(housing_num)

In [None]:
import pandas as pd

# Show the values the imputer learned, one row per numeric column.
pd.DataFrame(imputer.statistics_, index=housing_num.columns, columns=['median'])

In [None]:
# Apply the fitted imputer; the result is stored as X and wrapped in a DataFrame in the next cell.
X = imputer.transform(housing_num)

In [None]:
# Rebuild a DataFrame from the imputed array. Reuse housing_num's index: the
# training rows keep their original (shuffled) labels, and a fresh 0..n-1
# index would misalign this frame with `housing` in any index-based join.
housing_imputed = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Category frequencies of ocean_proximity in the training predictors.
housing['ocean_proximity'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map each ocean_proximity category to an integer code.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# labels — confirm it is the intended tool for an input feature.
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the integer codes; reshape(-1, 1) turns the 1-D codes into a
# single-column 2-D array before they are passed to the encoder.
# `encoder` is rebound here, replacing the LabelEncoder from the previous cell.
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

In [None]:
from sklearn.preprocessing import LabelBinarizer
# Go from the text categories to one-hot rows in a single step.
# `encoder` is rebound again (now a LabelBinarizer) and housing_cat_1hot is overwritten.
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Column positions, within the array passed to transform(), of the attributes
# the ratios are built from.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as a third column when ``add_bedrooms_per_room`` is True.
    Columns are located by the module-level ``*_ix`` positions.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]

# Try the transformer on the raw training values, without the bedrooms ratio.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(X=housing.values)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric preprocessing: fill gaps with medians, add the ratio features, then
# standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

# Define the DataFrameSelector class
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of DataFrame columns and return them as an array."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` via ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

class PipelineLabelBinarizer(BaseEstimator, TransformerMixin):
    """LabelBinarizer exposed through the fit(X, y) / transform(X) interface.

    Pipeline calls ``fit_transform(X, y)`` on its steps, but
    ``LabelBinarizer.fit_transform`` accepts only the labels, so using it
    directly as a step makes ``full_pipeline.fit_transform`` raise a
    TypeError. This adapter accepts (and ignores) ``y``.
    """

    def fit(self, X, y=None):
        # The selector hands over a single (n, 1) column; flatten it to the
        # 1-D label vector the binarizer works on.
        self.binarizer_ = LabelBinarizer().fit(np.ravel(X))
        return self

    def transform(self, X):
        return self.binarizer_.transform(np.ravel(X))


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
# Numeric branch: select columns, impute medians, add ratio features, scale.
# This rebinds `num_pipeline` from the earlier cell, now with a selector step.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
# Categorical branch: select the text column and one-hot encode it.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', PipelineLabelBinarizer()),
])
# Run both branches on the same input and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [None]:
# Run the complete preprocessing on the training predictors.
housing_prepared = full_pipeline.fit_transform(housing)
# NOTE(review): a bare expression in the middle of a cell is not rendered;
# only the shape on the last line is displayed.
housing_prepared
housing_prepared.shape