In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/california-housing-prices/housing.csv


# PART 1

In [2]:
import csv
import random

def load_csv(filename):
    with open(filename, newline='') as f:
        return list(csv.reader(f))

In [3]:
def label_encode_column(rows, header, column_name):
    col_idx = header.index(column_name)
    label_map = {}
    next_code = 0

    for row in rows:
        label = row[col_idx]
        if label not in label_map:
            label_map[label] = next_code
            next_code += 1
        row[col_idx] = str(label_map[label])
    
    print("Label Encoding:")
    for label, code in label_map.items():
        print(f"{code} → {label}")
    
    return rows

In [4]:
def convert_to_float(rows, header, target_col):
    target_idx = header.index(target_col)
    X, y = [], []

    for row in rows:
        try:
            x_row = [1.0]  # Bias term
            for i, val in enumerate(row):
                if i != target_idx:
                    x_row.append(float(val))
            y_val = float(row[target_idx])
            X.append(x_row)
            y.append(y_val)
        except:
            continue  

    return X, y

In [5]:
def split_manual(X, y, test_ratio=0.2):
    combined = list(zip(X, y))
    random.shuffle(combined)
    split = int(len(combined) * (1 - test_ratio))
    train = combined[:split]
    test = combined[split:]
    X_train, y_train = zip(*train)
    X_test, y_test = zip(*test)
    return list(X_train), list(X_test), list(y_train), list(y_test)

In [6]:
def transpose(matrix):
    return [list(row) for row in zip(*matrix)]

def matmul(A, B):
    result = [[0 for _ in range(len(B[0]))] for _ in range(len(A))]
    for i in range(len(A)):
        for j in range(len(B[0])):
            for k in range(len(B)):
                result[i][j] += A[i][k] * B[k][j]
    return result

In [7]:
def inverse(matrix):
    n = len(matrix)
    identity = [[float(i == j) for i in range(n)] for j in range(n)]
    for i in range(n):
        factor = matrix[i][i]
        for j in range(n):
            matrix[i][j] /= factor
            identity[i][j] /= factor
        for k in range(n):
            if k != i:
                factor = matrix[k][i]
                for j in range(n):
                    matrix[k][j] -= factor * matrix[i][j]
                    identity[k][j] -= factor * identity[i][j]
    return identity

In [8]:
def train_linear_regression(X, y):
    X_T = transpose(X)
    XTX = matmul(X_T, X)
    XTy = matmul(X_T, [[val] for val in y])
    XTX_inv = inverse(XTX)
    beta = matmul(XTX_inv, XTy)
    return [b[0] for b in beta]

In [9]:
def predict(X, beta):
    predictions = []
    for row in X:
        pred = sum(coef * val for coef, val in zip(beta, row))
        predictions.append(pred)
    return predictions

In [10]:
def mean_squared_error(y_true, y_pred):
    n = len(y_true)
    return sum((yt - yp) ** 2 for yt, yp in zip(y_true, y_pred)) / n


In [11]:
def main():
    header, *rows = load_csv("/kaggle/input/california-housing-prices/housing.csv")

    rows = label_encode_column(rows, header, "ocean_proximity") # label encoding the ocean proximity column

    X, y = convert_to_float(rows, header, "median_house_value")

    X_train, X_test, y_train, y_test = split_manual(X, y)

    beta = train_linear_regression(X_train, y_train)

    y_pred = predict(X_test, beta)

    # evaluation using mean squared error
    mse = mean_squared_error(y_test, y_pred)
    print(f"\nMean Squared Error: {mse:.2f}")

if __name__ == "__main__":
    main()

Label Encoding:
0 → NEAR BAY
1 → <1H OCEAN
2 → INLAND
3 → NEAR OCEAN
4 → ISLAND

Mean Squared Error: 5011916201.41


# PART 2

# Optimization using numpy

In [12]:
import csv
import numpy as np

def load_csv_numpy(filepath):
    with open(filepath, newline='') as f:
        reader = csv.reader(f)
        header = next(reader)
        data = [row for row in reader if all(row)]
    return header, data

In [13]:
def label_encode_column_numpy(data, header, column_name):
    col_idx = header.index(column_name)
    labels = [row[col_idx] for row in data]
    unique_labels = sorted(set(labels))  # Sorted for reproducibility
    label_map = {label: i for i, label in enumerate(unique_labels)}
    
    for row in data:
        row[col_idx] = str(label_map[row[col_idx]])
    
    print("Label encoding map:")
    for code, label in enumerate(unique_labels):
        print(f"{code} → {label}")
    
    return data

In [14]:
def preprocess_data_numpy(data, header, target_column):
    target_idx = header.index(target_column)
    X, y = [], []

    for row in data:
        try:
            row_floats = list(map(float, row))
            y.append(row_floats[target_idx])
            X.append([1.0] + row_floats[:target_idx] + row_floats[target_idx+1:])  # bias term
        except ValueError:
            continue

    return np.array(X), np.array(y)

In [15]:
def train_test_split_numpy(X, y, test_ratio=0.2):
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    split = int((1 - test_ratio) * len(indices))
    train_idx, test_idx = indices[:split], indices[split:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

In [16]:
def linear_regression_normal_eq(X, y):
    XT_X = X.T @ X
    XT_y = X.T @ y
    beta = np.linalg.pinv(XT_X) @ XT_y  # Used pseudo-inverse for stability
    return beta

In [34]:
def main():
    header, data = load_csv_numpy("/kaggle/input/california-housing-prices/housing.csv")
    data = label_encode_column_numpy(data, header, "ocean_proximity")
    X, y = preprocess_data_numpy(data, header, "median_house_value")

    X_train, X_test, y_train, y_test = train_test_split_numpy(X, y)

    beta = linear_regression_normal_eq(X_train, y_train)

    y_pred = X_test @ beta

    mse = np.mean((y_test - y_pred) ** 2)
    print(f"\nMean Squared Error: {mse:.2f}")

if __name__ == "__main__":
    main()

Label encoding map:
0 → <1H OCEAN
1 → INLAND
2 → ISLAND
3 → NEAR BAY
4 → NEAR OCEAN

Mean Squared Error: 48732198833.06


## Adding some more feature engineering to reduce Mean Squared Error


In [33]:
def load_csv_numpy(filepath):
    with open(filepath, newline='') as f:
        reader = csv.reader(f)
        header = next(reader)
        data = [row for row in reader if all(row)]  # Removing rows with empty values
    return header, data


def feature_engineering_numpy(data, header):
    idx_total_rooms = header.index("total_rooms")
    idx_households = header.index("households")
    idx_population = header.index("population")
    idx_total_bedrooms = header.index("total_bedrooms")

    new_data = []
    for row in data:
        try:
            tr = float(row[idx_total_rooms])
            hh = float(row[idx_households])
            pop = float(row[idx_population])
            tb = float(row[idx_total_bedrooms])

            rooms_per_household = tr / hh if hh != 0 else 0
            population_per_household = pop / hh if hh != 0 else 0
            bedrooms_per_room = tb / tr if tr != 0 else 0

            row.extend([str(rooms_per_household), str(population_per_household), str(bedrooms_per_room)])
            new_data.append(row)
        except ValueError:
            continue

    header.extend(["rooms_per_household", "population_per_household", "bedrooms_per_room"])
    return header, new_data


def main():
    filepath = "/kaggle/input/california-housing-prices/housing.csv"
    header, data = load_csv_numpy(filepath)
    data = label_encode_column_numpy(data, header, "ocean_proximity")
    header, data = feature_engineering_numpy(data, header)
    X, y = preprocess_data_numpy(data, header, "median_house_value")
    X_train, X_test, y_train, y_test = train_test_split_numpy(X, y)
    beta = linear_regression_normal_eq(X_train, y_train)
    y_pred = X_test @ beta
    mse = np.mean((y_test - y_pred) ** 2)
    print(f"\nMean Squared Error: {mse:.2f}")

if __name__ == "__main__":
    main()


Label encoding map:
0 → <1H OCEAN
1 → INLAND
2 → ISLAND
3 → NEAR BAY
4 → NEAR OCEAN

Mean Squared Error: 47118492515.33


# PART 3

## Training using linear regression from scikit learn

In [19]:

from sklearn.linear_model import LinearRegression 

def load_csv_numpy(filepath):
    with open(filepath, newline='') as f:
        reader = csv.reader(f)
        header = next(reader)
        data = [row for row in reader if all(row)]
    return header, data



def preprocess_data_numpy(data, header, target_column):
    target_idx = header.index(target_column)
    X, y = [], []

    for row in data:
        try:
            row_floats = list(map(float, row))
            y.append(row_floats[target_idx])
            features = row_floats[:target_idx] + row_floats[target_idx+1:]
            X.append(features)
        except ValueError:
            continue

    X = np.array(X)
    y = np.array(y)

    # Normalized features using Z-score
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    stds[stds == 0] = 1  
    X_norm = (X - means) / stds

    return X_norm, y


def main():
    filepath = "/kaggle/input/california-housing-prices/housing.csv"
    header, data = load_csv_numpy(filepath)
    data = label_encode_column_numpy(data, header, "ocean_proximity")
    header, data = feature_engineering_numpy(data, header)
    X, y = preprocess_data_numpy(data, header, "median_house_value")
    X_train, X_test, y_train, y_test = train_test_split_numpy(X, y)

    
    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = np.mean((y_test - y_pred) ** 2)
    print(f"\nMean Squared Error (sklearn): {mse:.2f}")

if __name__ == "__main__":
    main()


Label encoding map:
0 → <1H OCEAN
1 → INLAND
2 → ISLAND
3 → NEAR BAY
4 → NEAR OCEAN

Mean Squared Error (sklearn): 4692023724.73
