In [6]:
# Test date: 21 Jan 2025
# Code from Google AI Studio generated using the following prompt
"""
Attached is a csv file called houses99.txt and it is delimited
by ",".  The first row is a header.  The remaining rows contain
the numerical data.  The first four columns contain the input
features, X_train, which are for predicting the house prices.
The fifth column contains the house prices in units of 
1000's of dollars, y_train. 
We wish to fit a linear model y = w.X + b, where w are the weights,
b is the bias value and X is the input feature set and 
y is the output house price in dollars.
Please give a python code to determine the linear model 
for X_train and y_train using sklearn and the SGDRegressor.  
Use scaling for X_train.  
Please also include the code for reading X_train and y_train 
from the houses99.txt file.  
Using this code, determine the weights and bias and show the model.  
Calculate the weights and the bias using this code, and give the model.  
Print the mean and standard deviation, for each column in X_train.  
Finally, predict the house price for a new feature set [1200, 3, 1, 40].
Give the scaled values for this feature set.  
Also, provide the python code listing and let the 
print statements for numbers be to 8 decimal places.
"""

import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

def load_data(filename):
    """Load a comma-delimited numeric table and split it into features/target.

    The first line of the file is treated as a header and skipped. Every
    column except the last is returned as the feature matrix; the last
    column is returned as the target vector.

    Args:
        filename: Path (or file-like object) accepted by ``np.genfromtxt``.

    Returns:
        A tuple ``(X_train, y_train)`` where ``X_train`` has shape
        ``(n_rows, n_columns - 1)`` and ``y_train`` has shape ``(n_rows,)``.
    """
    data = np.genfromtxt(filename, delimiter=',', skip_header=1)
    # genfromtxt collapses a file with a single data row into a 1-D array,
    # which would make the 2-D slicing below raise IndexError. Promote it
    # back to one row so one-sample files load like any other.
    data = np.atleast_2d(data)
    X_train = data[:, :-1]
    y_train = data[:, -1]
    return X_train, y_train

def scale_features(X_train):
    """Standardize X_train column by column with a freshly fitted StandardScaler.

    Args:
        X_train: Feature matrix to fit the scaler on and to transform.

    Returns:
        A tuple ``(X_train_scaled, scaler)``: the transformed matrix and the
        fitted ``StandardScaler``, so the same scaling can be applied to new
        samples later.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler

def train_model(X_train_scaled, y_train):
    """Fit an SGDRegressor on already-scaled features and return it.

    Args:
        X_train_scaled: Scaled feature matrix.
        y_train: Target values, one per row of ``X_train_scaled``.

    Returns:
        The fitted ``SGDRegressor`` instance.
    """
    # Fixed hyperparameters; random_state pins the shuffling so repeated
    # runs produce the same fit.
    hyperparams = dict(max_iter=1000, tol=1e-3, eta0=0.01, random_state=42)
    regressor = SGDRegressor(**hyperparams)
    regressor.fit(X_train_scaled, y_train)
    return regressor

def print_statistics(X_train):
    """Print the mean and standard deviation of every column of X_train.

    One line is printed per column, numbered from 1, with both values
    formatted to 8 decimal places. Nothing is returned.

    Args:
        X_train: 2-D numpy array whose columns are summarized.
    """
    print("Mean and Standard Deviation for each column in X_train:")
    column_views = (X_train[:, j] for j in range(X_train.shape[1]))
    for number, column in enumerate(column_views, start=1):
        col_mean = np.mean(column)
        col_std = np.std(column)  # population std (numpy's default ddof=0)
        print(f"Column {number}: Mean={col_mean:.8f}, Std={col_std:.8f}")

def predict_price(model, scaler, new_feature):
    """Scale a single feature set and run it through the model.

    Args:
        model: Object exposing ``predict`` (here, the fitted regressor).
        scaler: Object exposing ``transform`` (here, the fitted scaler).
        new_feature: Sequence of raw feature values for one sample.

    Returns:
        A tuple ``(predicted_price, new_feature_scaled)``: the first element
        of the model's prediction and the scaled sample as returned by
        ``scaler.transform``.
    """
    # The scaler and model operate on 2-D input, so present the sample as a
    # single row.
    single_row = np.array(new_feature).reshape(1, -1)
    new_feature_scaled = scaler.transform(single_row)
    predictions = model.predict(new_feature_scaled)
    return predictions[0], new_feature_scaled

# Load the data
X_train, y_train = load_data('./data/houses99.txt')

# Scale the features; keep the fitted scaler so new samples get the same scaling
X_train_scaled, scaler = scale_features(X_train)

# Train the model on the scaled features
model = train_model(X_train_scaled, y_train)

# Get weights and bias
weights = model.coef_
bias = model.intercept_[0]  # intercept_ is an array; take its first (and only) element

# Print mean and standard deviation of the raw (unscaled) features
print_statistics(X_train)

# Print the model.
# NOTE: the model was fitted on the scaled features, so x1..xN in the printed
# equation stand for the scaled inputs, not the raw column values.
# The terms are joined with " + " so the equation no longer ends in a
# dangling "+" as it did when each term appended its own separator.
print("\nLinear Model:")
equation_terms = [f"{bias:.8f}"]
equation_terms.extend(f"{w:.8f} * x{i}" for i, w in enumerate(weights, start=1))
print("y = " + " + ".join(equation_terms))
print()  # blank line after the equation


# Prediction and Scaling for new feature set
new_feature = [1200, 3, 1, 40]
predicted_price, new_feature_scaled = predict_price(model, scaler, new_feature)

# Print results (per the generating prompt, prices are in units of 1000's of dollars)
print(f"Scaled values of the new feature set {new_feature}: {new_feature_scaled}")
print(f"Predicted house price for {new_feature}: {predicted_price:.8f}")

Mean and Standard Deviation for each column in X_train:
Column 1: Mean=1418.37373737, Std=411.61562893
Column 2: Mean=2.71717172, Std=0.65196523
Column 3: Mean=1.38383838, Std=0.48631932
Column 4: Mean=38.38383838, Std=25.77788069

Linear Model:
y = 363.16331343 + 110.28069076 * x1 + -21.13073419 * x2 + -32.54592156 * x3 + -38.01263833 * x4 + 

Scaled values of the new feature set [1200, 3, 1, 40]: [[-0.53052829  0.43380884 -0.78927234  0.06269567]]
Predicted house price for [1200, 3, 1, 40]: 318.79395493
