In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
# Load the California housing dataset. The returned object is read later in
# this notebook both by attribute (housing.data, housing.target) and by key
# (housing['feature_names'], housing['target']).
housing = fetch_california_housing()

#### Task 1:
##### Having imported the data, write code to study the following data characteristics:
        a) number of rows and columns for the independent variables
        b) labels of the columns for the independent variables and their meaning
        c) target variable values and their meaning

In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset
housing = fetch_california_housing()

# Convert the feature matrix to a DataFrame labelled with the feature names
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)

# a) Number of rows and columns for the independent variables
num_rows, num_cols = df.shape
print("Number of rows for the independent variables:", num_rows)
print("Number of columns for the independent variables:", num_cols)

# b) Labels of the columns for the independent variables and their meaning
print("\nLabels of the columns for the independent variables and their meaning:")
print(df.head())

# c) Target variable values and their meaning
target = pd.DataFrame(data=housing.target, columns=['Target'])
print("\nTarget variable values and their meaning:")
print(target.head())

# The previews above show only labels and sample values. The meaning of each
# feature and of the target is documented in the description string bundled
# with the dataset, so print it to actually answer the "meaning" part of
# b) and c).
print("\nDataset description (meaning of each feature and of the target):")
print(housing.DESCR)


Number of rows for the independent variables: 20640
Number of columns for the independent variables: 8

Labels of the columns for the independent variables and their meaning:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

Target variable values and their meaning:
   Target
0   4.526
1   3.585
2   3.521
3   3.413
4   3.422


In [2]:
# Display the list of feature (column) names stored with the dataset
housing['feature_names']

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [3]:
# Display the target array stored with the dataset (one value per row)
housing['target']

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

#### Task 2:
##### Write the code to train prediction models using an 80/20 split between training and test data. Your code should also shuffle the rows before splitting.

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# X_trn / X_val are not defined by any earlier cell, so build the 80/20
# shuffled split here. This lets the cell run on a fresh kernel; it needs
# only the `housing` dataset loaded in the cells above.
X_trn, X_val = train_test_split(
    housing.data, test_size=0.2, random_state=42, shuffle=True
)

# Note: the scaler should be fit on the training set only
scaler = preprocessing.StandardScaler().fit(X_trn)

# Apply the scaling parameters to both the training set and the validation set
scaled_trnX = scaler.transform(X_trn)
scaled_valX = scaler.transform(X_val)

In [2]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Fetch the California housing data bundle
housing = fetch_california_housing()

# Feature matrix and target vector, unpacked in a single step
X, y = housing.data, housing.target

# Hold out 20% of the rows as the test set; the rows are shuffled with a
# fixed seed before the cut so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# X_train / y_train are now ready for model fitting; X_test / y_test are for evaluation


#### Task 2:
##### Having performed scaling for all values, you should develop the following regression models:
        a) a Linear Regression model by displaying its intercept, trained coefficients, RMSE score as fitness metric.
        b) a Stochastic Gradient Descent with Warm Restarts model, which is a variant of the stochastic gradient descent (SGD) optimisation algorithm commonly used in machine learning for training linear models, including linear regression models. You should display its intercept, trained coefficients, RMSE score as fitness metric. 
        b.1) For the model above, use at most 10 iterations and set both tol (the stopping tolerance) and eta0 (the initial learning rate) as you think appropriate; these are essential hyperparameters that need to be tuned carefully to balance convergence speed against solution quality.
        c) Prepare the data and develop a model of a higher degree polynomial, for instance, degree = 2. You should display its intercept, trained coefficients, RMSE score as fitness metric.

In [3]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Load the California housing dataset
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Split the data into training and test sets with a ratio of 80/20 and shuffling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Standardize the features: the scaler is fit on the training split only and
# the same transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def report_model(title, fitted_model, regressor=None):
    """Print the intercept, coefficients and test-set RMSE of a fitted model.

    Parameters
    ----------
    title : str
        Heading printed before the metrics.
    fitted_model : estimator
        Already-fitted object whose ``predict`` is called on ``X_test_scaled``.
    regressor : estimator, optional
        Object exposing ``intercept_`` and ``coef_``. Defaults to
        ``fitted_model``; pass the final step when ``fitted_model`` is a
        pipeline, since the pipeline itself is not read for these attributes.

    Returns
    -------
    tuple
        ``(predictions, rmse)`` computed against ``y_test``.
    """
    if regressor is None:
        regressor = fitted_model
    print(title)
    print("Intercept:", regressor.intercept_)
    print("Coefficients:", regressor.coef_)
    predictions = fitted_model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print("RMSE:", rmse)
    return predictions, rmse


# a) Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)
y_pred_linear, rmse_linear = report_model("Linear Regression Model:", linear_model)

# b) Stochastic Gradient Descent with Warm Restarts model
# random_state is fixed so the stochastic fit, and therefore the printed
# intercept / coefficients / RMSE, are the same on every run.
# NOTE(review): warm_start is not passed here, so the estimator's default
# applies - confirm whether the task expects it to be enabled.
sgd_model = SGDRegressor(max_iter=10, tol=1e-3, eta0=0.01, random_state=42)
sgd_model.fit(X_train_scaled, y_train)
y_pred_sgd, rmse_sgd = report_model(
    "\nStochastic Gradient Descent with Warm Restarts Model:", sgd_model
)

# c) Higher degree polynomial model (degree=2)
# The pipeline expands the scaled features into degree-2 polynomial terms and
# fits a linear regression on them; intercept and coefficients are read from
# the pipeline's final 'linearregression' step.
poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_model.fit(X_train_scaled, y_train)
y_pred_poly, rmse_poly = report_model(
    "\nHigher Degree Polynomial Regression Model (Degree=2):",
    poly_model,
    regressor=poly_model.named_steps['linearregression'],
)


Linear Regression Model:
Intercept: 2.071946937378619
Coefficients: [ 0.85438303  0.12254624 -0.29441013  0.33925949 -0.00230772 -0.0408291
 -0.89692888 -0.86984178]
RMSE: 0.7455813830127763

Stochastic Gradient Descent with Warm Restarts Model:
Intercept: [2.06829884]
Coefficients: [ 0.82409267  0.11960539 -0.25835617  0.30263456 -0.00337096 -0.01616009
 -0.96573075 -0.92032148]
RMSE: 0.7432874102972927

Higher Degree Polynomial Regression Model (Degree=2):
Intercept: 1.956590491804071
Coefficients: [ 1.31223617e-16  9.35940108e-01  1.32058017e-01 -3.87598691e-01
  5.30206745e-01  4.05134644e-02 -1.78126342e+00 -1.27267893e+00
 -1.16762990e+00 -1.12225581e-01  3.78458372e-02  1.79781162e-01
 -1.20151603e-01  1.11429958e-01 -9.88397827e-02 -6.67216348e-01
 -5.86169281e-01  3.32914038e-02 -1.62467226e-02  5.23448500e-02
  3.60251996e-02 -2.78667461e-01 -2.76779199e-01 -2.52812539e-01
  6.04024501e-02 -1.09586039e-01 -1.54739809e-01  5.77923765e-01
  5.43530824e-01  4.79070686e-01  4.954

