#### __Load the California housing dataset__

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Pull the California housing data; as_frame=True makes the returned Bunch
# expose pandas objects (.data, .target, .frame) rather than NumPy arrays.
housing = fetch_california_housing(as_frame=True)

# Feature matrix and regression target, unpacked in one step.
X, y = housing.data, housing.target

# Sanity check: (rows, features), then the feature names.
print(X.shape)
print(housing.feature_names)


(20640, 8)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


#### __Create a DataFrame from the loaded data, print first 5 entries__

In [80]:
# Work on a copy of the combined features-plus-target frame. Without the copy,
# df_house and housing.frame are the same object, so columns added to df_house
# in later cells would also appear on the loaded dataset.
df_house = housing.frame.copy()

print(df_house.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


#### __Add the target variable to the DataFrame as a 'Value' column, display 5 random entries__

In [81]:
# Store the target under a second name, 'Value'. The frame already carries the
# same series as 'MedHouseVal', so the two columns hold identical values.
df_house['Value'] = housing.target
# Display five randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run.
df_house.sample(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,Value
20582,3.7574,33.0,6.135678,1.18593,488.0,2.452261,38.9,-122.16,0.927,0.927
8785,3.6797,21.0,4.233418,1.042092,3240.0,2.066327,33.79,-118.32,2.711,2.711
19664,3.5547,25.0,6.172414,1.091954,882.0,3.37931,37.49,-120.82,1.224,1.224
8016,5.5767,36.0,5.993056,0.996528,820.0,2.847222,33.83,-118.11,2.181,2.181
14055,2.1292,23.0,3.538938,1.046018,2051.0,1.815044,32.75,-117.13,1.35,1.35


#### __Display the Column-name, data-type, non-null count for this dataframe__

In [82]:
# Summarize each column's name, non-null count and dtype, plus memory usage.
df_house.info()

<class 'pandas.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
 9   Value        20640 non-null  float64
dtypes: float64(10)
memory usage: 1.6 MB


#### __Split the data into train (0.8) and test (0.2) sets and print the shapes for verification__

In [83]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
df_house_train, df_house_test = train_test_split(df_house, test_size=0.2, random_state=42)

# Shapes of the full, train and test frames, one per line.
print(df_house.shape, df_house_train.shape, df_house_test.shape, sep="\n")

(20640, 10)
(16512, 10)
(4128, 10)


#### __Train the model using the input features__ 

In [84]:
from sklearn.linear_model import LinearRegression

# Features: every column except the two copies of the target.
X_train = df_house_train.drop(columns=["MedHouseVal", "Value"])
X_test = df_house_test.drop(columns=["MedHouseVal", "Value"])

# Target: the 'MedHouseVal' column.
y_train = df_house_train.loc[:, "MedHouseVal"]
y_test = df_house_test.loc[:, "MedHouseVal"]

# fit() returns the estimator itself, so construction and fitting are chained;
# the bare `model` on the last line displays the fitted estimator.
model = LinearRegression().fit(X_train, y_train)
model


0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


#### __Predictions on training data__

In [85]:
# In-sample predictions: score the same rows the model was fitted on.
y_train_pred = model.predict(X_train)

#### __Compute the R2 score on training data__

In [86]:
from sklearn.metrics import r2_score
# Coefficient of determination between the true and predicted training targets.
r2_train = r2_score(y_true=y_train, y_pred=y_train_pred)
print(r2_train)
# Also list the columns currently present in the full DataFrame.
print(df_house.columns)


0.6125511913966952
Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal', 'Value'],
      dtype='str')


#### __Compute the adjusted R2 score on training data__

#### __Predict the value for test data__

#### __Compute the R2 score on testing data__

#### __Compute the adjusted R2 score on testing data__

#### __Let us add 10 random columns to training data__

In [87]:
import numpy as np

# Seeded generator so the noise columns are reproducible.
rng = np.random.default_rng(42)

# Draw the ten noise vectors in order (random_1 first), one standard-normal
# value per training row, then attach them all with a single assign() call.
noise_columns = {
    f'random_{k}': rng.standard_normal(len(df_house_train))
    for k in range(1, 11)
}
df_house_train = df_house_train.assign(**noise_columns)

df_house_train.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal', 'Value', 'random_1', 'random_2',
       'random_3', 'random_4', 'random_5', 'random_6', 'random_7', 'random_8',
       'random_9', 'random_10'],
      dtype='str')

#### __Let us add 10 random columns to testing data__

In [88]:
# Use a seed different from the one used for the training noise (42).
# Re-seeding with the same value would replay that stream from its start,
# making the test noise a copy of the first training draws instead of
# independent random values.
rng = np.random.default_rng(43)
for i in range(1, 11):
    # One standard-normal value per test row for each noise column.
    df_house_test[f'random_{i}'] = rng.standard_normal(len(df_house_test))

df_house_test.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal', 'Value', 'random_1', 'random_2',
       'random_3', 'random_4', 'random_5', 'random_6', 'random_7', 'random_8',
       'random_9', 'random_10'],
      dtype='str')

#### __Check the shape of training and test data__

In [89]:
# Both frames should report the same number of columns after the noise columns were added.
df_house_train.shape, df_house_test.shape

((16512, 20), (4128, 20))

#### __Change the order of the columns in both train and test__

In [90]:
# Build the column order once and reuse it for both frames so train and test
# cannot drift apart. Generating the noise names replaces a hand-typed list
# that repeated 'random_5' and left out 'random_6'.
feature_cols = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
                'Population', 'AveOccup', 'Latitude', 'Longitude']
random_cols = [f'random_{i}' for i in range(1, 11)]

# Real features first, then the noise columns, with the target 'Value' last.
# 'MedHouseVal' duplicates 'Value' and is dropped here.
ordered_cols = feature_cols + random_cols + ['Value']

df_house_train = df_house_train[ordered_cols]
df_house_test = df_house_test[ordered_cols]

#### __Now again train the model, predict the result for train data, check the r2 score__

#### __Again compute the adjusted R2 score on training data__

#### __Again Compute the R2 score for test data__

#### __Again compute the adjusted R2 score on the test data__