# Linear Regression Hands-on: Handling Overfitting

In [2]:
# prompt: import library to remove worning

import warnings
warnings.filterwarnings('ignore')


### Import dataset

In [3]:
# Load the Melbourne housing CSV directly from GitHub into a DataFrame.

import pandas as pd

# Location of the raw CSV file
DATA_URL = 'https://raw.githubusercontent.com/pritkudale/ML-for-Teachers/refs/heads/main/Linear%20Regression/Melbourne_housing_FULL.csv'
df = pd.read_csv(DATA_URL)




### Explore dataset

In [4]:
# Preview the first five rows (the default for head()) to verify the import
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [5]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(34857, 21)

### Check for missing values

In [6]:
# Count the missing values in each column of the raw data

df.isna().sum()


Unnamed: 0,0
Suburb,0
Address,0
Rooms,0
Type,0
Price,7610
Method,0
SellerG,0
Date,0
Distance,1
Postcode,1


### Select important columns

In [7]:
# Columns kept for modelling: the predictors plus 'Price', which a later cell
# uses as the regression target.
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 'BuildingArea',
               'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',  'Price']

# Take an explicit copy: the imputation cells further down assign back into
# columns of `df`, and doing that on a bare selection of another DataFrame can
# trigger pandas' SettingWithCopyWarning (silenced here by the global warnings
# filter, so the problem would go unnoticed).
df = df[cols_to_use].copy()

In [8]:
# Missing values per column after narrowing to the selected columns
df.isna().sum()

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Method,0
SellerG,0
Regionname,3
Propertycount,3
BuildingArea,21115
Distance,1
CouncilArea,3


### Impute missing values

In [9]:
# Impute missing values for 'Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car'

In [10]:
# Fill missing entries with 0 in the columns listed below

zero_fill_cols = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

In [11]:
# Missing values per column after the zero imputation
df.isna().sum()

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Method,0
SellerG,0
Regionname,3
Propertycount,0
BuildingArea,21115
Distance,0
CouncilArea,3


In [12]:

# Impute 'Landsize' and 'BuildingArea' with their respective column means
for col in ('Landsize', 'BuildingArea'):
    df[col] = df[col].fillna(df[col].mean())

### Drop null Values

In [13]:
# Discard every row that still contains at least one missing value
df = df.dropna()

# Confirm that no nulls remain in any column
df.isna().sum()

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Method,0
SellerG,0
Regionname,0
Propertycount,0
BuildingArea,0
Distance,0
CouncilArea,0


In [14]:
# Preview the data after imputation and dropping the remaining null rows
df.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,BuildingArea,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,Price
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,160.2564,2.5,Yarra City Council,2.0,1.0,1.0,202.0,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,79.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,1035000.0
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,150.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,1465000.0
5,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,160.2564,2.5,Yarra City Council,3.0,2.0,1.0,94.0,850000.0
6,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,142.0,2.5,Yarra City Council,3.0,1.0,2.0,120.0,1600000.0


In [15]:
# Convert the categorical columns to one-hot (indicator) columns

import pandas as pd

# Columns to expand; drop_first=True keeps k-1 indicator columns per feature
# by removing the first level of each.
categorical_cols = ['Suburb', 'Type', 'Method', 'SellerG', 'Regionname', 'CouncilArea']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

df.head()

Unnamed: 0,Rooms,Propertycount,BuildingArea,Distance,Bedroom2,Bathroom,Car,Landsize,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,160.2564,2.5,2.0,1.0,1.0,202.0,1480000.0,False,...,False,False,False,False,False,False,False,False,True,False
2,2,4019.0,79.0,2.5,2.0,1.0,0.0,156.0,1035000.0,False,...,False,False,False,False,False,False,False,False,True,False
4,3,4019.0,150.0,2.5,3.0,2.0,0.0,134.0,1465000.0,False,...,False,False,False,False,False,False,False,False,True,False
5,3,4019.0,160.2564,2.5,3.0,2.0,1.0,94.0,850000.0,False,...,False,False,False,False,False,False,False,False,True,False
6,4,4019.0,142.0,2.5,3.0,1.0,2.0,120.0,1600000.0,False,...,False,False,False,False,False,False,False,False,True,False


### Data processing

In [16]:
# Separate the regression target ('Price') from the predictor columns
target_col = 'Price'
y = df[target_col]
X = df.drop(columns=target_col)

In [17]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [18]:
# Fit a plain (unregularized) linear regression on the training split

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained
lr_model = LinearRegression().fit(X_train, y_train)

# Predicted prices for the held-out test rows
y_pred = lr_model.predict(X_test)

In [19]:
# Score (R^2) of the linear model on the held-out test split
test_r2 = lr_model.score(X_test, y_test)
print(test_r2)


0.1385368316152532


In [20]:
# Score (R^2) of the linear model on the data it was trained on
train_r2 = lr_model.score(X_train, y_train)
print(train_r2)

0.6827792395792723


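The training score (≈ 0.68) is far higher than the test score (≈ 0.14): the model fits the training data well but generalizes poorly to unseen data. This is an example of overfitting.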

# Regularization

#### Using Lasso (L1 Regularized) Regression Model

In [21]:
# Lasso regression: a linear model with an L1 penalty on the coefficients

from sklearn.linear_model import Lasso

# alpha controls the penalty strength; adjust as needed
lasso_reg = Lasso(alpha=50, max_iter=100, tol=0.1).fit(X_train, y_train)

# Predicted prices for the held-out test rows
y_pred_lasso = lasso_reg.predict(X_test)

# Print the test-set score first, then the training-set score
for features, target in ((X_test, y_test), (X_train, y_train)):
    print(lasso_reg.score(features, target))

0.6637022593165034
0.6766957310228212


In [22]:
# Fitted Lasso coefficients; entries that are exactly 0 are features the model dropped
lasso_reg.coef_

array([ 2.70974843e+05,  4.62819176e+00,  8.62076499e+01, -3.03547043e+04,
       -8.48981182e+04,  1.23225330e+05,  4.17509554e+04,  2.56283629e+00,
        2.19205584e+05, -1.19998723e+05, -0.00000000e+00,  3.23968886e+05,
        1.55449558e+05,  2.97265138e+05,  1.38242239e+05, -9.39459868e+04,
       -1.30058547e+05, -0.00000000e+00,  1.51713227e+05, -9.60817846e+04,
       -1.73537614e+04, -0.00000000e+00,  2.24652131e+04, -1.43407352e+05,
       -0.00000000e+00, -2.72253165e+05,  0.00000000e+00, -3.52235351e+04,
        2.31160039e+05,  6.09296794e+04,  3.20576647e+04, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  1.90263151e+05, -1.90736163e+05,
        3.37246462e+04, -8.76829548e+04,  5.14483845e+04,  2.57712195e+05,
        1.86326616e+05, -5.57560772e+03, -0.00000000e+00,  6.25965997e+04,
        1.71304904e+04,  0.00000000e+00,  2.46027093e+05, -1.20834770e+05,
       -0.00000000e+00,  4.88797682e+05, -0.00000000e+00,  1.27239061e+04,
        0.00000000e+00,  

#### Using Ridge (L2 Regularized) Regression Model

In [23]:
# Ridge regression: a linear model with an L2 penalty on the coefficients

from sklearn.linear_model import Ridge

# alpha controls the penalty strength.
# NOTE(review): max_iter and tol only matter for Ridge's iterative solvers; with
# the default solver='auto' a direct solver may be chosen and ignore them — confirm.
ridge_reg = Ridge(alpha=50, max_iter=100, tol=0.1).fit(X_train, y_train)

# Predicted prices for the held-out test rows
y_pred_ridge = ridge_reg.predict(X_test)

# Print the test-set score first, then the training-set score
for features, target in ((X_test, y_test), (X_train, y_train)):
    print(ridge_reg.score(features, target))

0.6670848945194958
0.6622376739684328


In [24]:
# Fitted Ridge coefficients; unlike the Lasso output, they are mostly non-zero
ridge_reg.coef_

array([ 2.74565399e+05,  1.43900376e+00,  3.48784768e+01, -3.08679934e+04,
       -8.54802356e+04,  1.30784473e+05,  3.79897031e+04,  3.01027203e+00,
        1.28807037e+05, -6.43936116e+04, -2.20191147e+04,  1.41225935e+05,
        6.01541283e+04,  1.36604921e+05,  6.82450737e+04, -4.48758586e+04,
       -1.18261081e+05, -1.40375061e+04,  9.47702189e+04, -5.29144382e+04,
       -4.41704919e+04, -4.98005912e+04,  2.40852122e+04, -3.83936672e+04,
       -1.58353300e+04, -1.68201754e+05,  8.08493825e+03, -6.27880702e+04,
        1.48801899e+05,  3.01077775e+04,  2.59806529e+04, -1.38083800e+04,
        1.17008911e+04,  9.42462899e+02,  7.84154905e+04, -6.16676997e+04,
        3.86961098e+04, -5.72485218e+04,  4.67434700e+04,  6.78005371e+04,
        7.05287853e+04, -3.93760752e+04, -2.74948996e+04,  2.49850015e+04,
        2.99162838e+04,  0.00000000e+00,  1.14984883e+05, -7.42920919e+04,
       -1.36014255e+04,  3.69878814e+05, -4.72146099e+04, -8.03342124e+03,
        2.80914008e+03,  