In [49]:
#Import Required Libraries
import numpy as np  
import pandas as pd  
import warnings
warnings.filterwarnings("ignore")  # Suppress all warnings

from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LinearRegression  
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [50]:
# Load the Boston housing dataset from an online source.
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# The separator is a regex, so it is written as a raw string: in a plain
# literal "\s" is an invalid escape sequence, which Python reports as a
# DeprecationWarning/SyntaxWarning (and the notebook silences all warnings).
# skiprows=22 skips the first 22 lines of the file before parsing.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Reshape the dataset: every record is spread over two consecutive parsed rows.
# Even rows supply the leading feature values, the first two values of the
# following odd row complete the 13 features, and the third value of the odd
# row is the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Convert to DataFrame with one named column per feature
columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"
]
df = pd.DataFrame(data, columns=columns)
df["PRICE"] = target  # Add the target variable

# Display first 5 rows
df.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [51]:
# Dimensions of the assembled frame as (rows, columns)
df.shape

(506, 14)

In [52]:
# Count missing values per column
df.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
PRICE      0
dtype: int64

In [73]:
# Separate the dependent variable (y) from the independent variables (X)
y = df["PRICE"]               # target: the PRICE column
X = df.drop(columns="PRICE")  # features: every column except PRICE


In [79]:
# Display the feature matrix
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [81]:
# Display the target series (PRICE)
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: PRICE, Length: 506, dtype: float64

In [75]:
# Split dataset: hold out 20% of the rows for testing; the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [55]:
# Instantiate a linear regression estimator with default settings
model = LinearRegression()

# Fit it on the training split; fit() is left as the cell's last expression
# so the notebook still displays the fitted estimator.
model.fit(X=X_train, y=y_train)


In [67]:
# Model evaluation on the held-out test split.
# y_pred is computed here so this cell runs on a fresh kernel: the only other
# assignment of y_pred is in a later cell, so without this line the metrics
# raise NameError under Restart Kernel -> Run All.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)  # computed once and reused for RMSE
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)


Mean Absolute Error (MAE): 3.189091965887852
Mean Squared Error (MSE): 24.291119474973613
Root Mean Squared Error (RMSE): 4.928602182665346
R² Score: 0.6687594935356307


In [69]:
# Predict PRICE for the test data (one predicted value per test row)
y_pred = model.predict(X_test)

# Compare actual vs predicted side by side; because y_test is a Series,
# the resulting frame keeps the original row labels of the test rows.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results

Unnamed: 0,Actual,Predicted
173,23.6,28.996724
274,32.4,36.025565
491,13.6,14.816944
72,22.8,25.031979
452,16.1,18.769880
...,...,...
412,17.9,-0.164237
436,9.6,13.684867
411,17.2,16.183597
86,22.5,22.276220


In [None]:
'''
Explanation

1) sklearn
sklearn, short for scikit-learn, is a powerful machine learning library in Python.
It provides easy-to-use tools for:
Building ML models (like Linear Regression, Decision Trees, etc.)
Splitting data
Measuring model performance
Preprocessing data (scaling, encoding, etc.)

2) Mean Absolute Error (MAE)
📌 Average of absolute differences between actual and predicted values.
✅ Lower = better accuracy.
🧠 "On average, how wrong are the predictions?"

3) Mean Squared Error (MSE)
📌 Average of squared differences between actual and predicted values.
✅ Penalizes larger errors more than MAE.
🧠 "How far off are predictions, especially big mistakes?"

4) Root Mean Squared Error (RMSE)
📌 Square root of MSE.
✅ Same units as the target variable, so it is easier to interpret than MSE.
🧠 "Typical size of the error."

5) R² Score (R-squared)
📌 Tells how well the model explains the variance in the data.
Range: at most 1; usually between 0 and 1, but negative when the model fits worse than always predicting the mean of the target.
✅ Closer to 1 = better fit
🧠 "How well does my model explain the actual data?"
'''