In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv("boston_housing.csv")

In [3]:
df.shape #dimensions of dataset

(506, 14)

In [4]:
df.describe() #stats like mean, min, max, etc

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,price
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [5]:
df.head() #dataset features and attribute information

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [6]:
df_x = df.drop(columns=["price"])
df_y = df["price"]

In [7]:
#Initialize the linear regression model
reg = linear_model.LinearRegression()

In [8]:
#Split the data into 67% and 33% testing data
#We split the dependent variables (x) and the target or independent variable (y)
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.33, random_state=42)
# random state ensures that the split is reproducible, meaning that the same split will be obtained every time the code is run with this seed value. Just a convention to use no. 42

In [9]:
#Train our model with the training data 
reg.fit(x_train, y_train)

In [10]:
#Print the coefficients/ weights for each feature/column of our model 
print(reg.coef_) #weights for each feature
#A higher absolute value of a coefficient indicates a stronger influence of that feature on the target variable. Positive coefficients suggest a direct relationship, where an increase in the feature value leads to an increase in the target variable, while negative coefficients indicate an inverse relationship.

[-1.28749718e-01  3.78232228e-02  5.82109233e-02  3.23866812e+00
 -1.61698120e+01  3.90205116e+00 -1.28507825e-02 -1.42222430e+00
  2.34853915e-01 -8.21331947e-03 -9.28722459e-01  1.17695921e-02
 -5.47566338e-01]


In [11]:
#print our price predicitions on our test data 
y_pred = reg.predict(x_test)
print(y_pred)

[28.53469469 36.6187006  15.63751079 25.5014496  18.7096734  23.16471591
 17.31011035 14.07736367 23.01064388 20.54223482 24.91632351 18.41098052
 -6.52079687 21.83372604 19.14903064 26.0587322  20.30232625  5.74943567
 40.33137811 17.45791446 27.47486665 30.2170757  10.80555625 23.87721728
 17.99492211 16.02608791 23.268288   14.36825207 22.38116971 19.3092068
 22.17284576 25.05925441 25.13780726 18.46730198 16.60405712 17.46564046
 30.71367733 20.05106788 23.9897768  24.94322408 13.97945355 31.64706967
 42.48057206 17.70042814 26.92507869 17.15897719 13.68918087 26.14924245
 20.2782306  29.99003492 21.21260347 34.03649185 15.41837553 25.95781061
 39.13897274 22.96118424 18.80310558 33.07865362 24.74384155 12.83640958
 22.41963398 30.64804979 31.59567111 16.34088197 20.9504304  16.70145875
 20.23215646 26.1437865  31.12160889 11.89762768 20.45432404 27.48356359
 10.89034224 16.77707214 24.02593714  5.44691807 21.35152331 41.27267175
 18.13447647  9.8012101  21.24024342 13.02644969 21.

In [12]:
#print the predicted price and actual price of houses from the testing data set row 0
print(y_pred[0])


print(y_test[0])

28.534694689729875
24.0


In [13]:
#Model is pretty good, but to check model's performance and accuracy, we will use mean squared error (MSE)
print(np.mean((y_pred-y_test)**2))


#It measures the average of the squares of the errors, which are the differences between the predicted values (y_pred) and the actual values (y_test).
#By squaring these residuals, we ensure that all error values are positive and that larger errors have a disproportionately higher impact on the final metric.

#Lower values of MSE indicate better model performance. However, it is important to compare the MSE to the range of the target variable.


20.724023437339817
