# Linear Regression

In [6]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Housing Data

In [8]:
# load the single-feature housing dataset and preview its first three rows
house1ft = pd.read_csv("House_1feature-1.csv")
house1ft.head(n=3)

Unnamed: 0,Square_feet,Price
0,1349,147703
1,897,178239
2,660,160805


In [10]:
# features stay 2-D (a one-column DataFrame); the target is a 1-D Series
x_house1 = house1ft[['Square_feet']]
y_house1 = house1ft['Price']
# preview both pieces to confirm the selection
display(x_house1.head(2), y_house1.head(2))

Unnamed: 0,Square_feet
0,1349
1,897


0    147703
1    178239
Name: Price, dtype: int64

In [31]:
# hold out a test set; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x_house1,
    y_house1,
    random_state=32,
)

In [16]:
# check our work: the train and test pieces together should account for
# every row of the initial feature set
print('initial features set size:', x_house1.shape)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

initial features set size: (36, 1)
x_train shape: (27, 1)
x_train shape: (9, 1)
y_train shape: (27,)
y_train shape: (9,)


In [18]:
# build the estimator and train it on the training split
# (fit returns the estimator itself, so the two steps can be chained)
lr = LinearRegression().fit(x_train, y_train)
# show the fitted estimator as the cell output
lr

LinearRegression()

In [19]:
# intercept value: the fitted constant term, i.e. the predicted price
# when Square_feet is 0
lr.intercept_

107137.65647029472

In [20]:
# coefficient value: one slope per feature (here only Square_feet),
# i.e. the change in predicted price per additional square foot
lr.coef_

array([71.92203044])

In [21]:
# regression equation (fitted intercept and slope rounded to 2 decimals)
# price = 107137.66 + 71.92(square_feet)

In [22]:
# prediction
# The model was fit on a DataFrame, so wrap the new samples in a DataFrame
# carrying the same column name(s). Passing a bare list of lists drops the
# feature name and makes recent scikit-learn versions warn that
# "X does not have valid feature names".
h1 = [1000]
h2 = [3000]
lr.predict(pd.DataFrame([h1, h2], columns=x_train.columns))



array([179059.68691461, 322903.74780325])

In [23]:
# use the equation to predict h1 price (h1 is 1000 square feet)
# The intercept and slope are typed in rounded to 3 decimals, so the result
# differs slightly from lr.predict, which uses the full-precision values.
107137.656 + 71.922*1000

179059.65600000002

In [25]:
# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so label the numbers accordingly.
print('LR R^2 on train: {:.3f}'.format(lr.score(x_train, y_train)))
print('LR R^2 on test: {:.3f}'.format(lr.score(x_test, y_test)))

LR acc on train: 0.763
LR acc on test: 0.682


## Boston Housing Data

In [28]:
# load the Boston housing data, using the first CSV column as the row index,
# and preview the first five rows (head's default)
boston = pd.read_csv('boston_housing_data-1.csv', index_col=0)
boston.head()

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,target_medv
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2
7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,12.43,22.9


In [29]:
# creating features and target sets
# target_medv is target, everything else is a feature
# Select by column name rather than position, so a change in column order
# cannot leak the target into the features or drop a real feature.
x_boston, y_boston = boston.drop(columns='target_medv'), boston['target_medv']
display(x_boston.head(3))
display(y_boston.head(3))

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94


ID
1    24.0
2    21.6
4    33.4
Name: target_medv, dtype: float64

In [32]:
# hold out a test set for the Boston data; fixed seed for reproducibility
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x_boston,
    y_boston,
    random_state=0,
)

In [33]:
# check our work: the train and test pieces together should account for
# every row of the initial feature set
print('initial features set size:', x_boston.shape)
print("x_train shape:", x_train1.shape)
print("x_test shape:", x_test1.shape)

print("y_train shape:", y_train1.shape)
print("y_test shape:", y_test1.shape)

initial features set size: (333, 12)
x_train shape: (249, 12)
x_train shape: (84, 12)
y_train shape: (249,)
y_train shape: (84,)


In [34]:
# build the Boston estimator and train it on the training split
# (fit returns the estimator itself, so the two steps can be chained)
lr1 = LinearRegression().fit(x_train1, y_train1)
# show the fitted estimator as the cell output
lr1

LinearRegression()

In [35]:
# intercept value: the fitted constant term, i.e. the model's prediction
# when every feature is 0
lr1.intercept_

41.72463933066163

In [36]:
# coefficient values: one fitted slope per feature column of x_train1
# (12 features in, 12 coefficients out)
lr1.coef_

array([-1.83853404e-01,  3.40723088e-02,  9.16637921e-02,  4.36098629e+00,
       -2.06471577e+01,  3.36616016e+00,  7.04582792e-03, -1.42586713e+00,
        3.41928111e-01, -1.23677528e-02, -7.83100065e-01, -6.93085300e-01])

In [None]:
# prediction
# lr1 was fit on all 12 Boston features, so every sample must supply all 12
# values. Take two held-out rows, kept as one-row DataFrames so the column
# names match what the model was trained on, and predict with the Boston
# model (lr1), not the single-feature house model (lr).
b1 = x_test1.iloc[[0]]
b2 = x_test1.iloc[[1]]
lr1.predict(pd.concat([b1, b2]))

In [38]:
# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so label the numbers accordingly.
print('LR R^2 on train: {:.3f}'.format(lr1.score(x_train1, y_train1)))
print('LR R^2 on test: {:.3f}'.format(lr1.score(x_test1, y_test1)))

LR acc on train: 0.730
LR acc on test: 0.670
