## Data Analytics I
Create a Linear Regression Model using Python/R to predict home prices using Boston Housing
Dataset (https://www.kaggle.com/c/boston-housing). The Boston Housing dataset contains
information about various houses in Boston through different parameters. There are 506 samples
and 14 feature variables in this dataset.
The objective is to predict the value of prices of the house using the given features.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets 

In [2]:
df = pd.read_csv('HousingData.csv')

In [3]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 non-null    float64
 3   CHAS     486 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      486 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    486 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [5]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,486.0,486.0,486.0,486.0,506.0,506.0,486.0,506.0,506.0,506.0,506.0,506.0,486.0,506.0
mean,3.611874,11.211934,11.083992,0.069959,0.554695,6.284634,68.518519,3.795043,9.549407,408.237154,18.455534,356.674032,12.715432,22.532806
std,8.720192,23.388876,6.835896,0.25534,0.115878,0.702617,27.999513,2.10571,8.707259,168.537116,2.164946,91.294864,7.155871,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.0819,0.0,5.19,0.0,0.449,5.8855,45.175,2.100175,4.0,279.0,17.4,375.3775,7.125,17.025
50%,0.253715,0.0,9.69,0.0,0.538,6.2085,76.8,3.20745,5.0,330.0,19.05,391.44,11.43,21.2
75%,3.560263,12.5,18.1,0.0,0.624,6.6235,93.975,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [6]:
df.keys()

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [7]:
df.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [8]:
df['CRIM'].fillna(df['CRIM'].mean(),inplace=True)
df['ZN'].fillna(df['ZN'].mean(),inplace=True)
df['INDUS'].fillna(df['INDUS'].mean(),inplace=True)
df['CHAS'].fillna(df['CHAS'].mean(),inplace=True)
df['AGE'].fillna(df['AGE'].mean(),inplace=True)
df['LSTAT'].fillna(df['LSTAT'].mean(),inplace=True)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = df.iloc[:,0:13]
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.200000,4.0900,1,296,15.3,396.90,4.980000
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.900000,4.9671,2,242,17.8,396.90,9.140000
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.100000,4.9671,2,242,17.8,392.83,4.030000
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.800000,6.0622,3,222,18.7,394.63,2.940000
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.200000,6.0622,3,222,18.7,396.90,12.715432
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.100000,2.4786,1,273,21.0,391.99,12.715432
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.700000,2.2875,1,273,21.0,396.90,9.080000
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.000000,2.1675,1,273,21.0,396.90,5.640000
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.300000,2.3889,1,273,21.0,393.45,6.480000


In [11]:
y = df.iloc[:,-1]
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64

In [12]:
# X_train, X_test, y_train, y_test = train_test_split(
# ...     X, y, test_size=0.33, random_state=42)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 20)

In [13]:
x_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
7,0.14455,12.500000,7.87,0.000000,0.524,6.172,96.100000,5.9505,5,311,15.2,396.90,19.150000
268,0.54050,20.000000,3.97,0.000000,0.575,7.470,52.600000,2.8720,5,264,13.0,390.30,3.160000
80,0.04113,25.000000,4.86,0.000000,0.426,6.727,33.500000,5.4007,4,281,19.0,396.90,5.290000
30,1.13081,0.000000,8.14,0.000000,0.538,5.713,94.100000,4.2330,4,307,21.0,360.17,22.600000
450,6.71772,0.000000,18.10,0.069959,0.713,6.749,92.600000,2.3236,24,666,20.2,0.32,17.440000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,18.81100,0.000000,18.10,0.000000,0.597,4.628,100.000000,1.5539,24,666,20.2,28.79,34.370000
223,0.61470,0.000000,6.20,0.000000,0.507,6.618,80.800000,3.2721,8,307,17.4,396.90,7.600000
271,0.16211,20.000000,6.96,0.000000,0.464,6.240,16.300000,4.4290,3,223,18.6,396.90,12.715432
474,8.05579,0.000000,18.10,0.000000,0.584,5.427,95.400000,2.4298,24,666,20.2,352.58,18.140000


In [14]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
model = lr.fit(x_train, y_train)

In [15]:
y_pred = model.predict(x_test) 

In [16]:
 for i, j in zip(y_test, y_pred):
    print(i,"  ", j)

21.2    21.268392513467774
20.6    26.993043557763727
21.5    19.805507879815604
21.7    24.558867639930895
13.4    12.955139233457217
20.4    20.007340519469473
20.0    18.161464595348797
5.0    7.315351613615029
14.5    18.761472701977038
24.0    25.138769716415904
36.2    27.65564593531179
23.1    16.675144326183393
23.0    23.963828135424055
21.0    21.585888968573617
19.5    20.315669032410746
24.7    25.20548579352954
32.2    31.522649456302844
14.6    19.090509314905624
14.3    14.355863897662829
17.5    17.486835597442393
37.6    32.850089313063684
50.0    42.46174508421991
22.6    27.40511853288353
30.3    32.67655886446136
23.3    26.97362912020617
22.1    26.766801291396312
18.2    20.125495209289817
23.1    25.18435432651077
21.0    21.491743209748716
21.9    24.59707661932507
27.5    20.81517096493789
19.9    17.354186858058707
20.3    22.19481821001884
7.2    9.928626958549941
29.4    30.511108615424185
18.6    20.48890254919555
50.0    42.70977287609262
22.2    22.475021

In [17]:
from sklearn.metrics import mean_squared_error,accuracy_score
mse = mean_squared_error(y_test, y_pred)

In [18]:
mse

17.49004909694823

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 20)

In [20]:
x_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
388,14.33370,0.000000,18.10,0.069959,0.700,4.880,100.000000,1.5895,24,666,20.2,372.92,30.620000
424,8.79212,0.000000,18.10,0.000000,0.584,5.565,70.600000,2.0635,24,666,20.2,3.65,17.160000
45,0.17142,0.000000,6.91,0.000000,0.448,5.682,33.800000,5.1004,3,233,17.9,396.90,10.210000
470,4.34879,0.000000,18.10,0.000000,0.580,6.167,84.000000,3.0334,24,666,20.2,396.90,16.290000
28,0.77299,0.000000,8.14,0.000000,0.538,6.495,94.400000,4.4547,4,307,21.0,387.94,12.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,18.81100,0.000000,18.10,0.000000,0.597,4.628,100.000000,1.5539,24,666,20.2,28.79,34.370000
223,0.61470,0.000000,6.20,0.000000,0.507,6.618,80.800000,3.2721,8,307,17.4,396.90,7.600000
271,0.16211,20.000000,6.96,0.000000,0.464,6.240,16.300000,4.4290,3,223,18.6,396.90,12.715432
474,8.05579,0.000000,18.10,0.000000,0.584,5.427,95.400000,2.4298,24,666,20.2,352.58,18.140000


In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
lr = LinearRegression()

In [23]:
lr.fit(x_train,y_train)

In [24]:
y_pred = lr.predict(x_test)

In [25]:
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
mse = mean_squared_error(y_pred,y_test)
rmse=np.sqrt(mse)
msa = mean_absolute_error(y_pred,y_test)
r2 = r2_score(y_pred,y_test)

In [26]:
mse

26.82513703251621

In [27]:
msa

3.4891412466083067

In [28]:
rmse

5.179298893915682

In [29]:
r2

0.4450892770217184