## Getting the data

In [None]:
import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found beneath it.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the housing CSV from the Kaggle input directory into a DataFrame.
housing_csv_path = '../input/bostonhoustingmlnd/housing.csv'
data = pd.read_csv(housing_csv_path)

In [None]:
# Preview the first five rows (five is also pandas' default for head()).
data.head(n=5)

In [None]:
# Print pandas' concise summary of the `data` DataFrame (DataFrame.info).
data.info()

In [None]:
# Display pandas' descriptive statistics for `data` (DataFrame.describe).
data.describe()

## Checking if any column has null values

In [None]:
# True if at least one cell anywhere in `data` is missing.
data.isna().to_numpy().any()

In [None]:
# Visualise missing values: one heatmap cell per DataFrame cell, coloured by
# whether that value is null. Row tick labels and the colour bar are hidden.
null_mask = data.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

## Exploratory Data Analysis

In [None]:
# Global seaborn look for every plot below: white background with grid lines,
# and the dark green-blue "GnBu_d" colour palette.
sns.set_style("whitegrid")
sns.set_palette("GnBu_d")

In [None]:
# Pairwise plots for every pair of columns in `data`; each facet is 2 inches tall.
sns.pairplot(data=data, height=2)

In [None]:
# Pairwise correlation matrix of the columns, drawn as an annotated heatmap
# so each coefficient is printed inside its cell.
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True)

* #### RM looks to be the feature most strongly positively correlated with MEDV

In [None]:
# Scatter of MEDV against RM with a fitted linear regression line.
sns.lmplot(data=data, x='RM', y='MEDV')

## Training and Testing Data

In [None]:
# Show the column labels of `data`; the feature and target columns used
# below are selected from these.
data.columns

In [None]:
# Feature matrix: the three predictor columns used by the model.
feature_columns = ['RM', 'LSTAT', 'PTRATIO']
X = data[feature_columns]

In [None]:
# Target vector: the MEDV column the model is trained to predict.
target_column = 'MEDV'
y = data[target_column]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=101)
X_train, X_test, y_train, y_test = split_parts

## Training the Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create a scikit-learn LinearRegression estimator with its default settings.
lm = LinearRegression()

In [None]:
# Fit the linear model on the training features and targets.
lm.fit(X=X_train, y=y_train)

In [None]:
# Report the fitted intercept term of the linear model.
fitted_intercept = lm.intercept_
print('Intercept: \n', fitted_intercept)

In [None]:
# Report the fitted coefficients, one per feature, in the column order of X.
fitted_coefficients = lm.coef_
print('Coefficients: \n', fitted_coefficients)

## Predicting Test Data

In [None]:
# Predict the target for every row of the held-out test features.
predictions = lm.predict(X=X_test)

* #### Create a scatterplot of the real test values versus the predicted values

In [None]:
# Plot the actual test-set targets (x axis) against the model's predictions
# (y axis); a good fit clusters around the diagonal.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so `sns.scatterplot(y_test, predictions)` raises a TypeError
# there (and emitted a FutureWarning on 0.11).
sns.scatterplot(x=y_test, y=predictions)
plt.title('Y Test Vs Predicted Y')
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')

## Evaluating the Model

In [None]:
from sklearn import metrics

In [None]:
# Mean absolute error between the true test targets and the predictions.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=predictions)
print('MAE:', mae)

In [None]:
# Mean squared error between the true test targets and the predictions.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=predictions)
print('MSE:', mse)

In [None]:
# Root mean squared error: the square root of the MSE, which puts the error
# back in the same units as the target.
squared_error = metrics.mean_squared_error(y_true=y_test, y_pred=predictions)
print('RMSE:', np.sqrt(squared_error))

In [None]:
# Explained variance score of the predictions on the test set.
metrics.explained_variance_score(y_true=y_test, y_pred=predictions)

* #### Residuals

In [None]:
# Distribution of the residuals (actual minus predicted). For a well-behaved
# linear fit this should look roughly bell-shaped and centred on zero.
# `sns.distplot` was deprecated in seaborn 0.11 and removed in 0.14; the
# documented replacement is `histplot` with a density-normalised histogram
# and a KDE overlay (cut=3 reproduces distplot's KDE extent).
residuals = y_test - predictions
sns.histplot(residuals, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Tabulate the fitted coefficient of each feature, indexed by feature name.
# The column label was misspelled ("Coeffecient") in the displayed table;
# the index is now passed by keyword so the argument roles are explicit.
data_coeff = pd.DataFrame(lm.coef_, index=X.columns, columns=['Coefficient'])
data_coeff