In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line (same traversal order as os.walk).
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_input_files:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the housing dataset from the Kaggle input directory into a DataFrame.
USAhousing = pd.read_csv('../input/usa-housing/USA_Housing.csv')

In [None]:
# Preview the first rows to get a feel for the columns and their values.
USAhousing.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
USAhousing.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
USAhousing.describe()

In [None]:
# Column labels of the dataset.
USAhousing.columns

In [None]:
# Number of missing values in each column.
USAhousing.isna().sum()

There are no missing values, so we are good to go.

## EDA

In [None]:
# Pairwise relationships between the numeric columns: scatter plots off the
# diagonal, each column's own distribution on the diagonal.
sns.pairplot(USAhousing)

In [None]:
# Distribution of the target variable (Price): density-normalised histogram
# with a KDE overlay. `sns.distplot` is deprecated in seaborn and slated for
# removal; `histplot(..., kde=True, stat='density')` is its replacement.
sns.histplot(USAhousing['Price'], kde=True, stat='density')

In [None]:
# Correlation heatmap of the numeric columns.
# The text column (Address) is excluded explicitly: since pandas 2.0,
# DataFrame.corr() defaults to numeric_only=False and raises on string
# columns instead of silently dropping them.
plt.figure(figsize=(10,10))
sns.heatmap(USAhousing.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Box plots of the five predictors and Price, one panel each on a 2x3 grid,
# to compare spreads and spot outliers. The panels are driven by a
# (panel title, column name) table instead of six copy-pasted stanzas.
boxplot_panels = [
    ('Area Population', 'Area Population'),
    ('Average Area Income', 'Avg. Area Income'),
    ('Average Area House Age', 'Avg. Area House Age'),
    ('Average Area Number of Rooms', 'Avg. Area Number of Rooms'),
    ('Average Area Number of Bedrooms', 'Avg. Area Number of Bedrooms'),
    ('Price', 'Price'),
]

plt.figure(figsize=(15,10))

# Subplot positions are 1-based, hence start=1.
for position, (panel_title, column) in enumerate(boxplot_panels, start=1):
    plt.subplot(2, 3, position)
    plt.title(panel_title)
    plt.boxplot(USAhousing[column])

## Linear Regression Model

We will use a linear regression model to predict the price. Price is our target variable, and all other columns except Address are the predictors.

We will not be using the Address column because it has text info that the linear regression model can't use.

In [None]:
# Feature matrix: every column except the free-text Address and the target.
X = USAhousing.drop(columns=['Address', 'Price'])
# Target vector: the house price.
y = USAhousing.loc[:, 'Price']

## Train Test Split

We will split the data into training (70%) and testing (30%) sets. The training data will be used to train the model, and the test data will be used to evaluate it.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## Creating and Training the Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lm = LinearRegression()

In [None]:
# Fit the model on the training split only; the test split stays unseen.
lm.fit(X_train,y_train)

## Model Evaluation

We can evaluate the model by checking its coefficients and key metrics.

In [None]:
# Intercept term of the fitted model.
print(lm.intercept_)

In [None]:
# One row per feature, holding the coefficient the fitted model assigned to it.
coeff_df = pd.DataFrame(
    data=lm.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

Interpreting the coefficients:

- If there is a 1-unit increase in **Avg. Area Income** and no change in the value of the other features, the price will **increase by \$21.62**.
- If there is a 1-unit increase in **Avg. Area House Age** and no change in the value of the other features, the price will **increase by \$165221.12**.
- If there is a 1-unit increase in **Avg. Area Number of Rooms** and no change in the value of the other features, the price will **increase by \$121405.38**.
- If there is a 1-unit increase in **Avg. Area Number of Bedrooms** and no change in the value of the other features, the price will **increase by \$1318.72**.
- If there is a 1-unit increase in **Area Population** and no change in the value of the other features, the price will **increase by \$15.23**.

Does this make sense? Probably not, most likely because this data was made up. It could be due to the following reasons; we will not go into them in detail, as we are only trying a simple linear regression model here.

- This data might be mocked (synthetic).
- Even if it is real data, we don't have enough features or a large enough volume of data to train the model.

## Predictions from Model

We will predict the price for the test data set and compare the predictions with the actual prices to see how well our model did.

In [None]:
# Predicted prices for the held-out test features.
predictions = lm.predict(X_test)

In [None]:
# Predicted vs. actual price on the test set. Points close to the diagonal
# are good predictions. Axis labels and a title are set so the figure can be
# read on its own.
plt.scatter(y_test, predictions)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Predicted vs. Actual Price (test set)');

In [None]:
# Distribution of the residuals (actual - predicted) on the test set.
# `sns.distplot` is deprecated in seaborn and slated for removal;
# `histplot(..., kde=True, stat='density')` is its replacement.
residuals = y_test - predictions
ax = sns.histplot(residuals, bins=50, kde=True, stat='density')
ax.set(xlabel='Residual (actual - predicted price)', title='Residual distribution (test set)');

## Regression Evaluation Metrics

In [None]:
from sklearn import metrics

In [None]:
# Error metrics on the test set. Each metric is computed once, printed, and
# (for MSE / MSLE) reused to derive its root.
# NOTE(review): mean_squared_log_error raises ValueError when any target or
# prediction is negative; a linear model can in principle predict a negative
# price, so confirm this holds for the current split.
mae_value = metrics.mean_absolute_error(y_test, predictions)
print('MAE:', mae_value)

mse_value = metrics.mean_squared_error(y_test, predictions)
print('MSE:', mse_value)
print('RMSE:', np.sqrt(mse_value))

msle_value = metrics.mean_squared_log_error(y_test, predictions)
print('MSLE:', msle_value)
print('RMSLE:', np.sqrt(msle_value))

In [None]:
# Coefficient of determination (R^2) of the predictions on the test set.
print('R Square:', metrics.r2_score(y_test, predictions))