In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score

import statsmodels.api as sm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Step 1: Reading Data

In [None]:
# Load the USA housing dataset from the Kaggle input directory into a DataFrame.
housing_csv_path = r'/kaggle/input/usa-housing/USA_Housing.csv'
df = pd.read_csv(housing_csv_path)

In [None]:
# Preview the first rows of the loaded DataFrame as a quick sanity check.
df.head()

In [None]:
# Show the dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

In [None]:
# Print the DataFrame summary from DataFrame.info() (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics of the DataFrame (pandas' default describe() output).
df.describe()

# Step 2: Exploratory Data Analysis (EDA)

In [None]:
# Pairwise relationships between the variables, with a KDE of each variable
# on the diagonal.
sns.pairplot(df, diag_kind='kde')
# Render explicitly (as the heat-map cell does) so the cell does not echo the
# `<seaborn.axisgrid.PairGrid at 0x...>` repr, whose address changes on every run.
plt.show()

In [None]:
# Heat map of the pairwise correlations between the numeric columns.
# numeric_only=True is needed because the frame still contains the 'Address'
# column (presumably free text; it is dropped before modelling further down).
# Since pandas 2.0, DataFrame.corr() no longer silently skips non-numeric
# columns and raises instead.
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.show()

In [None]:
# Distribution of the dependent variable 'Price' (histogram with a KDE overlay).
# sns.distplot is deprecated since seaborn 0.11 and slated for removal;
# histplot with kde=True and stat='density' draws the equivalent
# density-scaled figure.
sns.histplot(df['Price'], kde=True, stat='density')
plt.show()

# Step 3: Splitting the Data into Training and Testing Sets

In [None]:
# Target: the house price. Features: every remaining column except 'Address',
# which is excluded from the model inputs together with the target itself.
y = df['Price']
X = df.drop(columns=['Address', 'Price'])

In [None]:
# 70 % of the rows go to training, the rest to testing; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.70,
    random_state=0,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Dimensions of the training features as a (rows, columns) tuple.
X_train.shape

In [None]:
# Dimensions of the test features as a (rows, columns) tuple.
X_test.shape

# Step 4: Building the Model

In [None]:
# Build a statsmodels OLS model on the training data.
# statsmodels does not add an intercept on its own, so a constant column is
# prepended to the design matrix first.
X_train_sm = sm.add_constant(X_train)

# endog = target (price), exog = features plus the constant column.
lr = sm.OLS(endog=y_train, exog=X_train_sm)

In [None]:
# Fit the OLS model built above; the fitted results object is kept in lr_model.
lr_model= lr.fit()

In [None]:
# Estimated coefficients of the fitted model (constant term and one per feature).
lr_model.params

In [None]:
# Full regression summary; the per-feature p-values are used below to decide
# which feature to drop.
lr_model.summary()

* The p-value of 'Avg. Area Number of Bedrooms' is very high, i.e. its coefficient is not statistically significant, so we drop this feature and refit the model.

In [None]:
# Drop 'Avg. Area Number of Bedrooms' from the training features because of its
# high p-value in the OLS summary.
# errors='ignore' keeps this cell idempotent: X_train is reassigned in place, so
# re-running the cell after the column is already gone would otherwise raise
# KeyError. The trade-off is that a misspelled column name is also ignored, so
# keep the name in sync with the dataset's column header.
X_train = X_train.drop(columns=['Avg. Area Number of Bedrooms'], errors='ignore')

In [None]:
# Rebuild the statsmodels OLS model on the reduced training features.
# The constant column must be re-added because X_train has changed.
X_train_sm = sm.add_constant(X_train)

# endog = target (price), exog = reduced features plus the constant column.
lr = sm.OLS(endog=y_train, exog=X_train_sm)

In [None]:
# Fit the rebuilt OLS model; lr_model now holds the results of the reduced model.
lr_model= lr.fit()

In [None]:
# Estimated coefficients of the refitted (reduced) model.
lr_model.params

In [None]:
# Regression summary of the refitted (reduced) model.
lr_model.summary()

In [None]:
# Fit the same linear model with scikit-learn; `lm` is the estimator used for
# all predictions further down. X_train here is the current training frame.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

# Step 5: Residual analysis

In [None]:
# Predictions of the sklearn model on the training data; reused below for the
# residual plot and the training R-squared.
y_train_cnt = lm.predict(X_train)

In [None]:
# Distribution of the training residuals (actual minus predicted price).
# sns.distplot is deprecated since seaborn 0.11 and slated for removal;
# histplot with kde=True and stat='density' draws the equivalent
# density-scaled histogram with a KDE overlay.
fig = plt.figure()
sns.histplot(y_train - y_train_cnt, bins=20, kde=True, stat='density')
# Plot heading
fig.suptitle('Error Terms', fontsize=20)
# X-label (overrides the default label taken from the Series name)
plt.xlabel('Errors', fontsize=18)
plt.show()

# Step 6: Making Predictions on Test Set

In [None]:
# Prepare the test features: drop the same column that was removed from the
# training features so the test columns match what the model was fitted on.
# errors='ignore' keeps this cell idempotent: X_test is reassigned in place, so
# re-running the cell after the column is already gone would otherwise raise
# KeyError.
X_test = X_test.drop(columns=['Avg. Area Number of Bedrooms'], errors='ignore')

In [None]:
# Preview the first rows of the prepared test features.

X_test.head()

In [None]:
# Predict prices for the test features with the fitted sklearn model; y_pred is
# compared against y_test in the evaluation cells below.

y_pred = lm.predict(X_test)

# Step 7: Model Evaluation

In [None]:
# Scatter of actual test prices against predicted prices to inspect the spread.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred)

# Heading and axis labels.
fig.suptitle('y_test vs y_pred', fontsize=20)
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('y_pred', fontsize=16)

plt.show()

In [None]:
# R-squared on the training data: true targets first, model predictions second.
r2_score(y_train, y_train_cnt)

In [None]:
# R-squared on the held-out test data: true targets first, model predictions second.
r2_score(y_test, y_pred)

In [None]:
# Test-set error metrics. The MSE is computed once and reused for the RMSE,
# which is simply its square root.
test_mae = metrics.mean_absolute_error(y_test, y_pred)
test_mse = metrics.mean_squared_error(y_test, y_pred)

print('MAE:', test_mae)
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

**The R² scores on the training and test data are almost the same, so we can say that the model is not overfitting.**