In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Global plot styling: seaborn's "whitegrid" is set first, then matplotlib's
# "fivethirtyeight" style sheet is applied afterwards.
# NOTE(review): the style sheet is applied last and may override parts of the
# seaborn style; confirm this ordering gives the intended look.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

In [None]:
# Load the housing dataset from a CSV file (relative path) and preview the
# first rows to check that it parsed as expected.
USAhousing = pd.read_csv('USA_Housing.csv')
USAhousing.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [None]:
# Column names, dtypes, non-null counts and memory usage of the frame.
USAhousing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
USAhousing.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,68583.108984,5.977222,6.987792,3.98133,36163.516039,1232073.0
std,10657.991214,0.991456,1.005833,1.234137,9925.650114,353117.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61480.562388,5.322283,6.29925,3.14,29403.928702,997577.1
50%,68804.286404,5.970429,7.002902,4.05,36199.406689,1232669.0
75%,75783.338666,6.650808,7.665871,4.49,42861.290769,1471210.0
max,107701.748378,9.519088,10.759588,6.5,69621.713378,2469066.0


In [None]:
# List the column labels; these exact names are used for feature selection below.
USAhousing.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

#### **Exploratory Data Analysis (EDA)**

In [None]:
# Grid of pairwise plots for the variables in the dataset.
sns.pairplot(USAhousing)

<seaborn.axisgrid.PairGrid at 0x7fbf27f25ac0>

In [None]:
# Distribution of the Price column (the regression target used below).
sns.displot(USAhousing['Price'])

<seaborn.axisgrid.FacetGrid at 0x7fbf3615aa00>

In [None]:
# Correlation heatmap over the numeric columns only. 'Address' is free text;
# older pandas silently dropped such columns in DataFrame.corr(), but pandas 2.x
# raises when a non-numeric column is present, so select numeric dtypes explicitly.
sns.heatmap(USAhousing.select_dtypes(include='number').corr(),annot=True)

<AxesSubplot:>

#### Training a Linear Regression Model

**X and y arrays**

In [None]:
# Predictors: the five numeric area-level columns ('Address' is text and is left out).
X = USAhousing[
    [
        'Avg. Area Income',
        'Avg. Area House Age',
        'Avg. Area Number of Rooms',
        'Avg. Area Number of Bedrooms',
        'Area Population',
    ]
]
# Target: the Price column.
y = USAhousing['Price']

**Train test split**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model):
    """Return the mean 10-fold cross-validation score of ``model``.

    The model is scored on the notebook-level ``X`` and ``y`` (the full,
    unsplit data). No ``scoring`` argument is passed, so the estimator's
    default scorer is used.

    Parameters
    ----------
    model : estimator
        Object accepted by ``cross_val_score``.

    Returns
    -------
    The mean of the ten per-fold scores.
    """
    return cross_val_score(model, X, y, cv=10).mean()

def print_evaluate(true,predicted):
    """Print MAE, MSE, RMSE and R^2 for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    None
        The four metrics are written to stdout, followed by a separator line.
    """
    # Delegate to evaluate() so the printed and returned metrics share a single
    # implementation (and MSE is only computed once).
    mae, mse, rmse, r2_square = evaluate(true, predicted)

    print('Mean Absolute Error: ',mae)
    print('Mean Squared Error: ',mse)
    print('Root Mean Squared Error: ',rmse)
    print('R2 Square: ',r2_square)
    print('___________________________')

def evaluate(true,predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_square)`` in that order.
    """
    mean_sq_err = metrics.mean_squared_error(true, predicted)
    return (
        metrics.mean_absolute_error(true, predicted),
        mean_sq_err,
        np.sqrt(mean_sq_err),  # RMSE is the square root of the MSE above
        metrics.r2_score(true, predicted),
    )

**Preparing Data For Linear Regression**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Preprocessing pipeline with a single step: standardize the features.
pipeline = Pipeline([
    ('std_scalar', StandardScaler())
])

# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split, so test-set statistics do not influence training.
# NOTE: X_train / X_test are overwritten with the scaled output; re-run the
# train/test split cell before re-running this one to start from unscaled data.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
# Plain ordinary least squares. The features are already standardized by the
# StandardScaler pipeline, so the old `normalize` flag is redundant; it was
# deprecated in scikit-learn 1.0 and removed in 1.2, where passing it raises
# a TypeError.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

LinearRegression(normalize=True)

In [None]:
# Fitted intercept term of the linear model.
print(lin_reg.intercept_)

1228219.1492415662


In [None]:
# One row per feature, labelled with the original column names, holding the
# fitted coefficient (on the standardized feature scale).
coeff_df = pd.DataFrame({'Co-efficient': lin_reg.coef_}, index=X.columns)
coeff_df

Unnamed: 0,Co-efficient
Avg. Area Income,232679.724643
Avg. Area House Age,163841.046593
Avg. Area Number of Rooms,121110.555478
Avg. Area Number of Bedrooms,2892.815119
Area Population,151252.342377


**Predictions from our model**

In [None]:
# Predict the target for the held-out (scaled) test features and show the result.
pred = lin_reg.predict(X_test)
pred

array([1308536.13592601, 1237122.72746459, 1243835.62817083, ..., 1457119.79297222, 1483428.953093  , 1047510.59737207])

In [None]:
# Actual vs. predicted on the test split: y_test on the x-axis, predictions on the y-axis.
plt.scatter(y_test,pred)

<matplotlib.collections.PathCollection at 0x7fbf248fa2b0>

In [None]:
# Distribution of the residuals (actual minus predicted) on the test split, in 50 bins.
sns.displot((y_test - pred), bins =50)

<seaborn.axisgrid.FacetGrid at 0x7fbf34569f40>