In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset: six houses described by three parallel lists (same order in each).
house_price = [50_000, 200_000, 140_000, 320_000, 220_000, 380_000]  # observations of the target variable
house_size = [500, 1_500, 2_500, 3_500, 4_500, 5_500]  # predictor or feature
kitchen_size = [40, 100, 200, 220, 250, 300]  # predictor or feature

In [3]:
# Assemble the three parallel lists into one table, one row per house.
house_data = pd.DataFrame(
    {
        'house_price': house_price,
        'house_size': house_size,
        'kitchen_size': kitchen_size,
    }
)
# Preview the first rows.
house_data.head()

Unnamed: 0,house_price,house_size,kitchen_size
0,50000,500,40
1,200000,1500,100
2,140000,2500,200
3,320000,3500,220
4,220000,4500,250


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Split the table column-wise: the predictor columns and the column to predict.
features = ['house_size', 'kitchen_size']  # features names
X = house_data.loc[:, features]
Y = house_data.loc[:, 'house_price']

In [6]:
X.head()

Unnamed: 0,house_size,kitchen_size
0,500,40
1,1500,100
2,2500,200
3,3500,220
4,4500,250


In [7]:
Y.head()

0     50000
1    200000
2    140000
3    320000
4    220000
Name: house_price, dtype: int64

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Print the built-in documentation for train_test_split (signature and parameters).
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
    Split arrays or matrices into random train and test subsets.
    
    Quick utility that wraps input validation,
    ``next(ShuffleSplit().split(X, y))``, and application to input data
    into a single call for splitting (and optionally subsampling) data into a
    one-liner.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    
    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
# Create the linear model and fit it on the training split only.
my_linear_regression = LinearRegression()
my_linear_regression.fit(X=X_train, y=Y_train)

In [13]:
# Coefficient of determination, measured on the same rows the model was fitted to.
train_r2 = my_linear_regression.score(X_train, Y_train)
print("R^2 =", train_r2)

R^2 = 0.9379131934463928


In [16]:
# Fitted slopes, one per feature column, in the order the columns were passed to fit().
my_linear_regression.coef_

array([-311.42857143, 6619.04761905])

In [17]:
# Report the fitted parameters: b_0 is the intercept, b_1 and b_2 are the two
# slopes (the model was fitted on exactly two feature columns).
print('The intercept (b_0) is  {}'.format(my_linear_regression.intercept_))
b_1, b_2 = my_linear_regression.coef_
print('The coef (b_1) is  {}'.format(b_1))
print('The coef (b_2) is  {}'.format(b_2))

The intercept (b_0) is  -33333.333333333605
The coef (b_1) is  -311.4285714285717
The coef (b_2) is  6619.047619047625


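Fitted model, with $x_1$ = `house_size` and $x_2$ = `kitchen_size` (coefficients rounded to two decimals):

$y = -33333.33 - 311.43x_1 + 6619.05x_2$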

In [15]:
# Predict prices for the held-out test rows; the bare name on the last line
# makes the notebook display the resulting array.
house_price_pred = my_linear_regression.predict(X_test)
house_price_pred 

array([239523.80952381, 511904.76190476])

In [18]:
from sklearn import metrics

# Error metrics on the held-out test rows. The MSE is computed once and reused
# for the RMSE, which is simply its square root.
test_mae = metrics.mean_absolute_error(Y_test, house_price_pred)
test_mse = metrics.mean_squared_error(Y_test, house_price_pred)
print('Mean Absolute Error:', test_mae)
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', np.sqrt(test_mse))

Mean Absolute Error: 256190.4761904763
Mean Squared Error: 79023356009.07037
Root Mean Squared Error: 281110.93185621646


In [21]:
def MAPE(Y, pred):
    """Return the mean absolute percentage error as a fraction (not in percent).

    Computes ``mean(|Y - pred| / |Y|)`` element-wise. Multiply the result by
    100 to express it as a percentage.

    Parameters
    ----------
    Y : iterable of numbers
        Observed (true) values. Must not contain zeros, because the relative
        error is undefined for a zero reference value.
    pred : iterable of numbers
        Predicted values, aligned position-by-position with ``Y``.

    Returns
    -------
    float
        Mean of the absolute relative errors; always non-negative.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or ``Y`` contains a zero.
    """
    # list() consumes any iterable (list, Series, generator) positionally,
    # ignoring a pandas index, before converting to a float array.
    actual = np.asarray(list(Y), dtype=float)
    predicted = np.asarray(list(pred), dtype=float)

    if actual.shape != predicted.shape:
        raise ValueError(
            "Y and pred must have the same shape, got {} and {}".format(
                actual.shape, predicted.shape
            )
        )
    if actual.size == 0:
        raise ValueError("MAPE is undefined for empty input")
    if np.any(actual == 0):
        raise ValueError("MAPE is undefined when Y contains zeros")

    # Divide by |Y| (not Y) so negative actual values still give a
    # non-negative percentage error.
    return np.mean(np.abs(actual - predicted) / np.abs(actual))

In [20]:
# MAPE() returns a fraction, so scale by 100 to report it as a percentage.
mape_percent = MAPE(Y_test, house_price_pred) * 100
print('MAPE:', mape_percent, "%")

MAPE: 151.30683852488374 %


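The training $R^2$ (about 0.94) shows that the model fits the data it was trained on well. However, the MAPE of about 151% on the held-out test data shows that it generalizes poorly: predictions for new data are likely to be far from the true prices. With only six observations (four for training, two for testing), the model is too closely tuned to the training rows to be reliable on unseen houses.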
