In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Import libraries
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
# Load the data with only two features
boston = load_boston()
X = boston.data
y = boston.target
boston.DESCR


".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000

In [5]:
print(boston.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [6]:
# Standarize features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.25, random_state = 29)

In [8]:
# Create interaction term (not polynomial features)
interaction = PolynomialFeatures(degree = 3, include_bias = False, interaction_only = True)
X_inter = interaction.fit_transform(X_train)

In [9]:
from sklearn.pipeline import make_pipeline

for k in range(1,8):
    poly_model = make_pipeline (StandardScaler(), PolynomialFeatures(k), LinearRegression())
    model = poly_model.fit(X, y)

    print(poly_model.score(X_test, y_test))

-4.339159149218074
-1.993858894877324e+25
-3.104933188695576e+23
-62225841.34991959
-1503812305.7216866
-90348099907.37917
-10904335321237.502


In [10]:
X.shape

(506, 13)

In [11]:
X_inter.shape

(379, 377)

In [13]:
# Create linear regression
regr = LinearRegression()

# Fit the linear regression
model = regr.fit(X_inter, y_train)
#model = regr.fit(X_inter, y)

In [19]:
X_test.shape

(127, 13)

In [14]:
model.score(X_test, y_test)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 377 is different from 13)

In [None]:
## Now vary the degree of PolynomialFeatures to have a better score

## Decision Tree Regression

In [None]:
# Import libraries
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets

In [None]:
# Load data with only two features
boston = datasets.load_boston()
X = boston.data
y = boston.target

In [None]:
boston_column = list(boston['feature_names'])
print(boston.DESCR)

In [None]:
# Create decision tree classifer object
regr = DecisionTreeRegressor(random_state = 29)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 29)

In [None]:
# Train model
model = regr.fit(X_train, y_train)

In [None]:
# Predict observation's value  
regr.score(X_test, y_test)

In [None]:
# Plot y_pred and y_test, see the difference

In [None]:
from sklearn.tree.export import export_text

r = export_text(regr, feature_names=list(boston.feature_names))
print(r)

## Knn Regression

In [None]:
## Import libraries
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load data with only two features
boston = datasets.load_boston()
X = boston.data
y = boston.target

In [None]:
knnr = KNeighborsRegressor(n_neighbors = 3)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 29)

In [None]:
model = knnr.fit(X_train, y_train)  #fit the model
y_pred = knnr.predict(X_test)

## MAE

In [None]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_pred)

## MSE

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)

## R^2

In [None]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)