# Loading Dataset & Quick Overview
   * sklearn 波士頓房價預測資料集

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
# Default figure size (in inches) for plots that do not set their own.
rcParams['figure.figsize'] = 15, 15

# Read the raw Boston housing text file.
# NOTE(review): `load_boston` (imported above) was removed in scikit-learn 1.2,
# so that import fails on newer releases; the data is parsed from the source
# file here instead.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" is not a valid Python escape and triggers a warning on
# recent interpreters when written in a normal string literal.
# skiprows=22 presumably skips the descriptive preamble of the file -- confirm
# against the file if the source ever changes.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Records are interleaved over pairs of rows: every even-indexed row supplies
# all of its columns and the following odd-indexed row supplies its first two
# columns as the remaining features; the odd row's third column is the label.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [None]:
# Sanity-check the loaded arrays: features (`data`) first, then labels (`target`).
for loaded in (data, target):
    print(loaded.shape)

# Plot Features vs. Y
Can you explain the relationship between each feature and house prices?

In [None]:
import math

# Axis labels for the feature columns: features[i] labels column i of `data`.
features = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT'
]

# Feature glossary (as given in the dataset description):
# - CRIM     per capita crime rate by town
# - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
# - INDUS    proportion of non-retail business acres per town
# - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
# - NOX      nitric oxides concentration (parts per 10 million)
# - RM       average number of rooms per dwelling
# - AGE      proportion of owner-occupied units built prior to 1940
# - DIS      weighted distances to five Boston employment centres
# - RAD      index of accessibility to radial highways
# - TAX      full-value property-tax rate per $10,000
# - PTRATIO  pupil-teacher ratio by town
# - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# - LSTAT    % lower status of the population

# np.asarray keeps the column indexing below working even if `data` has been
# rebound to a list of rows by a later cell.
feature_matrix = np.asarray(data)

# One scatter panel per feature against the target, filled in reading order
# (left to right, then top to bottom).
fig, axes = plt.subplots(4, 4, figsize=(16, 14))
flat_axes = axes.ravel()
for i, name in enumerate(features):
    ax = flat_axes[i]
    ax.scatter(feature_matrix[:, i], target)
    ax.set_xlabel(name)
    ax.set_ylabel('prices')

# The 4x4 grid has more slots than there are features; hide the leftover
# panels instead of drawing empty axes.
for ax in flat_axes[len(features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

# Predicting Home Prices: Simple Linear Regression

In [None]:
# Perform any data transformation here.
# Example: append the square root and log(1 + x) of every raw feature column
# to the feature matrix.

# Number of raw feature columns produced by the loading cell (11 + 2).
N_RAW_FEATURES = 13

# Always start from the leading raw columns. Because the raw columns stay at
# the front of the augmented matrix, re-running this cell reproduces the same
# result instead of transforming already-transformed data (or failing because
# `data` is no longer the original array).
raw_features = np.asarray(data)[:, :N_RAW_FEATURES]

# Vectorized over the whole matrix; the result stays a 2-D ndarray so that
# `data.shape` and `data[:, i]` keep working in other cells.
data = np.hstack([
    raw_features,            # original values
    np.sqrt(raw_features),   # square-root view
    np.log1p(raw_features),  # log(1 + x), well-defined at x == 0
])

In [None]:
# Conventional names for the model inputs: X = feature matrix, y = labels.
X, y = data, target

In [None]:
# import needed models in scikit-learn by yourself
# and fit a linear model using training dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 33% of the samples for testing; the fixed random_state makes the
# shuffled split repeatable across runs.
split_options = dict(test_size=0.33, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit a plain linear regression on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Predict targets for the test split with the model fitted in the previous cell.
# (Exercise template: y_pred = '''make prediction on testing dataset here''')
y_pred = model.predict(X_test)

In [None]:
# visualize the result
%matplotlib inline
plt.scatter(y_test, y_pred)
plt.plot([0, 50], [0, 50], '--k')
plt.axis('tight')
plt.xlabel('True price ($1000s)')
plt.ylabel('Predicted price ($1000s)')
print("RMS:", np.sqrt(np.mean((y_pred - y_test)**2)))
print('R2:%.2f' % metrics.r2_score(y_pred, y_test))

1. Try normalization
2. Try other transformations
3. Use Lasso regularization to see feature importance

In [None]:
# import needed models in scikit-learn by yourself
# and fit a linear model using training dataset
from sklearn.linear_model import Lasso
# Rebind `model` to an L1-regularised (Lasso) regression with a very small
# penalty, fitted on the same training split; later cells now use this fit.
model = Lasso(alpha=1e-4)
model.fit(X_train, y_train)

In [None]:
from sklearn import metrics
y_pred = model.predict(X_test)
# visualize the result
%matplotlib inline
plt.scatter(y_test, y_pred)
plt.plot([0, 50], [0, 50], '--k')
plt.axis('tight')
plt.xlabel('True price ($1000s)')
plt.ylabel('Predicted price ($1000s)')
print("RMS:%.2f" % np.sqrt(np.mean((y_pred - y_test)**2)))
print('R2:%.2f' % metrics.r2_score(y_pred, y_test))

In [None]:
# Inspect the return type of r2_score; arguments are passed as (y_true, y_pred).
type(metrics.r2_score(y_test, y_pred))

In [None]:
# R^2 on the test split; r2_score expects (y_true, y_pred) in that order.
print(' R2:%.2f' % metrics.r2_score(y_test, y_pred))