In [2]:
from sklearn.datasets import load_boston

In [3]:
import cudf
import pandas as pd

# Load the Boston house-prices bunch, then show its bundled description and
# the names of its feature columns.
boston_dataset = load_boston()
print(boston_dataset.DESCR)
print(boston_dataset.feature_names)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
# 2-D feature matrix: one row per sample, one column per named feature.
data = boston_dataset.data

# Pair each feature name with its column. Rows of the transpose are the
# columns of `data`, so this yields a name -> 1-D array mapping.
d = dict(zip(boston_dataset.feature_names, data.T))

# Build a pandas frame from the mapping, then convert it to a cuDF frame.
boston = cudf.DataFrame.from_pandas(pd.DataFrame(d))

In [5]:
# Text preview of the cuDF frame (print already applies str()).
print(boston)

     AGE      B CHAS    CRIM    DIS INDUS LSTAT ...   ZN
 0  65.2  396.9  0.0 0.00632   4.09  2.31  4.98 ... 18.0
 1  78.9  396.9  0.0 0.02731 4.9671  7.07  9.14 ...  0.0
 2  61.1 392.83  0.0 0.02729 4.9671  7.07  4.03 ...  0.0
 3  45.8 394.63  0.0 0.03237 6.0622  2.18  2.94 ...  0.0
 4  54.2  396.9  0.0 0.06905 6.0622  2.18  5.33 ...  0.0
 5  58.7 394.12  0.0 0.02985 6.0622  2.18  5.21 ...  0.0
 6  66.6  395.6  0.0 0.08829 5.5605  7.87 12.43 ... 12.5
 7  96.1  396.9  0.0 0.14455 5.9505  7.87 19.15 ... 12.5
 8 100.0 386.63  0.0 0.21124 6.0821  7.87 29.93 ... 12.5
 9  85.9 386.71  0.0 0.17004 6.5921  7.87  17.1 ... 12.5
[496 more rows]
[5 more columns]


In [6]:
# Predictors: the LSTAT and RM columns of the full feature frame.
X = boston[["LSTAT", "RM"]]
# Target: the dataset's target array wrapped in a one-column frame named MEDV.
Y = cudf.DataFrame({"MEDV": boston_dataset.target})

In [7]:
# NOTE(review): this repeats the Y assignment from the previous cell verbatim;
# the cell looks redundant and is a candidate for removal.
Y = cudf.DataFrame({"MEDV": boston_dataset.target})

In [8]:
from cuml import LinearRegression

In [10]:
# Text preview of the one-column target frame.
print(Y)

   MEDV
 0 24.0
 1 21.6
 2 34.7
 3 33.4
 4 36.2
 5 28.7
 6 22.9
 7 27.1
 8 16.5
 9 18.9
[496 more rows]


In [11]:
# Create the cuML linear-regression estimator with its default arguments.
linear_regression = LinearRegression()

In [12]:
# Fit on the full data set: predictors X against the MEDV column of Y.
# The value returned by fit() is what the cell displays.
linear_regression.fit(X, Y["MEDV"])

<cuml.LinearRegression at 0x7fcd06272438>

In [14]:
# In-sample prediction: score the same rows the model was fitted on.
linear_regression.predict(X)

<cudf.Series nrows=506 >

In [15]:
# Print the in-sample predictions as text.
print(linear_regression.predict(X))

                     
 0 28.941013680602516
 1 25.484205660559113
 2  32.65907476857973
 3   32.4065199998349
 4  31.63040699065758
 5 28.054527005997567
 6  21.28707845530228
 7  17.78559652667558
 8    8.1046933839978
 9   18.2465067305075
[496 more rows]


In [16]:
# Print the true MEDV values for side-by-side comparison with the predictions.
print(Y["MEDV"])

       
 0 24.0
 1 21.6
 2 34.7
 3 33.4
 4 36.2
 5 28.7
 6 22.9
 7 27.1
 8 16.5
 9 18.9
[496 more rows]


In [17]:
# Collect the in-sample predictions in a frame under the "Y_hat" column.
df = cudf.DataFrame({"Y_hat": linear_regression.predict(X)})

In [18]:
# Add the true MEDV values as column "Y" of the comparison frame.
df["Y"] = Y["MEDV"]

In [19]:
import math

# Fraction (not a percentage, despite the name) of rows assigned to training.
split_percentage = 0.75
# Round up so a fractional row count still gives a whole number of rows.
train_size = math.ceil(split_percentage * len(df))

In [20]:
# Display the number of rows that go into the training split.
train_size

380

In [34]:
# Imported here so this cell does not rely on an earlier cell having run.
import math

# Predictors (LSTAT, RM) and target (MEDV) are rebuilt so the split starts
# from the full data set.
X = boston[['LSTAT','RM']]

Y = cudf.DataFrame({"MEDV": boston_dataset.target})

# Fraction of rows used for training; the remainder is held out for testing.
split_percentage = 0.75
train_size = math.ceil(len(boston) * split_percentage)

# NOTE(review): this is a sequential, unshuffled split -- the first
# `train_size` rows train the model and the tail is the test set. Consider a
# shuffled split if the row order of the data set is not random.
X_train, Y_train = (X[:train_size], Y[:train_size])

# Under pandas-style indexing a tail slice keeps its original row labels
# (starting at `train_size`), whereas predictions come back labelled from 0.
# Reset the labels to 0..n-1 so targets and predictions pair up row by row
# when they are later combined in one frame, instead of being aligned on
# non-overlapping labels.
X_test = X[train_size:].reset_index(drop=True)
Y_test = Y[train_size:].reset_index(drop=True)

In [35]:
# Fresh estimator fitted only on the training split; the value returned by
# fit() is the last expression and therefore the cell's displayed output.
model = LinearRegression()
model.fit(X_train, Y_train["MEDV"])

<cuml.LinearRegression at 0x7fcd0641e5c0>

In [36]:
# Predict MEDV for the held-out rows.
Y_hat = model.predict(X_test)

In [37]:
# Text preview of the held-out predictions.
print(Y_hat)

                     
 0 26.160110846630335
 1   21.5403691821006
 2 13.516103150305451
 3 12.978224633672557
 4 2.3916772973291565
 5   8.52707063097034
 6  5.382526612267309
 7  6.106285901332168
 8  5.897190670212712
 9  13.74514684800808
[116 more rows]


In [38]:
# Put held-out targets and predictions side by side for scoring.
# Y_test is a tail slice of the full target frame, so under pandas-style
# indexing its row labels do not start at 0, while Y_hat is labelled from 0.
# Assigning both as columns would then align on labels and mismatch every
# row. Resetting both to a positional 0..n-1 index pairs them row by row.
df = cudf.DataFrame()
df["Y"] = Y_test["MEDV"].reset_index(drop=True)
df["Y_hat"] = Y_hat.reset_index(drop=True)

In [43]:
import numpy as np

# Root-mean-squared error: square the residuals, average them, take the root.
squared_error = (df["Y"] - df["Y_hat"]) ** 2
RMSE = np.sqrt(squared_error.mean())

In [44]:
# Display the RMSE computed on the held-out split.
RMSE

6.653650121923065