# Supervised Machine Learning
ML to predict future housing prices in cities listed in DataFrame

In [22]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Retrieve the Data
The cleaned data file is located in the Resources folder and is named Housing.csv.

In [23]:
#  Code  below found in Week 19, Day 1 - Activity 1 and 2

In [38]:
# Relative path to the housing CSV; the file itself is read with pandas in the next cell.
sale_price = "../Resources/Housing.csv"

In [39]:
# Read the CSV into a DataFrame (decoded as ISO-8859-1), then display the
# first rows to confirm the import was successful.
sale_price_df = pd.read_csv(sale_price, encoding="ISO-8859-1")
sale_price_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## Split the Data into Training and Testing Sets

In [40]:
# # Temporarily remove columns that are not needed
# sale_price_df.drop(sale_price_df.columns[[0,2]], axis=1, inplace=True)
# sale_price_df.head()

In [41]:
# List each column's dtype to spot the text (object) columns that still need encoding.
sale_price_df.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

In [42]:
# Row and column counts of the DataFrame as loaded.
sale_price_df.shape

(545, 13)

In [43]:
# to see unique categories in a categorical column
# print(sale_price_df.RegionName.unique())

In [44]:
# One-hot encode every text (object-dtype) column in a single pass.
# pd.get_dummies(..., columns=...) drops each listed column and appends its
# "<column>_<value>" indicator columns after the untouched numeric columns.
# NOTE(review): every level is kept (no drop_first), so yes/no columns produce
# two perfectly complementary indicators — confirm this is intended for the
# linear model.
categorical_cols = sale_price_df.select_dtypes(include="object").columns.tolist()
if categorical_cols:  # nothing left to encode when this cell is re-run
    sale_price_df = pd.get_dummies(sale_price_df, columns=categorical_cols)


In [45]:
# for col in sale_price_df.columns:
#     if sale_price_df[col].dtype == "object":
#         sale_price_df[col] = pd.to_numeric(sale_price_df[col], errors="coerce")

In [46]:
# Re-check the dtypes after encoding; no object columns should remain.
sale_price_df.dtypes

price                              int64
area                               int64
bedrooms                           int64
bathrooms                          int64
stories                            int64
parking                            int64
mainroad_no                        uint8
mainroad_yes                       uint8
guestroom_no                       uint8
guestroom_yes                      uint8
basement_no                        uint8
basement_yes                       uint8
hotwaterheating_no                 uint8
hotwaterheating_yes                uint8
airconditioning_no                 uint8
airconditioning_yes                uint8
prefarea_no                        uint8
prefarea_yes                       uint8
furnishingstatus_furnished         uint8
furnishingstatus_semi-furnished    uint8
furnishingstatus_unfurnished       uint8
dtype: object

In [47]:
# Preview the first rows of the encoded DataFrame.
sale_price_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,0,1,1,0,...,0,1,0,0,1,0,1,1,0,0
1,12250000,8960,4,4,4,3,0,1,1,0,...,0,1,0,0,1,1,0,1,0,0
2,12250000,9960,3,2,2,2,0,1,1,0,...,1,1,0,1,0,0,1,0,1,0
3,12215000,7500,4,2,2,3,0,1,1,0,...,1,1,0,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,0,1,0,1,...,1,1,0,0,1,1,0,1,0,0


In [48]:
# Shape after encoding: the row count is unchanged, the column count grows
# because each category value now has its own indicator column.
sale_price_df.shape

(545, 21)

In [49]:
# Features (X): every column except the target.
X = sale_price_df.drop(columns=["price"])
# Target (y): the sale price as a NumPy array.
y = sale_price_df["price"].values

In [50]:
# Split the data into X_train, X_test, y_train, y_test.
# No test_size is passed, so scikit-learn's default split proportion is used;
# random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [54]:
# Show the shape of each piece of the split on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(408, 20) (137, 20) (408,) (137,)


## Create, Fit and Compare Models

Create a Linear Regression model, fit it to the data, and print the model's score. We will then do the same for KNeighborsRegressor,
RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, and SVR.

In [None]:
# Create the Linear Regression model with its default settings.
model = LinearRegression()
# Bare last expression so the notebook displays the (still unfitted) estimator.
model

In [None]:
# Train on the training split, then score the fitted model on both splits.
model.fit(X_train, y_train)
training_score, testing_score = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Report both scores, one per line.
print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

In [None]:
# Plot the residuals (predicted minus actual) for the training and testing data.
# Predict once per split and reuse the result for both the x and y coordinates.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
plt.scatter(train_pred, train_pred - y_train, c="blue", label="Training Data")
plt.scatter(test_pred, test_pred - y_test, c="orange", label="Testing Data")
plt.legend()
# The x-axis shows predictions, so the zero-residual reference line spans the
# range of the predictions rather than the range of the raw target values.
plt.hlines(
    y=0,
    xmin=min(train_pred.min(), test_pred.min()),
    xmax=max(train_pred.max(), test_pred.max()),
)
plt.xlabel("Predicted price")
plt.ylabel("Residual (predicted - actual)")
plt.title("Residual Plot")
plt.show()

## Not sure about the below test models

In [None]:
## Week 19, Day 3 Activity 10

In [None]:
from sklearn.datasets import make_regression, make_swiss_roll
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR

In [None]:
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    plt.show()    

In [None]:
# Build the scaled train/test bundle from the housing features (X) and prices (y)
# defined earlier, so the models below are compared on the housing data itself.
# Synthetic make_regression() data must not be assigned to X / y here: that would
# silently replace the housing data for every cell that reads those names.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Fit the scaler on the training split only, then apply it to both splits,
# so no information from the test rows leaks into the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Order matters: test_model() unpacks this list positionally.
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [None]:
# Run every candidate regressor through the same fit-and-score routine,
# in the same order as before.
for candidate in (
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),
    SVR(C=1.0, epsilon=0.2),
):
    test_model(candidate, data)

In [None]:
# # Create and train a Random Forest Classifier model and print the model score
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# scaler = StandardScaler().fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [None]:
# # Validate the model by using the test data
# clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
# print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
# print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
# features = clf.feature_importances_
# print(features)
# plt.bar(x = range(len(features)), height=features)
# plt.show()