# Supervised Machine Learning
Use supervised machine learning to predict future housing prices for the cities listed in the DataFrame.

In [12]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Retrieve the Data
The cleaned data file is located in the Resources folder and is named housing_price_pn.csv (this is the file the import cell below reads).

In [13]:
# The code below can be found in Week 19, Day 1 - Activities 1 and 2.

In [14]:
# Relative path to the housing price CSV; the file itself is read with pandas
# in the next cell.
sale_price = "../Resources/housing_price_pn.csv"

In [15]:
# Read the CSV into a DataFrame (the file is decoded as ISO-8859-1), then
# display the first rows to confirm the import was successful.
sale_price_df = pd.read_csv(sale_price, encoding="ISO-8859-1")
sale_price_df.head()

Unnamed: 0,RegionID,RegionName,StateName,4/30/2008,5/31/2008,6/30/2008,7/31/2008,8/31/2008,9/30/2008,10/31/2008,...,1/31/2022,2/28/2022,3/31/2022,4/30/2022,5/31/2022,6/30/2022,7/31/2022,8/31/2022,9/30/2022,10/31/2022
0,394913,"New York, NY",NY,405000.0,404667.0,408000.0,414667.0,421667.0,421667.0,410000.0,...,537167.0,537167.0,541833.0,546333.0,557667.0,574633.0,592967.0,606633.0,603333.0,591667
1,753899,"Los Angeles, CA",CA,542500.0,532833.0,526917.0,512417.0,500750.0,482333.0,470333.0,...,859167.0,875167.0,898333.0,934833.0,959833.0,964833.0,945000.0,920000.0,901667.0,891667
2,394463,"Chicago, IL",IL,245667.0,247333.0,252833.0,257833.0,261167.0,258333.0,248667.0,...,277833.0,276333.0,280833.0,290833.0,302667.0,313167.0,316500.0,313333.0,303333.0,295000
3,394514,"Dallas, TX",TX,141850.0,144817.0,148317.0,152300.0,154000.0,150000.0,144667.0,...,360667.0,365500.0,373833.0,393833.0,413333.0,425500.0,423833.0,413833.0,402167.0,393833
4,394692,"Houston, TX",TX,144133.0,145966.0,149166.0,152700.0,154500.0,153133.0,147300.0,...,308000.0,311333.0,316000.0,328667.0,340333.0,348750.0,350917.0,344250.0,337867.0,328700


## Split the Data into Training and Testing Sets

In [16]:
# Remove the identifier columns that are not needed as model inputs.
# The columns are dropped by name rather than by position, and errors="ignore"
# makes the cell idempotent: re-running it cannot accidentally remove
# RegionName or one of the monthly price columns.
sale_price_df = sale_price_df.drop(columns=["RegionID", "StateName"], errors="ignore")
sale_price_df.head()

Unnamed: 0,RegionName,4/30/2008,5/31/2008,6/30/2008,7/31/2008,8/31/2008,9/30/2008,10/31/2008,11/30/2008,12/31/2008,...,1/31/2022,2/28/2022,3/31/2022,4/30/2022,5/31/2022,6/30/2022,7/31/2022,8/31/2022,9/30/2022,10/31/2022
0,"New York, NY",405000.0,404667.0,408000.0,414667.0,421667.0,421667.0,410000.0,395000.0,385000.0,...,537167.0,537167.0,541833.0,546333.0,557667.0,574633.0,592967.0,606633.0,603333.0,591667
1,"Los Angeles, CA",542500.0,532833.0,526917.0,512417.0,500750.0,482333.0,470333.0,454333.0,445333.0,...,859167.0,875167.0,898333.0,934833.0,959833.0,964833.0,945000.0,920000.0,901667.0,891667
2,"Chicago, IL",245667.0,247333.0,252833.0,257833.0,261167.0,258333.0,248667.0,239333.0,232667.0,...,277833.0,276333.0,280833.0,290833.0,302667.0,313167.0,316500.0,313333.0,303333.0,295000
3,"Dallas, TX",141850.0,144817.0,148317.0,152300.0,154000.0,150000.0,144667.0,139500.0,136167.0,...,360667.0,365500.0,373833.0,393833.0,413333.0,425500.0,423833.0,413833.0,402167.0,393833
4,"Houston, TX",144133.0,145966.0,149166.0,152700.0,154500.0,153133.0,147300.0,139000.0,132000.0,...,308000.0,311333.0,316000.0,328667.0,340333.0,348750.0,350917.0,344250.0,337867.0,328700


In [17]:
sale_price_df.dtypes

RegionName     object
4/30/2008     float64
5/31/2008     float64
6/30/2008     float64
7/31/2008     float64
               ...   
6/30/2022     float64
7/31/2022     float64
8/31/2022     float64
9/30/2022     float64
10/31/2022      int64
Length: 176, dtype: object

In [18]:
print(sale_price_df.RegionName.unique())

['New York, NY' 'Los Angeles, CA' 'Chicago, IL' 'Dallas, TX' 'Houston, TX'
 'Washington, DC' 'Miami, FL' 'Philadelphia, PA' 'Atlanta, GA'
 'Phoenix, AZ' 'Boston, MA' 'San Francisco, CA' 'Riverside, CA'
 'Detroit, MI' 'Seattle, WA' 'Minneapolis, MN' 'San Diego, CA' 'Tampa, FL'
 'Denver, CO' 'St. Louis, MO' 'Baltimore, MD' 'Charlotte, NC'
 'Orlando, FL' 'San Antonio, TX' 'Portland, OR' 'Sacramento, CA'
 'Las Vegas, NV' 'Pittsburgh, PA' 'Austin, TX' 'Cincinnati, OH'
 'Kansas City, MO' 'Columbus, OH' 'Indianapolis, IN' 'Cleveland, OH'
 'San Jose, CA' 'Nashville, TN' 'Virginia Beach, VA' 'Providence, RI'
 'Jacksonville, FL' 'Milwaukee, WI' 'Oklahoma City, OK' 'Raleigh, NC'
 'Memphis, TN' 'Richmond, VA' 'New Orleans, LA' 'Louisville, KY'
 'Salt Lake City, UT' 'Hartford, CT' 'Buffalo, NY' 'Birmingham, AL'
 'Grand Rapids, MI' 'Rochester, NY' 'Tucson, AZ' 'Tulsa, OK' 'Fresno, CA'
 'Urban Honolulu, HI' 'Omaha, NE' 'Worcester, MA' 'Bridgeport, CT'
 'Greenville, SC' 'Albuquerque, NM' 'Bakersfield,

In [19]:
# One-hot encode every object-typed (text) column.
# pd.get_dummies removes each source column and appends its indicator columns,
# named "<column>_<value>", after the remaining columns.
# NOTE(review): RegionName appears to be unique per row (363 rows yield 363
# indicator columns), so these indicators act as row identifiers rather than
# general features; confirm they are wanted as model inputs.
object_cols = sale_price_df.select_dtypes(include="object").columns.tolist()
sale_price_df = pd.get_dummies(sale_price_df, columns=object_cols)



In [11]:
# for col in sale_price_df.columns:
#     if sale_price_df[col].dtype == "object":
#         sale_price_df[col] = pd.to_numeric(sale_price_df[col], errors="coerce")

In [None]:
sale_price_df.dtypes

In [20]:
sale_price_df.head()

Unnamed: 0,4/30/2008,5/31/2008,6/30/2008,7/31/2008,8/31/2008,9/30/2008,10/31/2008,11/30/2008,12/31/2008,1/31/2009,...,"RegionName_Wichita, KS","RegionName_Wilmington, NC","RegionName_Winchester, VA","RegionName_Winston, NC","RegionName_Worcester, MA","RegionName_Yakima, WA","RegionName_York, PA","RegionName_Youngstown, OH","RegionName_Yuba City, CA","RegionName_Yuma, AZ"
0,405000.0,404667.0,408000.0,414667.0,421667.0,421667.0,410000.0,395000.0,385000.0,378333.0,...,0,0,0,0,0,0,0,0,0,0
1,542500.0,532833.0,526917.0,512417.0,500750.0,482333.0,470333.0,454333.0,445333.0,427333.0,...,0,0,0,0,0,0,0,0,0,0
2,245667.0,247333.0,252833.0,257833.0,261167.0,258333.0,248667.0,239333.0,232667.0,226667.0,...,0,0,0,0,0,0,0,0,0,0
3,141850.0,144817.0,148317.0,152300.0,154000.0,150000.0,144667.0,139500.0,136167.0,131817.0,...,0,0,0,0,0,0,0,0,0,0
4,144133.0,145966.0,149166.0,152700.0,154500.0,153133.0,147300.0,139000.0,132000.0,128000.0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
sale_price_df.shape

(363, 538)

In [None]:
# Define the X (features) and y (target) sets.
# The target is the most recent monthly sale price; every other column (the
# earlier monthly prices plus the one-hot region indicators) is a feature.
# "RegionName" cannot be the target: it is a text label, and the one-hot
# encoding cell has already removed it from the DataFrame (KeyError).
# NOTE(review): confirm that the latest month is the intended prediction target.
TARGET_COLUMN = "10/31/2022"
y = sale_price_df[TARGET_COLUMN].values
X = sale_price_df.drop(columns=TARGET_COLUMN)

In [None]:
# Split the data into X_train, X_test, y_train, y_test.
# random_state=42 fixes the shuffle so the split is reproducible; no test_size
# is passed, so train_test_split falls back to its default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


## Create, Fit and Compare Models

Create a Linear Regression model, fit it to the data, and print the model's score. We will then do the same for KNeighborsRegressor,
RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, and SVR.

In [None]:
# Create the Linear Regression model with its default settings.
# The bare `model` expression displays the estimator as the cell output.
model = LinearRegression()
model

In [None]:
# Fit the model to the training data, and calculate the scores for the training and testing data.
# Both scores come from the same fitted model, evaluated once on each split.
model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

In [None]:
# Report the training and testing scores, one per line.
print(f"Training Score: {training_score}", f"Testing Score: {testing_score}", sep="\n")

In [None]:
# Plot the residuals (prediction minus actual) for the training and testing data.
# Each split is predicted once and the result is reused for both axes.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

fig, ax = plt.subplots()
ax.scatter(train_pred, train_pred - y_train, c="blue", label="Training Data")
ax.scatter(test_pred, test_pred - y_test, c="orange", label="Testing Data")
# Zero-residual reference line. It spans the range of the plotted predictions
# (the x-axis) so it does not depend on the global `y`, which another cell in
# this notebook reassigns.
ax.hlines(
    y=0,
    xmin=min(train_pred.min(), test_pred.min()),
    xmax=max(train_pred.max(), test_pred.max()),
)
ax.set(
    title="Residual Plot",
    xlabel="Predicted value",
    ylabel="Residual (predicted - actual)",
)
ax.legend()
plt.show()

## Not sure about the test models below (TODO: review)

In [None]:
## Week 19, Day 3 Activity 10

In [None]:
from sklearn.datasets import make_regression, make_swiss_roll
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR

In [None]:
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    plt.show()    

In [None]:
# Create a synthetic regression data set for trying out the candidate models.
# The synthetic arrays get their own names so the housing X / y and their
# train/test split are not overwritten; reusing those names would silently
# change what the earlier fit and residual-plot cells use when re-run.
# NOTE(review): the models compared below are therefore scored on synthetic
# data, not on the housing data; confirm that this is intended.
X_synth, y_synth = make_regression(random_state=42)
X_synth = pd.DataFrame(X_synth)
X_synth_train, X_synth_test, y_synth_train, y_synth_test = train_test_split(
    X_synth, y_synth, random_state=42
)
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_synth_train)
data = [
    scaler.transform(X_synth_train),
    scaler.transform(X_synth_test),
    y_synth_train,
    y_synth_test,
]

In [None]:
# Fit and score each candidate regressor on the same prepared data, in order.
candidate_models = [
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    AdaBoostRegressor(),
    SVR(C=1.0, epsilon=0.2),
]
for candidate in candidate_models:
    test_model(candidate, data)

In [None]:
# # Create and train a Random Forest Classifier model and print the model score
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# scaler = StandardScaler().fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [None]:
# # Validate the model by using the test data
# clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
# print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
# print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
# features = clf.feature_importances_
# print(features)
# plt.bar(x = range(len(features)), height=features)
# plt.show()