# Peer-graded Assignment
## Capstone Project - The Battle of Neighborhoods (Week 1)

## Configuration

### Import the necessary libraries

In [None]:
import colorsys, folium, geocoder, imgkit, json, lxml, math, os, re, requests, sys, time, urllib
import numpy                    as np
import pandas                   as pd
import matplotlib.pyplot        as plt
import matplotlib.pylab         as pylab
import matplotlib.cm            as cm
import matplotlib.colors        as colors
import seaborn                  as sns
from   bs4                      import BeautifulSoup
from   geopy.geocoders          import Nominatim
from   geopy.distance           import distance
from   geopy.extra.rate_limiter import RateLimiter
from   sklearn.cluster          import KMeans
from   sklearn.svm              import SVR
from   sklearn.model_selection  import GridSearchCV
from   sklearn.preprocessing    import PolynomialFeatures, StandardScaler
from   sklearn                  import preprocessing, linear_model, model_selection, metrics
from   yellowbrick.cluster      import KElbowVisualizer
from   tqdm                     import tqdm
from   selenium                 import webdriver

### Configure Directories, Plotting Parameters, and Define the Cities of Interest

In [None]:
# Directories that the scraped datasets and the generated figures are written to.
dataDir, figureDir = "./Data/", "./Figures/"

# Default figure size, with every text element drawn at the same large font size.
pylab.rcParams.update({"figure.figsize": (10, 7)})
pylab.rcParams.update({key: "x-large" for key in ("axes.labelsize",
                                                   "axes.titlesize",
                                                   "legend.fontsize",
                                                   "xtick.labelsize",
                                                   "ytick.labelsize")})

# One entry per city to analyse: the city name plus the name and abbreviation of its state.
citiesOfInterest = [{"name": cityName, "state": {"name": stateName, "abbreviation": stateCode}}
                    for cityName, stateName, stateCode in (("Brisbane",  "Queensland",        "qld"),
                                                           ("Melbourne", "Victoria",          "vic"),
                                                           ("Sydney",    "New South Wales",   "nsw"),
                                                           ("Perth",     "Western Australia", "wa"))]


## Generate Dataset

### Scrape Suburb Price Data with Beautiful Soup

In [None]:
def getSuburbAndPriceData(city):
    """Scrape the suburb/price ranking pages of the state a city belongs to.

    Parameters
    ----------
    city : dict
        Entry of ``citiesOfInterest``; ``city["state"]["abbreviation"]`` selects the
        pages to scrape and ``city["state"]["name"]`` is used in the search name.

    Returns
    -------
    pandas.DataFrame
        One row per scraped suburb with the columns ``"Suburb"``, ``"Search Name"``
        (the query later handed to the geocoder) and ``"Price [$]"`` (the price text
        with the dollar sign stripped, thousands separators kept).

    Raises
    ------
    requests.HTTPError
        If one of the pages answers with an HTTP error status.
    """
    maxNumPages  = 30
    # Raw string: "\$" and "\d" are regex escapes, not Python string escapes.
    pricePattern = re.compile(r"\$\d+(?:,\d+)?")
    rows         = []

    for page in range(maxNumPages):

        url      = "http://house.speakingsame.com/suburbtop.php?sta={}&cat=HomePrice&name=&page={}" \
                   .format(city["state"]["abbreviation"], page)
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup     = BeautifulSoup(response.text, "lxml")

        # The ranking is read from the second-to-last table of the page; skip pages
        # that do not have that layout instead of failing with an IndexError.
        tables = soup.find_all("table")
        if len(tables) < 2:
            sys.stdout.write("\n")
            continue

        for row in tables[-2].find_all("tr"):

            # Only rows that contain a price cell are suburb entries.
            if row.find_all("td", string=pricePattern):

                suburb = row.td.next_sibling.text
                price  = row.td.next_sibling.next_sibling.text
                rows.append({"Suburb":      suburb,
                             # Include the state so that suburb names that exist in
                             # several states are geocoded unambiguously.
                             "Search Name": "{}, {}, Australia".format(suburb, city["state"]["name"]),
                             "Price [$]":   price.strip("$")})
                sys.stdout.write(".")
        sys.stdout.write("\n")

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and copied the
    # whole frame on every call.
    return pd.DataFrame(rows, columns=["Suburb", "Search Name", "Price [$]"])

### Get Geographical Coordinates with Nominatim

In [None]:
def getLocationData(city, data):
    """Geocode every suburb and compute its distance to the city centre.

    Parameters
    ----------
    city : dict
        Entry of ``citiesOfInterest``; ``city["name"]`` is geocoded to obtain the
        reference point (CBD).
    data : pandas.DataFrame
        Frame with a ``"Search Name"`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``data`` with two added columns: ``"Location [Deg]"`` holding a
        ``(latitude, longitude)`` tuple, or ``None`` when the lookup returned
        nothing, and ``"Distance to CBD [km]"``, which is missing for those rows.

    Raises
    ------
    ValueError
        If the city itself cannot be geocoded.
    """
    tqdm.pandas()

    geolocator = Nominatim(user_agent="AustralianExplorer")
    # Wait at least one second between suburb lookups; failed lookups yield None.
    geocode    = RateLimiter(geolocator.geocode, min_delay_seconds=1, swallow_exceptions=True)
    cbd        = geolocator.geocode(city["name"], timeout=1000)
    if cbd is None:
        raise ValueError("Unable to geocode the CBD of " + city["name"])
    cbdLocation = (cbd.latitude, cbd.longitude)

    data["Location [Deg]"] = data["Search Name"].progress_apply(geocode).apply(
        lambda location: (location.latitude, location.longitude) if location else None)

    # Suburbs without coordinates get no distance instead of being passed to distance().
    data["Distance to CBD [km]"] = [distance(location, cbdLocation).km if isinstance(location, tuple) else None
                                    for location in data["Location [Deg]"]]

    return data

### Get Nearby Venue Information with Foursquare

In [None]:
# NOTE(review): these Foursquare credentials are committed in plain text. They should be
# rotated and supplied through the environment variables below; the literals are kept
# only as a fallback so that existing runs of the notebook keep working.
CLIENT_ID     = os.environ.get("FOURSQUARE_CLIENT_ID",     "HIIHGUND2MSZDULMWUO213NKCZHP0U2DNMLYVPWSWR25MZND") # My Foursquare ID
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "OCIZI2HV4RUH2XCOEQYCX5CGQPGGCHWTPTOAFX4LSBZMESGY") # My Foursquare Secret
VERSION       = "20180605"                                                                                   # Foursquare API version

In [None]:
def getNearbyVenues(locations, radius=1000, limit=100):
    """Query the Foursquare "explore" endpoint for the venues around every suburb.

    Parameters
    ----------
    locations : pandas.DataFrame
        Frame with the columns ``"Suburb"``, ``"Location [Deg]"``,
        ``"Distance to CBD [km]"`` and ``"Price [$]"``. A location may be a
        ``(latitude, longitude)`` tuple or its text form as read back from a CSV file.
    radius : int, optional
        Value of the ``radius`` query parameter.
    limit : int, optional
        Value of the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``"Suburb"``, ``"Location [Deg]"``,
        ``"Distance to CBD [km]"``, ``"Venue Name"``, ``"Venue Category"`` and
        ``"Price [$]"``. Suburbs whose request failed on every attempt contribute
        no rows.
    """
    import ast  # local import: only needed to parse coordinates stored as text

    columns     = ["Suburb",
                   "Location [Deg]",
                   "Distance to CBD [km]",
                   "Venue Name",
                   "Venue Category",
                   "Price [$]"]
    maxAttempts = 10
    rows        = []

    for suburb, location, distanceToCBD, price in zip(locations["Suburb"],
                                                      locations["Location [Deg]"],
                                                      locations["Distance to CBD [km]"],
                                                      locations["Price [$]"]):
        # Coordinates arrive as text after a CSV round trip; literal_eval parses the
        # tuple without executing arbitrary code the way eval would.
        if isinstance(location, str):
            location = ast.literal_eval(location)

        url = "https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}" \
              .format(CLIENT_ID, CLIENT_SECRET, VERSION, location[0], location[1], radius, limit)

        results = None
        for attempt in range(maxAttempts):
            sys.stdout.write(".")
            try:
                results = requests.get(url, timeout=30).json()["response"]["groups"][0]["items"]
                break
            except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
                # Network failure or an unexpected response body: try again.
                continue

        if results is None:
            # Reported once, after the last attempt, rather than after every failure.
            sys.stdout.write("Unable to explore, hence it will be dropped.\n")
            continue
        sys.stdout.write("\n")

        for result in results:
            categories = result["venue"]["categories"]
            if not categories:
                # A venue without a category cannot be used downstream; skipping it
                # avoids discarding the whole suburb because of one entry.
                continue
            rows.append({"Suburb":               suburb,
                         "Location [Deg]":       location,
                         "Distance to CBD [km]": distanceToCBD,
                         "Price [$]":            price,
                         "Venue Name":           result["venue"]["name"],
                         "Venue Category":       categories[0]["name"]})

    # Build the frame once: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=columns)

### Compile Data for each City of Interest

In [None]:
# Build and store the suburb dataset and the venue dataset of every city.
maxDistanceFromCBD = 40.0 # [km]
os.makedirs(dataDir, exist_ok=True)  # to_csv does not create missing directories
for city in citiesOfInterest:
    data = getSuburbAndPriceData(city)
    data = getLocationData(city, data)
    # Drop suburbs that could not be geocoded, then those beyond the distance limit.
    data = data.dropna()
    data = data[data["Distance to CBD [km]"].astype(float) <= maxDistanceFromCBD]
    data.to_csv(dataDir+city["name"]+"Data.csv", index=False)
    # Read the stored file back so the venue lookup receives the coordinates in the
    # same text form that the later analysis cells read from the CSV.
    venues = getNearbyVenues(pd.read_csv(dataDir+city["name"]+"Data.csv"))
    venues.to_csv(dataDir+city["name"]+"Venues.csv", index=False)

## Analysis

### Preliminary Visualisations of Suburb Prices

In [None]:
# Kernel density estimate of the suburb prices, one curve per city on shared axes.
figure = plt.figure()
os.makedirs(figureDir, exist_ok=True)  # savefig does not create missing directories
for city in citiesOfInterest:
    cityData    = pd.read_csv(dataDir+city["name"]+"Data.csv")
    # Prices are stored as text with thousands separators; convert to millions of dollars.
    priceValues = cityData["Price [$]"].apply(lambda price : float(price.replace(",",""))/1.0e6)
    # sns.distplot is deprecated; kdeplot draws the same density-only curve.
    distPlot    = sns.kdeplot(priceValues, label=city["name"])
plt.xlim(0.0, 5.0)
plt.xlabel("Price [$M]")
plt.legend()
# Save before showing so the stored image does not depend on how the backend
# treats a figure that has already been displayed.
figure.savefig(figureDir+"SuburbPriceDistributionsForAllCities.png")
plt.show()

### Functions to Create and Save Folium Maps with Markers

In [None]:
def saveFoliumMapAsPNG(city, cityMap, Visualisation):
    """Render a folium map in Chrome and store a screenshot of it as a PNG file.

    The map is first written to ``Map.html`` inside ``figureDir``; the screenshot is
    saved in the same directory as ``<city name><Visualisation>.png``.

    Parameters
    ----------
    city : dict
        Entry of ``citiesOfInterest``; ``city["name"]`` prefixes the file name.
    cityMap : folium.Map
        Map to render.
    Visualisation : str
        Suffix of the file name.
    """
    from pathlib import Path  # local import: builds a well-formed file:// URL

    mapDummyName = "Map.html"
    htmlPath     = Path(figureDir, mapDummyName)
    # Write the page before starting the browser so a failed save leaves no
    # browser process behind.
    cityMap.save(str(htmlPath))

    driver = webdriver.Chrome()
    try:
        driver.set_window_size(800, 600)
        driver.get(htmlPath.resolve().as_uri())
        driver.save_screenshot(figureDir+city["name"]+Visualisation+".png")
    finally:
        # Always close the browser, even when loading or capturing fails.
        driver.quit()

In [None]:
def generateFoliumMap(city, data, numMarkers, markerVariable, markerName):
    """Draw every suburb of a city as a coloured circle on a map and save it as PNG.

    The suburbs are read from the city's ``Data.csv`` file and paired, in file order,
    with the entries of ``markerVariable``.

    Parameters
    ----------
    city : dict
        Entry of ``citiesOfInterest``.
    data : pandas.DataFrame
        Not used by this function; kept so that existing calls remain valid.
    numMarkers : int
        Number of colours sampled from the ``autumn`` colour map.
    markerVariable : iterable of int
        Colour index (``0 <= index < numMarkers``) per suburb. Missing values are
        skipped, so those suburbs are not drawn.
    markerName : str
        Suffix of the saved file name, passed to ``saveFoliumMapAsPNG``.

    Raises
    ------
    ValueError
        If the city itself cannot be geocoded.
    """
    import ast  # local import: parses the coordinate tuples stored as text in the CSV

    geolocator  = Nominatim(user_agent="AustralianExplorer")
    cityData    = pd.read_csv(dataDir+city["name"]+"Data.csv")
    cbd         = geolocator.geocode(city["name"])
    if cbd is None:
        raise ValueError("Unable to geocode the CBD of " + city["name"])
    cityMap     = folium.Map(location=[cbd.latitude, cbd.longitude], zoom_start=10)
    colours     = [colors.rgb2hex(i) for i in cm.autumn(np.linspace(0, 1, numMarkers))]

    for name, location, price, marker in zip(cityData["Suburb"],
                                             cityData["Location [Deg]"],
                                             cityData["Price [$]"],
                                             markerVariable):
        if pd.isna(marker):
            # No colour index available for this suburb.
            continue
        colour = colours[int(marker)]
        # literal_eval only accepts Python literals, unlike eval which would execute
        # whatever text the CSV contains.
        folium.CircleMarker(ast.literal_eval(location),
                            popup="{}, ${}".format(name, price),
                            color=colour,
                            radius=4,
                            fill=True,
                            fill_color=colour,
                            fill_opacity=0.7).add_to(cityMap)

    saveFoliumMapAsPNG(city, cityMap, markerName)

### Folium Map to Visualise the Suburbs by Price [$M]

In [None]:
# One map per city, with the suburbs coloured by the price quantile they fall into.
for city in citiesOfInterest:
    numBins     = 3
    data        = pd.read_csv("".join([dataDir, city["name"], "Data.csv"]))
    # Prices are stored as text with thousands separators; convert to millions of dollars.
    priceValues = data["Price [$]"].map(lambda price: float(price.replace(",", "")) / 1.0e6)
    bins        = pd.qcut(priceValues, q=numBins, labels=False)
    generateFoliumMap(city, data, numMarkers=numBins, markerVariable=bins, markerName="Prices")

### Folium Map to Visualise the Suburbs by Distance to CBD [km]

In [None]:
# One map per city, with the suburbs coloured by the quantile of their distance to the CBD.
for city in citiesOfInterest:
    numBins   = 10
    data      = pd.read_csv("".join([dataDir, city["name"], "Data.csv"]))
    distances = data["Distance to CBD [km]"]
    bins      = pd.qcut(distances, q=numBins, labels=False)
    generateFoliumMap(city, data, numMarkers=numBins, markerVariable=bins, markerName="DistanceToCBD")

### Analyse Each Suburb

In [None]:
def sortVenues(row, numTopVenues):
    """Return the labels of the largest entries of a row, largest first.

    The first element of ``row`` is excluded from the ranking.

    Parameters
    ----------
    row : pandas.Series
        Values indexed by label.
    numTopVenues : int
        Number of labels to return.

    Returns
    -------
    numpy.ndarray
        At most ``numTopVenues`` index labels, ordered by descending value.
    """
    ranking = row.iloc[1:].sort_values(ascending=False).index.values
    return ranking[:numTopVenues]

In [None]:
def getOneHotVenueCategories(city):
    """Count the venues of every category for each suburb of a city.

    Parameters
    ----------
    city : dict
        Entry of ``citiesOfInterest``; selects the ``Venues.csv`` file to read.

    Returns
    -------
    pandas.DataFrame
        One column per venue category and one row per suburb. Rows follow the key
        order of ``groupby("Suburb")``; the suburb names themselves are not part of
        the result.
    """
    cityVenues             = pd.read_csv(dataDir+city["name"]+"Venues.csv")
    venuesOneHot           = pd.get_dummies(cityVenues[["Venue Category"]], prefix="", prefix_sep="")
    venuesOneHot["Suburb"] = cityVenues["Suburb"]
    # Dropping the group keys while resetting the index replaces the former
    # drop(["Suburb"], 1): the positional axis argument was removed in pandas 2.0.
    venuesGrouped          = venuesOneHot.groupby("Suburb").sum().reset_index(drop=True)

    return venuesGrouped

In [None]:
def getMostComonVenues(city):
    """List the most common venue categories of every suburb of a city.

    Parameters
    ----------
    city : dict
        Entry of ``citiesOfInterest``; selects the ``Venues.csv`` file to read.

    Returns
    -------
    pandas.DataFrame
        One row per suburb, in order of first appearance in the venues file, with
        the columns ``"Suburb"``, ``"Location [Deg]"``, ``"Distance to CBD [km]"``,
        ``"Price [$]"`` and ``"Venue #1"`` to ``"Venue #10"``. Trailing venue
        columns are empty when fewer than ten categories exist.
    """
    numTopVenues = 10
    infoColumns  = ["Suburb", "Location [Deg]", "Distance to CBD [km]", "Price [$]"]
    venueColumns = ["Venue #{}".format(index+1) for index in range(numTopVenues)]
    cityVenues   = pd.read_csv(dataDir+city["name"]+"Venues.csv")

    # Per-suburb category counts with the suburb kept as the leading column, which
    # is the element sortVenues skips.
    venuesOneHot  = pd.get_dummies(cityVenues[["Venue Category"]], prefix="", prefix_sep="")
    venuesOneHot.insert(0, "Suburb", cityVenues["Suburb"])
    venuesGrouped = venuesOneHot.groupby("Suburb").sum().reset_index()

    # groupby orders the suburbs by name while the output keeps their order of
    # appearance, so the counts are looked up by suburb name rather than by position.
    rowOfSuburb = {suburb: position for position, suburb in enumerate(venuesGrouped["Suburb"])}

    # Suburb, location, distance and price are taken from the first venue row of each suburb.
    suburbInfo = cityVenues.dropna(subset=["Suburb"]) \
                           .drop_duplicates(subset="Suburb")[infoColumns] \
                           .reset_index(drop=True)

    rankedRows = []
    for suburb in suburbInfo["Suburb"]:
        ranked = list(sortVenues(venuesGrouped.iloc[rowOfSuburb[suburb], :], numTopVenues))
        # Pad so that every row has exactly numTopVenues entries.
        ranked += [None] * (numTopVenues - len(ranked))
        rankedRows.append(ranked)

    venuesSorted = pd.concat([suburbInfo, pd.DataFrame(rankedRows, columns=venueColumns)], axis=1)

    return venuesSorted


In [None]:
def saveDataFrameAsPNG(data, outputfile, format="png"):
    """Render a DataFrame as a styled HTML table and convert it to an image.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to render; its ``to_html()`` output is written after the style sheet.
    outputfile : str
        Path of the image that ``imgkit`` writes.
    format : str, optional
        Value of the ``format`` option passed to ``imgkit``.
    """
    import tempfile  # local import: provides a collision-free temporary file

    css = """
    <style type=\"text/css\">
    table {
    color: #333;
    font-family: Helvetica, Arial, sans-serif;
    border-collapse:
    collapse; 
    border-spacing: 0;
    }
    td, th {
    border: 1px solid transparent; 
    height: 30px;
    }
    th {
    background: #FAFAFA; /* Darken header a bit */
    text-align: center;
    font-weight: bold;
    font-size: 10px;
    }
    td {
    background: #CCCCCC;
    text-align: center;
    font-size: 10px;
    }
    table tr:nth-child(odd) td{
    background-color: white;
    }
    </style>
    """

    # mkstemp creates a uniquely named file, so nothing has to be removed up front.
    handle, fn = tempfile.mkstemp(suffix=".html")
    try:
        with os.fdopen(handle, "w") as text_file:
            text_file.write(css)
            text_file.write(data.to_html())
        imgkit.from_file(fn, outputfile, options={"format": format})
    finally:
        # Remove the intermediate page even when the conversion fails.
        os.remove(fn)

### Create a DataFrame with the Top 10 Most Common Venues

In [None]:
# Store the table of most common venues of every city next to its other datasets.
for city in citiesOfInterest:
    venuesSorted = getMostComonVenues(city)
    venuesSorted.to_csv("".join([dataDir, city["name"], "MostCommonVenues.csv"]), index=False)
    

In [None]:
# Render, for every city, the two most and the two least expensive suburbs together
# with their most common venues.
for city in citiesOfInterest:

    venuesSorted = pd.read_csv(dataDir+city["name"]+"MostCommonVenues.csv")

    # Prices are stored as text with thousands separators; convert to millions of dollars.
    venuesSorted["Price [$]"] = venuesSorted["Price [$]"].apply(lambda price : float(price.replace(",",""))/1.0e6)

    # sort_values returns a new frame, so the result has to be kept; renumbering the
    # rows makes the label-based drop below remove the middle of the sorted table.
    venuesSorted = venuesSorted.sort_values(by=["Price [$]"], ascending=False).reset_index(drop=True)
    venuesSorted.rename(columns={"Price [$]": "Price [$M]"}, inplace=True)

    venuesSorted["Price [$M]"]           = venuesSorted["Price [$M]"]          .apply(lambda price    : "{:.1f}".format(price))
    venuesSorted["Distance to CBD [km]"] = venuesSorted["Distance to CBD [km]"].apply(lambda distance : "{:.1f}".format(distance))
    # Keyword arguments: the positional axis argument of drop was removed in pandas 2.0.
    venuesSorted.drop(columns=["Location [Deg]"], inplace=True)
    venuesSorted.drop(index=list(range(2, venuesSorted.shape[0]-2)), inplace=True)

    saveDataFrameAsPNG(venuesSorted, figureDir+city["name"]+"MostCommonVenues.png")
    

### K-Means Analysis to Cluster Suburbs

In [None]:
# Cluster the suburbs of every city by their venue counts and map the clusters.
for city in citiesOfInterest:

    plt.figure()
    venuesGrouped = getOneHotVenueCategories(city)
    # The upper bound of the k range must not exceed the number of suburbs; the fixed
    # seed makes the elbow search repeatable and consistent with the final model.
    maxClusters   = min(30, venuesGrouped.shape[0])
    visualizer    = KElbowVisualizer(KMeans(random_state=0), k=(2,maxClusters))

    visualizer.fit(venuesGrouped)
    visualizer.show(figureDir+city["name"]+"DistortionScoreElbow.png")

    numClusters = visualizer.elbow_value_
    if numClusters is None:
        # No elbow was detected, so there is no cluster count to fit with.
        print(city["name"], "- no elbow found, cluster map skipped")
        continue
    model       = KMeans(n_clusters=numClusters, random_state=0).fit(venuesGrouped)

    venuesGrouped["Cluster Label"] = model.labels_

    # The rows of venuesGrouped follow the groupby("Suburb") key order of the venues
    # file, whereas generateFoliumMap pairs the markers with the rows of Data.csv.
    # Look the labels up by suburb name so each suburb receives its own cluster.
    cityVenues     = pd.read_csv(dataDir+city["name"]+"Venues.csv")
    cityData       = pd.read_csv(dataDir+city["name"]+"Data.csv")
    groupedSuburbs = cityVenues.groupby("Suburb").size().index
    labelBySuburb  = pd.Series(model.labels_, index=groupedSuburbs)
    clusterLabels  = cityData["Suburb"].map(labelBySuburb)

    # Suburbs without any venue were not clustered; give them one extra colour.
    numMarkers = numClusters
    if clusterLabels.isna().any():
        clusterLabels = clusterLabels.fillna(numClusters)
        numMarkers    = numClusters + 1

    generateFoliumMap(city, venuesGrouped, numMarkers, clusterLabels.astype(int), "Clusters")

### Create Polynomial Regression Model to Predict Median House Price from Distance to CBD

In [None]:
# Fit, for every city, a second-degree polynomial of price against distance to the CBD.
for city in citiesOfInterest:

    cityData = pd.read_csv(dataDir+city["name"]+"Data.csv")
    X        = cityData["Distance to CBD [km]"].astype(float).values.reshape((cityData.shape[0],1))
    # Prices are stored as text with thousands separators; convert to millions of dollars.
    y        = cityData["Price [$]"].apply(lambda price : float(price.replace(",",""))/1.0e6).astype(float).values
    model    = linear_model.LinearRegression()
    poly     = PolynomialFeatures(degree=2)
    X_train  = poly.fit_transform(X)
    y_train  = y
    # Predictions in the same row order as y, used for the error below.
    y_pred   = model.fit(X_train, y_train).predict(X_train)

    plt.figure(figsize=(10,7))
    plt.scatter(X, y)
    # Fitted curve evaluated on the sorted distances so that it is drawn as one line.
    XX = np.sort(X, axis=None)
    yy = model.predict(poly.transform(XX.reshape(-1, 1)))

    print("\n",city["name"])
    # Compare y with the predictions for the same rows; yy belongs to the sorted
    # distances and is not aligned with y.
    print("Mean squared error:", metrics.mean_squared_error(y, y_pred))


    plt.plot(XX, yy, "-r")
    plt.xlim(0.0, 40.0)
    plt.ylim(0.0,  2.0)
    plt.xlabel("Distance to CBD [km]")
    plt.ylabel("Price [$M]")
    plt.title(city["name"])
    plt.savefig(figureDir+city["name"]+"PriceVsDistanceToCBD.png")
    

### Create Support Vector Regressor to Predict Median House Price from Neighbouring Venues

In [None]:
# Fit, for every city, a support vector regressor of price against the venue counts.
for city in citiesOfInterest:

    cityData               = pd.read_csv(dataDir+city["name"]+"MostCommonVenues.csv")
    # Prices are stored as text with thousands separators; convert to millions of dollars.
    cityData["Price [$M]"] = cityData["Price [$]"].apply(lambda price : float(price.replace(",",""))/1e6)

    # The feature rows follow the groupby("Suburb") key order of the venues file;
    # reorder the prices by suburb name so that X and y describe the same suburbs.
    cityVenues  = pd.read_csv(dataDir+city["name"]+"Venues.csv")
    suburbOrder = cityVenues.groupby("Suburb").size().index

    X = getOneHotVenueCategories(city).values.astype(float)
    y = cityData.set_index("Suburb")["Price [$M]"].reindex(suburbOrder).values

    # Split first so that neither the scaler nor the parameter search sees the test set.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=1)

    scaler  = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test  = scaler.transform(X_test)

    paramGrid = {"kernel": ["linear", "poly", "rbf"],
                 "degree": np.arange(1, 6),
                 "gamma":  ["scale", "auto"],
                 "C":      np.logspace(-4,0,5)}
    svrSearch = GridSearchCV(SVR(), paramGrid, cv=5).fit(X_train, y_train)
    print("SVM best params:", svrSearch.best_params_, ", best score:", svrSearch.best_score_)

    # GridSearchCV refits the best parameter combination on the whole training set.
    model     = svrSearch.best_estimator_

    print(city["name"], "Mean squared error:", metrics.mean_squared_error(y_test, model.predict(X_test)))
