# To Do & Goals
- I suspect that some of the listings are not homes. I will probably need to create an ML model to classify homes versus non-homes.
- Explore missingno documentation.  Can I create a function that also returns the % of values that are missing per row?
- Can I geocode street addresses into lat & long values?  I might be able to use a Google API (with an account) for this.

In [None]:
# working with data
import os
import numpy as np
import pandas as pd
#import geopandas as gpd # the library that lets us read in shapefiles
#import geoplot as gplt # for plotting maps #having trouble getting this to install

# visualization
from termcolor import colored # colored text
import missingno as msno # visuzlise missing data in a matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Record and report the directory the notebook is running from
cwd = os.getcwd()
print(f"current directory = {cwd}")

# os.chdir("../NotEssentialData")
# print("new directory = " + os.path.abspath(os.curdir))

## Input Data
- https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset?resource=download
This dataset contains real-estate listings in the US, broken down by state and zip code. The data was collected via web scraping using Python libraries.

In [None]:
# Load the realtor listings CSV into the raw dataframe
fileInput = "data/realtor-data.csv"
df = pd.read_csv(fileInput)
df = df.reset_index(drop=True)
print(df.shape[0])  # number of rows read
df.head(1)

## Explore the Data

In [None]:
# Print pandas' built-in summary of the raw dataframe
df.info()

In [None]:
# Visualize where values are missing across the raw dataframe

msno.matrix(df, fontsize=11, figsize=(10, 5))

In [None]:
# Pair plot of the features against one another.
# Plotting every row of this large dataset was not completing, so a
# reproducible random sample is plotted instead. Raise
# PAIRPLOT_SAMPLE_SIZE if more detail is needed.
PAIRPLOT_SAMPLE_SIZE = 5000
pairplot_sample = df.sample(n=min(len(df), PAIRPLOT_SAMPLE_SIZE), random_state=42)
sns.pairplot(pairplot_sample)

In [None]:
# function to show the distribution of numeric features
def ShowDistributionFuc(var):
    '''
    Print summary statistics and draw a histogram / boxplot combo.

    This function only works with numeric values.  Missing values (NaN)
    are dropped before anything is computed or plotted, because NaNs in
    the data leave the matplotlib boxplot empty.

    Parameters
    ----------
    var : pandas.Series
        Numeric values to summarize.

    Returns
    -------
    None
        The statistics are printed and the figure is shown; nothing is
        returned, so do not wrap the call in print().
    '''

    # Work on the non-missing values only
    values = var.dropna()
    if values.empty:
        # mode() of an empty Series is empty, so indexing it would raise
        print(colored('No non-missing values to summarize.', 'red'))
        return

    # (label, value, color) - the color is shared by the printed text and
    # by the matching reference line on the histogram
    summary = [
        ('Min', values.min(), 'grey'),
        ('Mean', values.mean(), 'cyan'),
        ('Median', values.median(), 'red'),
        ('Mode', values.mode().iloc[0], 'yellow'),
        ('Max', values.max(), 'grey'),
    ]
    for label, value, color in summary:
        print(colored(label + ': ' + str(value), color))

    # Create a figure for 2 subplots (2 rows, 1 column)(histogram & boxplot)
    fig, ax = plt.subplots(2, 1, figsize=(10, 4))
    fig.suptitle('Data Distribution')

    # Plot the histogram, add lines for min, mean, median, mode and max
    ax[0].hist(values)
    ax[0].set_ylabel('Frequency')
    for _, value, color in summary:
        ax[0].axvline(x=value, color=color, linestyle='dashed', linewidth=2)

    # Plot the boxplot
    ax[1].boxplot(values, vert=False)
    ax[1].set_xlabel('Value')

    # plt.show() instead of fig.show(): the latter only warns on non-GUI
    # backends
    plt.show()

In [None]:
# Plot the distribution of the 'price' column

ShowDistributionFuc(var=df['price'])

In [None]:
# Inspect high price rows
# Kept in its own variable: the name `df1` is used for the cleaned working
# dataframe in the "Clean the Data" section, and re-running this cell must
# not overwrite it.
high_price_rows = df[df['price'] >= 875000000]
high_price_rows

In [None]:
# Plot the distribution of the 'bed' column

ShowDistributionFuc(var=df['bed'])

In [None]:
# Plot the distribution of the 'bath' column

ShowDistributionFuc(var=df['bath'])

In [None]:
# Plot the distribution of the 'house_size' column

ShowDistributionFuc(var=df['house_size'])

## Clean the Data
- create new working dataframe
- remove duplicate entries
- remove households with a price >= $1,000,000

In [None]:
# Working dataframe: the raw listings with exact duplicate rows removed
# and the index renumbered

df1 = df.drop_duplicates()
df1 = df1.reset_index(drop=True)
print(df1.shape[0])
df1.head(1)

In [None]:
# remove households with price >= $1,000,000
df1 = df1[df1['price'] < 1000000]
print(len(df1))
# ShowDistributionFuc prints and plots by itself and returns nothing, so it
# is called directly; wrapping it in print() only adds a stray "None".
ShowDistributionFuc(df1['price'])
df1.head(1)

In [None]:
# Distribution of 'bed' after cleaning.
# Called directly: the function returns nothing, so print() would only
# add a stray "None" to the output.
ShowDistributionFuc(df1['bed'])

In [None]:
# Inspect listings reporting more than 10 bedrooms
many_beds_mask = df1['bed'] > 10
df2 = df1[many_beds_mask]
df2.head()

## Clustering
- Use Principal Component Analysis (PCA) to analyze the relationships between the features and summarize each observation as coordinates for two principal components.

In [None]:
# Build the modelling dataframe: numeric feature columns only, keeping
# just the rows where every one of them is present.
# NOTE(review): this starts from the raw `df`, not the cleaned `df1` -
# confirm that is intended.
featureColumns = ['price', 'bed', 'bath', 'acre_lot', 'house_size']
dfml = df[featureColumns].dropna().reset_index(drop=True)
print(dfml.shape[0])
dfml.head(1)

In [None]:
# Select the first five columns of dfml as the feature matrix
features = dfml.iloc[:, 0:5]
features

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Put the numeric features on a common scale before running PCA
feature_names = dfml.columns[:5]
scaled_features = MinMaxScaler().fit_transform(features[feature_names])

# Fit a two-component PCA, then project every observation onto it
pca = PCA(n_components=2)
pca = pca.fit(scaled_features)
features_2d = pca.transform(scaled_features)
features_2d[:10]

In [None]:
# Scatter the observations on their two principal-component coordinates

plt.scatter(features_2d[:, 0], features_2d[:, 1])
plt.title('Data')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()

In [None]:
# # K-Means Clustering
# # Let's try using K-Means on our seeds data with a K value of 3.

# from sklearn.cluster import KMeans

# # Create a model based on 3 centroids
# model = KMeans(n_clusters=3, init='k-means++', n_init=100, max_iter=1000)
# # Fit to the data and predict the cluster assignments for each data point
# km_clusters = model.fit_predict(features.values)
# # View the cluster assignments
# km_clusters

# import matplotlib.pyplot as plt

# %matplotlib inline

# plt.scatter(features_2d[:,0],features_2d[:,1])
# plt.xlabel('Dimension 1')
# plt.ylabel('Dimension 2')
# plt.title('Data')
# plt.show()

In [None]:
# # Visualize those cluster assignments

# import matplotlib.pyplot as plt
# %matplotlib inline

# def plot_clusters(samples, clusters):
#     col_dic = {0:'blue',1:'green',2:'orange'}
#     mrk_dic = {0:'*',1:'x',2:'+'}
#     colors = [col_dic[x] for x in clusters]
#     markers = [mrk_dic[x] for x in clusters]
#     for sample in range(len(clusters)):
#         plt.scatter(samples[sample][0], samples[sample][1], color = colors[sample], marker=markers[sample], s=100)
#     plt.xlabel('Dimension 1')
#     plt.ylabel('Dimension 2')
#     plt.title('Assignments')
#     plt.show()

# plot_clusters(features_2d, km_clusters)

## Convert street address to lat & long

In [None]:
# convert street address to lat & long
# RateLimiter is not used yet; presumably it is meant for throttling bulk
# lookups later - TODO confirm.
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim  # was missing, so Nominatim was undefined

# Plain ASCII quotes: the curly quotes pasted here before were a SyntaxError
locator = Nominatim(user_agent="myGeocoder")
location = locator.geocode("Champ de Mars, Paris, France")

In [None]:
import os

import requests

# Google's Geocoding API needs an API key, so read one from the
# environment (GOOGLE_MAPS_API_KEY) rather than hard-coding it in the
# notebook. Without a key the request is still sent, as before.
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
geocode_params = {'address': '1600 Amphitheatre Parkway, Mountain View, CA'}
google_api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if google_api_key:
    geocode_params['key'] = google_api_key

# `params` lets requests URL-encode the address; the timeout keeps the cell
# from hanging if the service does not answer.
response = requests.get(GEOCODE_URL, params=geocode_params, timeout=10)
response.raise_for_status()  # surface HTTP errors instead of parsing an error page
resp_json_payload = response.json()
resp_json_payload
#print(resp_json_payload['results'][0]['geometry']['location'])