In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
# Load the California Housing dataset that ships with scikit-learn.
# as_frame=True asks for pandas objects, so `data.data` is a DataFrame of
# features and `data.target` holds the matching target values.
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True)

In [4]:
# Print the dataset's bundled description: instance count, attribute
# meanings, missing-value status and the definition of the target.
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [5]:
# Independent variables (the eight predictive features).
# `data.data` is already a DataFrame with the feature names as columns
# (the dataset was loaded with as_frame=True), so re-wrapping it in
# pd.DataFrame(...) is redundant. Take an explicit copy instead: later cells
# add columns to `data_frame`, and the copy guarantees the frame stored on
# `data` is never affected by those edits.
data_frame = data.data.copy()

In [6]:
# Display the raw feature table held by the loaded dataset object
# (predictors only; the target is stored separately in `data.target`).
data.data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [7]:
# Preview the first rows of the working feature frame.
data_frame.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
# Append the regression target as a new column of the working frame.
# Per the dataset description printed above, the target is the median house
# value of the district, expressed in units of $100,000
# (e.g. 4.526 corresponds to $452,600).
data_frame['Target/ House Price'] = data.target

In [9]:
# Preview the first rows again to confirm the target column is now present.
data_frame.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target/ House Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [10]:
# Inspect the latitude column (one value per block group).
data_frame["Latitude"]

0        37.88
1        37.86
2        37.85
3        37.85
4        37.85
         ...  
20635    39.48
20636    39.49
20637    39.43
20638    39.43
20639    39.37
Name: Latitude, Length: 20640, dtype: float64

In [11]:
# Build an automated exploratory-data-analysis report for the whole frame
# (features plus target) with sweetviz.
import sweetviz as sv
report = sv.analyze(data_frame)
# Save the report as ./Report.html, relative to the current working directory.
report.show_html("./Report.html")

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)


Report ./Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [12]:
# Data Preprocessing and Feature Engineering 

In [13]:
# Create a Nominatim geocoder client (via geopy); it is used below to
# reverse-geocode latitude/longitude pairs into street-level locations.
from geopy.geocoders import Nominatim
# NOTE(review): 'geoapiExercises' looks like a generic tutorial user agent;
# consider an application-specific value — confirm against the Nominatim
# usage policy.
geolocater = Nominatim(user_agent = 'geoapiExercises')

In [14]:
# Sanity check: reverse-geocode the first row's coordinates, passed as a
# single "latitude, longitude" string.
geolocater.reverse("37.88, -122.23")

Location(Ecological Study Area, Grizzly Peak Boulevard, Oakland, Alameda County, California, 94720, United States, (37.87563745, -122.22856355341203, 0.0))

In [15]:
# Inspect the longitude column (one value per block group).
data_frame['Longitude']

0       -122.23
1       -122.22
2       -122.24
3       -122.25
4       -122.25
          ...  
20635   -121.09
20636   -121.21
20637   -121.22
20638   -121.32
20639   -121.24
Name: Longitude, Length: 20640, dtype: float64

In [16]:

# Build a "latitude,longitude" text key for every row by joining the string
# forms of the two coordinate columns with a comma.
data_frame['full_address'] = (
    data_frame['Latitude']
    .map(str)
    .str.cat(data_frame['Longitude'].map(str), sep=",")
)

In [17]:
# Preview the first rows to check the newly added `full_address` column.
data_frame.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target/ House Price,full_address
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,"37.88,-122.23"
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,"37.86,-122.22"
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,"37.85,-122.24"
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,"37.85,-122.25"
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,"37.85,-122.25"


In [18]:
# Pull both coordinate columns out as plain Python lists
# (note: x holds latitudes, y holds longitudes) ...
x, y = (data_frame[column].to_list() for column in ("Latitude", "Longitude"))
# ... and pair them row by row into (latitude, longitude) tuples.
coords = [*zip(x, y)]
# Show the second pair as a spot check.
coords[1]

(37.86, -122.22)

In [19]:
# Reverse-geocode the second (latitude, longitude) pair, this time passing
# the tuple directly instead of a formatted string.
geolocater.reverse(coords[1])

Location(Grizzly Peak Open Space, Caldecott Lane, Parkwoods Condominiums, Oakland, Alameda County, California, 94618, United States, (37.8603542, -122.21859099550318, 0.0))

In [20]:
import folium

# Keep just the two coordinate columns, in (latitude, longitude) order.
locations = data_frame.loc[:, ['Latitude', 'Longitude']]
# Convert to a nested list [[lat, lon], ...], the shape folium expects.
locationlist = locations.to_numpy().tolist()
# Number of points that will be drawn (one per row).
len(locationlist)

20640

In [21]:
# Base map centred on the first rows' area, zoomed out to show the state.
# NOTE: the name `map` shadows Python's builtin of the same name; it is kept
# because the following cell refers to it.
map = folium.Map(
    location=[37.84, -122.25],
    zoom_start=5,
    control_scale=True,
    width='50%',
    height='60%',
)
map

In [22]:
# Draw every block group on the map.
# The previous version created one folium.Marker per row (20,640 of them)
# through an index loop, which embeds a separate marker object per point in
# the rendered HTML and makes the notebook output huge and slow to display.
# FastMarkerCluster takes the whole [[lat, lon], ...] list in one call and
# builds/clusters the markers in the browser, so the map stays responsive
# while still exposing individual points when zoomed in.
from folium.plugins import FastMarkerCluster

FastMarkerCluster(data=locationlist).add_to(map)
map