In [1]:
# Generic Libraries being imported
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#Data Set Imported
from sklearn.datasets import fetch_california_housing

In [2]:
data=fetch_california_housing()

In [3]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [4]:
# Independent Data
df=pd.DataFrame(data=data.data,columns=data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
data.data.shape

(20640, 8)

In [6]:
# Dependent Variable
df['Target']=data.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# Exploring Data

In [7]:
import sweetviz as sv
report=sv.analyze(df)
report.show_html("./report.html")

                                             |          | [  0%]   00:00 -> (? left)

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Data Pre-Processing

In [8]:
#Feature Engineering
from geopy.geocoders import Nominatim
geolocator=Nominatim(user_agent='geoapiExercises')

In [9]:
geolocator.reverse("37.88"+" , "+"-122.23").raw['address']

{'city': 'Berkeley',
 'country': 'United States',
 'country_code': 'us',
 'county': 'Alameda County',
 'leisure': 'Ecological Study Area',
 'neighbourhood': 'Panoramic Hill',
 'postcode': '94720-1076',
 'road': 'Panoramic Way',
 'state': 'California'}

In [10]:
def location(cord):
    Latitude=str(cord[0])
    Longitude=str(cord[1])
    
    #returns a dictionary
    location=geolocator.reverse(Latitude +" , "+Longitude).raw['address']
    
    if location.get('road') is None:
        location['road']=None
    
    if location.get('county') is None:
        location['county']=None
    loc_update['county'].append(location['county'])
    loc_update['road'].append(location['road'])

In [11]:
# import pickle
# loc_update={"county":[],"road":[]}

# for i, cord in enumerate(df.iloc[:,6:-1].values):
#     location(cord)
#     pickle.dump(loc_update, open('loc_update.pickle','wb'))
#     if i%100==0:
#       print(i)

In [12]:
import pickle
loc_update=pickle.load(open("./loc_update.pickle","rb"))
loc=pd.DataFrame(loc_update)
loc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   county  15262 non-null  object
 1   road    19498 non-null  object
dtypes: object(2)
memory usage: 322.6+ KB


In [13]:
# Adding new feature to data frame
for i in loc_update.keys():
    df[i]=loc_update[i]
df=df.sample(axis=0, frac=1)


In [15]:
#Dropping latitude adn longitude from data frame
df.drop(labels=["Latitude", "Longitude"],axis=1,inplace=True)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,county,road
6113,1.5,5.0,3.620579,1.016077,819.0,2.633441,1.398,,North Cerritos Avenue
17715,5.614,15.0,5.854737,1.048421,1389.0,2.924211,2.233,Santa Clara County,Mount Prieta Drive
3860,4.0474,34.0,5.508403,0.981092,1233.0,2.590336,3.797,,Morrison Street
4215,2.9392,46.0,3.877404,0.951923,966.0,2.322115,2.303,,Edenhurst Avenue
7885,3.0989,21.0,4.013341,1.075197,4396.0,2.665858,1.714,,Rose Street


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20640 entries, 6113 to 17846
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Target      20640 non-null  float64
 7   county      15262 non-null  object 
 8   road        19498 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.6+ MB


## Using Classification Algorithm to fill the missing data

In [17]:
# applying logistic regression to find the missing values
missing_idx=[]
for i in range(df.shape[0]):
    if df['road'][i] is None:
        missing_idx.append(i)
# Independent Parameters
missing_road_x_train=np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i not in missing_idx])
# Dependent Parameters
missing_road_y_train=np.array([ df['road'][i] for i in range(df.shape[0]) if i not in missing_idx])
    
missing_road_x_test=np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i in missing_idx])

In [18]:
from sklearn.linear_model import SGDClassifier

#Initiating the model
model_1=SGDClassifier()

#Training Model
model_1.fit(missing_road_x_train,missing_road_y_train)

missing_road_y_pred=model_1.predict(missing_road_x_test)

In [19]:
np.unique(missing_road_y_pred)

array(['20th Street', '30th Avenue', '55th Avenue', 'Alemany Boulevard',
       'Alpine Circle', 'Bradford Street', 'Broadway', 'Cedar Avenue',
       'East 106th Street', 'East Tulare Avenue', 'North Oakland Avenue',
       'Parmelee Avenue', 'Plover Circle', 'Robert Dollar Drive',
       'Rosedale Avenue', 'Taylor Road', 'West 135th Street',
       'West Tasman Drive'], dtype='<U77')

In [20]:
# add the modeled data back to the data frame
for n,i in enumerate(missing_idx):
  df['road'][i]=missing_road_y_pred[n]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()

df['road']=le.fit_transform(df['road'])

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,county,road
6113,1.5,5.0,3.620579,1.016077,819.0,2.633441,1.398,,5379
17715,5.614,15.0,5.854737,1.048421,1389.0,2.924211,2.233,Santa Clara County,5151
3860,4.0474,34.0,5.508403,0.981092,1233.0,2.590336,3.797,,5122
4215,2.9392,46.0,3.877404,0.951923,966.0,2.322115,2.303,,2707
7885,3.0989,21.0,4.013341,1.075197,4396.0,2.665858,1.714,,6686


In [22]:
# applying logistic regression to find the missing values
missing_idx=[]
for i in range(df.shape[0]):
    if df['county'][i] is None:
        missing_idx.append(i)
# Independent Parameters
missing_county_x_train=np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i not in missing_idx])
# Dependent Parameters
missing_county_y_train=np.array([ df['county'][i] for i in range(df.shape[0]) if i not in missing_idx])
    
missing_county_x_test=np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i in missing_idx])

In [23]:
missing_county_y_train

array(['Alameda County', 'Alameda County', 'Alameda County', ...,
       'Yuba County', 'Yuba County', 'Yuba County'], dtype='<U22')

In [24]:
#Training Model
model_2=SGDClassifier()

model_2.fit(missing_county_x_train,missing_county_y_train)

missing_county_y_pred=model_2.predict(missing_county_x_test)

In [25]:
np.unique(missing_county_y_pred)

array(['Alameda County', 'Butte County', 'Fresno County',
       'Los Angeles County', 'Orange County', 'Placer County',
       'Riverside County', 'San Luis Obispo County', 'San Mateo County',
       'Sutter County'], dtype='<U22')

In [26]:
# add the modeled data back to the data frame
for n,i in enumerate(missing_idx):
    df['county'][i]=missing_county_y_pred[n]

df['county']=le.fit_transform(df['county'])

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,county,road
6113,1.5,5.0,3.620579,1.016077,819.0,2.633441,1.398,10,5379
17715,5.614,15.0,5.854737,1.048421,1389.0,2.924211,2.233,45,5151
3860,4.0474,34.0,5.508403,0.981092,1233.0,2.590336,3.797,0,5122
4215,2.9392,46.0,3.877404,0.951923,966.0,2.322115,2.303,0,2707
7885,3.0989,21.0,4.013341,1.075197,4396.0,2.665858,1.714,43,6686


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20640 entries, 6113 to 17846
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Target      20640 non-null  float64
 7   county      20640 non-null  int64  
 8   road        20640 non-null  int64  
dtypes: float64(7), int64(2)
memory usage: 2.2 MB


## Understanding which model to use

In [28]:
# Dependent Values
y=df.iloc[:,6].values
df1=df
df.drop(labels=['Target'],axis=1,inplace=True)

In [29]:
#Independent Values
x=df.iloc[:,:].values

In [30]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [31]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [32]:
# Model Prediction

y_pred=model.predict(x_test)

In [33]:
#Model Accuracy

from sklearn.metrics import r2_score

r2_score(y_test, y_pred)*100

72.57677775762006