In [342]:
# Generic Libraries being imported
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#Data Set Imported
from sklearn.datasets import fetch_california_housing

In [343]:
data=fetch_california_housing()

In [344]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [345]:
# Independent Data
df=pd.DataFrame(data=data.data,columns=data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [346]:
df.isnull()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
20635,False,False,False,False,False,False,False,False
20636,False,False,False,False,False,False,False,False
20637,False,False,False,False,False,False,False,False
20638,False,False,False,False,False,False,False,False


In [347]:
# Dependent Variable
df['Target']=data.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# Exploring Data

In [313]:
import sweetviz as sv
report=sv.analyze(df)
report.show_html("./report.html")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=10.0), HTML(value='')), l…


Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Data Pre-Processing

In [348]:
#Feature Engineering
from geopy.geocoders import Nominatim
geolocator=Nominatim(user_agent='geoapiExercises')

In [349]:
geolocator.reverse("37.89"+" , "+"-122.23").raw['address']

{'road': 'Arroyo Trail',
 'county': 'Contra Costa County',
 'state': 'California',
 'postcode': '94708',
 'country': 'United States',
 'country_code': 'us'}

In [350]:
def location(cord):
    Latitude=str(cord[0])
    Longitude=str(cord[1])
    
    #returns a dictionary
    location=geolocator.reverse(Latitude +" , "+Longitude).raw['address']
    
    if location.get('road') is None:
        location['road']=None
    
    if location.get('county') is None:
        location['county']=None
    loc_update['county'].append(location['county'])
    loc_update['road'].append(location['road'])

In [351]:
# import pickle
# loc_update={"county":[],"road":[]}

# for i, cord in enumerate(df.iloc[:,6:-1].values):
#     location(cord)
#     pickle.dump(loc_update, open('loc_update.pickle','wb'))
#     if i%100==0:
#       print(i)

In [352]:
import pickle
loc_update=pickle.load(open("./loc_update.pickle","rb"))
loc=pd.DataFrame(loc_update)
loc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   county  15262 non-null  object
 1   road    19498 non-null  object
dtypes: object(2)
memory usage: 322.6+ KB


In [353]:
# Adding new feature to data frame
for i in loc_update.keys():
    df[i]=loc_update[i]
df=df.sample(axis=0, frac=1)


In [354]:
#Dropping latitude adn longitude from data frame
df.drop(labels=["Latitude", "Longitude"],axis=1,inplace=True)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,county,road
1603,6.2275,34.0,6.215302,0.975089,2865.0,2.548932,3.944,Contra Costa County,Lakeside Nature Trail
9667,2.35,14.0,5.34188,1.162393,721.0,3.081197,0.957,Alpine County,Golden Gate Road
3654,2.425,17.0,3.479127,1.147059,2636.0,2.500949,1.755,,Saticoy Street
19957,3.6343,5.0,5.48227,1.035461,1283.0,3.033097,0.954,Tulare County,North Laspina Street
6930,2.9321,14.0,4.778058,1.021438,2551.0,3.216898,1.442,,Rosemead Boulevard


In [355]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20640 entries, 1603 to 5234
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Target      20640 non-null  float64
 7   county      15262 non-null  object 
 8   road        19498 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.6+ MB


## Using Classification Algorithm to fill the missing data

In [356]:
# applying logistic regression to find the missing values
missing_idx=[]
for i in range(df.shape[0]):
    if df['road'][i] is None:
        missing_idx.append(i)
# Independent Parameters
missing_road_x_train=np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i not in missing_idx])
# Dependent Parameters
missing_road_y_train=np.array([ df['road'][i] for i in range(df.shape[0]) if i not in missing_idx])
    
missing_road_x_test=np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i in missing_idx])

In [357]:
from sklearn.linear_model import *

#Initiating the model
model_1=SGDClassifier()
#model_1 = LogisticRegression()
#Training Model
model_1.fit(missing_road_x_train,missing_road_y_train)

missing_road_y_pred=model_1.predict(missing_road_x_test)

In [358]:
np.unique(missing_road_y_pred)

array(['11th Avenue', '4th Street', 'Blue Horizon', 'Center Avenue',
       'Dog Bar Road', 'East 124th Street', 'Hurlingham Avenue',
       'John Street', 'Lemona Avenue', 'Los Angeles River Bikeway',
       'Louisiana Street', 'Merrill Drive', 'North Catalina Avenue',
       'North Fisher Street', 'North Fuller Avenue', 'Parkhills Avenue',
       'Serrano Avenue', 'South Citrus Avenue'], dtype='<U77')

In [359]:
# add the modeled data back to the data frame
for n,i in enumerate(missing_idx):
  df['road'][i]=missing_road_y_pred[n]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['road'][i]=missing_road_y_pred[n]


In [360]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()

df['road']=le.fit_transform(df['road'])

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,county,road
1603,6.2275,34.0,6.215302,0.975089,2865.0,2.548932,3.944,Contra Costa County,4242
9667,2.35,14.0,5.34188,1.162393,721.0,3.081197,0.957,Alpine County,3332
3654,2.425,17.0,3.479127,1.147059,2636.0,2.500949,1.755,,6967
19957,3.6343,5.0,5.48227,1.035461,1283.0,3.033097,0.954,Tulare County,5471
6930,2.9321,14.0,4.778058,1.021438,2551.0,3.216898,1.442,,6692


In [361]:
# applying logistic regression to find the missing values
missing_idx=[]
for i in range(df.shape[0]):
    if df['county'][i] is None:
        missing_idx.append(i)
# Independent Parameters
missing_county_x_train=np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i not in missing_idx])
# Dependent Parameters
missing_county_y_train=np.array([ df['county'][i] for i in range(df.shape[0]) if i not in missing_idx])
    
missing_county_x_test=np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i in missing_idx])

In [362]:
missing_county_y_train

array(['Alameda County', 'Alameda County', 'Alameda County', ...,
       'Yuba County', 'Yuba County', 'Yuba County'], dtype='<U22')

In [363]:
#Training Model
model_2=SGDClassifier()

model_2.fit(missing_county_x_train,missing_county_y_train)

missing_county_y_pred=model_2.predict(missing_county_x_test)

In [364]:
np.unique(missing_county_y_pred)

array(['Mono County', 'Placer County', 'San Francisco',
       'Santa Barbara County', 'Yuba County'], dtype='<U22')

In [365]:
# add the modeled data back to the data frame
for n,i in enumerate(missing_idx):
    df['county'][i]=missing_county_y_pred[n]

df['county']=le.fit_transform(df['county'])

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['county'][i]=missing_county_y_pred[n]


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,county,road
1603,6.2275,34.0,6.215302,0.975089,2865.0,2.548932,3.944,6,4242
9667,2.35,14.0,5.34188,1.162393,721.0,3.081197,0.957,1,3332
3654,2.425,17.0,3.479127,1.147059,2636.0,2.500949,1.755,44,6967
19957,3.6343,5.0,5.48227,1.035461,1283.0,3.033097,0.954,56,5471
6930,2.9321,14.0,4.778058,1.021438,2551.0,3.216898,1.442,44,6692


In [366]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20640 entries, 1603 to 5234
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Target      20640 non-null  float64
 7   county      20640 non-null  int32  
 8   road        20640 non-null  int32  
dtypes: float64(7), int32(2)
memory usage: 2.0 MB


## Understanding which model to use

In [367]:
# Dependent Values
y=df.iloc[:,6].values
df1=df
df.drop(labels=['Target'],axis=1,inplace=True)

In [368]:
#Independent Values
x=df.iloc[:,:].values

In [388]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

In [389]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

In [396]:
from sklearn.ensemble import *
#model = RandomForestRegressor(n_estimators=10,max_depth=50,random_state=15,bootstrap=False,verbose=10)
model = HistGradientBoostingRegressor(max_depth=8,random_state=42)
model.fit(x_train,y_train)

HistGradientBoostingRegressor(max_depth=8, random_state=42)

In [397]:
# Model Prediction
y_pred=model.predict(x_test)

In [398]:
#Model Accuracy

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)*100

78.43515492473406