In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#Data Sets

from sklearn.datasets import fetch_california_housing

In [3]:
#get dataset

data = fetch_california_housing()

In [4]:
data.DESCR


'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 20640\n\n    :Number of Attributes: 8 numeric, predictive attributes and the target\n\n    :Attribute Information:\n        - MedInc        median income in block group\n        - HouseAge      median house age in block group\n        - AveRooms      average number of rooms per household\n        - AveBedrms     average number of bedrooms per household\n        - Population    block group population\n        - AveOccup      average number of household members\n        - Latitude      block group latitude\n        - Longitude     block group longitude\n\n    :Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000

In [5]:
#Independent data

df = pd.DataFrame(data =data.data, columns=data.feature_names)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [6]:
#Dependent data

df['Target'] = data.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [7]:
df.iloc[1][7]

-122.22

# Exploratory Data Analysis

In [8]:
! pip install sweetviz



In [9]:
import sweetviz as sv 


In [10]:
report = sv.analyze(df)
report.show_html("./report.html")

                                             |          | [  0%]   00:00 -> (? left)

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.




```
# This is formatted as code
```

# Data Preprocessing

In [11]:
##Feature Engineering

from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent = 'geoapiExcercies')

In [12]:
geolocator.reverse(df.iloc[1][6].astype(str)+" , " +df.iloc[1][7].astype(str)).raw

{'address': {'city': 'Oakland',
  'country': 'United States',
  'country_code': 'us',
  'county': 'Alameda County',
  'leisure': 'Grizzly Peak Open Space',
  'postcode': '94618',
  'road': 'Charing Cross Road',
  'state': 'California'},
 'boundingbox': ['37.8561428', '37.8646719', '-122.223868', '-122.2132544'],
 'display_name': 'Grizzly Peak Open Space, Charing Cross Road, Oakland, Alameda County, California, 94618, United States',
 'lat': '37.8603542',
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'lon': '-122.21859099550318',
 'osm_id': 9244814,
 'osm_type': 'relation',
 'place_id': 283503815}

In [13]:
df2= df.iloc[:1000]   #Only for 1000 rows
df2

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
995,4.8624,11.0,5.680000,1.044706,5826.0,2.741647,37.71,-121.75,1.924
996,9.1531,25.0,5.811765,0.952941,254.0,2.988235,37.74,-121.77,4.188
997,4.7361,22.0,6.080220,1.036264,2474.0,2.718681,37.70,-121.80,2.168
998,5.4324,17.0,5.975831,0.965257,2222.0,3.356495,37.69,-121.80,2.155


In [14]:
def location(cord):
  latitude = str(cord[0])
  longitude = str(cord[1])

  location = geolocator.reverse(latitude+","+longitude).raw['address']

  # if the values are missing replace by empty string:
  if location.get('road') is None:
    location['road'] = None

  if location.get('county') is None:
    location['county']= None
  loc_update['Road'].append(location['road'])
  loc_update['county'].append(location['county'])



In [15]:
# import pickle

# loc_update = {
#     "county": [],
#     "Road":[],
#     "Neighbourhood":[]
# }
# for i,cord in enumerate(df2.iloc[:,6:-1].values):

#   location(cord)
#   #continuously Reading our data and Saving it on the go !!!
#   pickle.dump(loc_update, open('loc_update.pickle','wb'))

#   if i%100 ==0:
#     print(i)

In [16]:
import pickle
loc_update  = pickle.load(open('/content/drive/MyDrive/Deep Learning/loc_update.pickle', 'rb'))

In [17]:
loc_update.keys()

dict_keys(['county', 'Road', 'Neighbourhood'])

In [18]:
len(loc_update['county'])

1000

In [19]:
len(loc_update['Road'])

1000

In [20]:
len(loc_update['Neighbourhood'])

0

In [21]:
loc_update.pop('Neighbourhood')

[]

In [22]:
loc_update.keys()

dict_keys(['county', 'Road'])

In [23]:
loc = pd.DataFrame(loc_update)

In [24]:
loc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   county  1000 non-null   object
 1   Road    986 non-null    object
dtypes: object(2)
memory usage: 15.8+ KB


In [25]:
# Add the new features to dataframe

for i in loc_update.keys():
  df2[i] = loc_update[i]

df2= df2.sample(axis=0, frac=1)
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target,county,Road
900,4.75,35.0,5.513317,1.016949,1353.0,3.276029,37.53,-121.97,1.97,Alameda County,Grimmer Boulevard
538,2.5762,29.0,4.048704,1.095051,3741.0,2.938727,37.78,-122.28,1.734,Alameda County,Ralph Appezzato Memorial Parkway
972,7.4353,7.0,7.762408,1.052889,3926.0,3.194467,37.67,-121.9,3.898,Alameda County,Corte Trancas
284,3.6875,42.0,5.707865,0.973783,651.0,2.438202,37.78,-122.17,1.573,Alameda County,
887,2.9074,16.0,3.492906,1.10247,4649.0,2.442985,37.55,-121.99,2.138,Alameda County,Fremont Boulevard


In [26]:
#drop lattitude, longitude and neighbourhood

df2 = df2.drop(labels=['Latitude','Longitude'], axis=1)

In [27]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 900 to 129
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      1000 non-null   float64
 1   HouseAge    1000 non-null   float64
 2   AveRooms    1000 non-null   float64
 3   AveBedrms   1000 non-null   float64
 4   Population  1000 non-null   float64
 5   AveOccup    1000 non-null   float64
 6   Target      1000 non-null   float64
 7   county      1000 non-null   object 
 8   Road        986 non-null    object 
dtypes: float64(7), object(2)
memory usage: 78.1+ KB


In [48]:
values, counts = np.unique(df2, return_counts=True)
len(df2)

1000

In [28]:
# applying logistic regression to find missing values

missing_idx = []

for i in range(df2.shape[0]):
  if df2['Road'][i] is None:
    missing_idx.append(i)

#Independent parameters
missing_Road_x_train = np.array([ [df2['MedInc'][i], df2['AveRooms'][i], df2['AveBedrms'][i] ] for i in range(df2.shape[0])  if i not in missing_idx])

#Dependent Parameter
missing_Road_y_train = np.array([ df2['Road'][i]  for i in range(df2.shape[0]) if i not in missing_idx])

missing_Road_x_test = np.array([ [df2['MedInc'][i], df2['AveRooms'][i], df2['AveBedrms'][i] ]  for i in range(df2.shape[0]) if i not in missing_idx ])

missing_Road_x_train

array([[8.3252    , 6.98412698, 1.02380952],
       [8.3014    , 6.23813708, 0.97188049],
       [7.2574    , 8.28813559, 1.07344633],
       ...,
       [4.7361    , 6.08021978, 1.03626374],
       [5.4324    , 5.97583082, 0.9652568 ],
       [4.9375    , 5.97492163, 1.10031348]])

In [29]:
from sklearn.linear_model import SGDClassifier

# model initialization

model_1 = SGDClassifier()

#Model Training

model_1.fit(missing_Road_x_train, missing_Road_y_train)

missing_Road_y_pred = model_1.predict(missing_Road_x_test)

In [31]:
# add the model back to df2

for n,i in enumerate(missing_idx):
  df2['Road'][i] = missing_Road_y_pred[n]

# Label Encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df2['Road']= le.fit_transform(df2['Road'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [32]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 900 to 129
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      1000 non-null   float64
 1   HouseAge    1000 non-null   float64
 2   AveRooms    1000 non-null   float64
 3   AveBedrms   1000 non-null   float64
 4   Population  1000 non-null   float64
 5   AveOccup    1000 non-null   float64
 6   Target      1000 non-null   float64
 7   county      1000 non-null   object 
 8   Road        1000 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 118.1+ KB


In [33]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df2['county']= le.fit_transform(df2['county'])


Undertanding which model to use 

# Implementation of Random Forest

In [86]:
df2

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,county,Road
900,4.7500,35.0,5.513317,1.016949,1353.0,3.276029,1.970,0,156
538,2.5762,29.0,4.048704,1.095051,3741.0,2.938727,1.734,0,280
972,7.4353,7.0,7.762408,1.052889,3926.0,3.194467,3.898,0,94
284,3.6875,42.0,5.707865,0.973783,651.0,2.438202,1.573,0,291
887,2.9074,16.0,3.492906,1.102470,4649.0,2.442985,2.138,0,137
...,...,...,...,...,...,...,...,...,...
939,3.7364,34.0,4.959315,1.023555,1524.0,3.263383,1.904,0,104
612,4.0662,52.0,5.578125,1.046875,440.0,2.291667,2.042,0,219
122,6.3434,52.0,6.947891,1.019851,1061.0,2.632754,3.736,0,361
166,1.6121,41.0,3.561743,1.135593,1062.0,2.571429,1.714,0,262


In [104]:
# Dependent features

y = df2.iloc[:, -3].values
df2 = df2.drop(labels='Target', axis=1)

x= df2.iloc[:,:].values

In [105]:
x.shape

(1000, 8)

In [106]:
from sklearn.model_selection import train_test_split

x_train, xtest, y_train, y_test = train_test_split(x,y, test_size= 0.2, random_state =42)

In [108]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(x_train, y_train)

RandomForestRegressor()

In [109]:
# Model Prediction

y_pred = model.predict(xtest)


In [113]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)*100

75.7146073830167

In [112]:
#Add our own data

In [114]:
inp = np.array([4.7500, 	35.0, 	5.513317, 	1.016949, 	1353.0 ,	3.276029, 	0, 	156])

In [118]:
inp = inp.reshape((1,-1))
model.predict(inp)

array([1.98175])

Summary:


Used sklearn housing dataset  
EDA using sweetviz  
Location is detected from Latitude and Longitude   
Filled up incomplete data using Logistic regression  
Trained a model and tested it using Random Forest   