### House Price Prediction ###

In [1]:
# Generic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Data set
from sklearn.datasets import fetch_california_housing

In [3]:
# Load data set
data = fetch_california_housing()

In [4]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [5]:
# Load data to a dataframe (Independent Data)
df = pd.DataFrame(data = data.data[:1000], columns = data.feature_names) ## get first 2500 data rows
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
data.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [7]:
# Adding target column to the dataframe (Dependent Data)
df['Target'] = data.target[:1000]

In [8]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


#### Exploratory Data Analysis ####

**Exploratory Data Analysis (EDA) is an approach that is used to analyze the data and discover trends, patterns, or check assumptions in data with the help of statistical summaries and graphical representations². EDA refers to the critical process of performing initial investigations on data so as to discover patterns, to spot anomalies, to test hypothesis and to check assumptions with the help of summary statistics and graphical representations. **

In [9]:
import sweetviz as sv

In [10]:
report = sv.analyze(df)
report.show_html("./report.html")

                                             |                                             | [  0%]   00:00 ->…

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


#### Data Pre-Processing ####

In [11]:
data.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [12]:
# Fearture Engineering
from geopy.geocoders import Nominatim

In [13]:
geolocator = Nominatim(user_agent='geoapiExercises')

In [14]:
# test - my code
# location_list = []
# for i in range(df.shape[0]-20600):
#     location_list.append(geolocator.reverse(str(df['Latitude'][i])+','+str(df['Longitude'][i]))[0])
# print(location_list)

In [15]:
def location(cord):
    Lat = str(cord[0])
    Lon = str(cord[1])
    
    location = geolocator.reverse(Lat+','+Lon).raw['address'] # returns a dictionary
#     print(location)
    
    # if a values are missing replace them by a empty string
    
    if location.get('road') is None:
        location['road'] = None
        
    if location.get('county') is None:
        location['county'] = None
        
    updated_location['County'].append(location['county'])
    updated_location['Road'].append(location['road'])

In [16]:
# Create the pickle file

# import pickle
# updated_location = {"County":[],"Road":[]}

# for i,cord in enumerate(df.iloc[:,6:-1].values):
#     location(cord)
    
#     #continuously reading and saving
#     pickle.dump(updated_location, open('updated_location.pickle','wb'))
    
#     if i%100 == 0:
#         print(i)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980


In [16]:
# load the pickale module
import pickle

updated_location = pickle.load(open("updated_location.pickle", "rb"))

In [17]:
print(updated_location)

{'County': ['Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County', 'Alameda County'

In [18]:
updated_location.keys()

dict_keys(['County', 'Road'])

In [19]:
df_loc = pd.DataFrame(updated_location)

In [20]:
df_loc.tail(10)

Unnamed: 0,County,Road
990,Alameda County,Lerida Court
991,Alameda County,
992,Alameda County,
993,Alameda County,Brisa Street
994,Alameda County,Hillflower Drive
995,Alameda County,Redwood Road
996,Alameda County,North Livermore Avenue
997,Alameda County,Arthur H. Breed Junior Freeway
998,Alameda County,Wright Brothers Avenue
999,Alameda County,Airway Boulevard


In [21]:
df_loc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   County  1000 non-null   object
 1   Road    986 non-null    object
dtypes: object(2)
memory usage: 15.8+ KB


In [22]:
# my test code
x = 0
for i in df_loc['Road']:
    if i == None:
        print(i, x)
    x+=1

None 284
None 288
None 289
None 395
None 571
None 573
None 575
None 644
None 923
None 935
None 959
None 987
None 991
None 992


In [23]:
# update the original dataframe with new features

for i in updated_location.keys():
    df[i] = updated_location[i]
    
df = df.sample(axis=0, frac=1)

df.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target,County,Road
813,3.7262,36.0,5.014235,0.964413,1002.0,3.565836,37.61,-122.03,1.649,Alameda County,Utica Street
611,3.851,52.0,5.269091,1.012727,1338.0,2.432727,37.74,-122.15,1.835,Alameda County,Hollister Court
403,7.8864,52.0,6.690972,0.96875,705.0,2.447917,37.9,-122.26,3.573,Alameda County,Wildcat Canyon Road
349,2.3281,44.0,4.795276,0.976378,763.0,3.003937,37.75,-122.17,0.888,Alameda County,Plymouth Street
49,1.775,40.0,2.6875,1.065341,700.0,1.988636,37.82,-122.27,1.125,Alameda County,Grove Shafter Freeway
264,2.3125,44.0,4.399491,1.053435,1240.0,3.155216,37.78,-122.21,1.028,Alameda County,Rosedale Avenue
589,3.7222,30.0,5.484155,1.100352,1444.0,2.542254,37.69,-122.1,1.958,Alameda County,Castro Valley Freeway
26,2.4597,49.0,4.728033,1.020921,607.0,2.539749,37.85,-122.28,0.938,Alameda County,66th Street
327,2.9063,42.0,4.590909,1.085859,646.0,3.262626,37.73,-122.18,0.8,Alameda County,105th Avenue
116,2.4234,28.0,3.02348,1.053582,2558.0,1.540036,37.83,-122.25,2.185,Alameda County,John Street


In [24]:
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'Target', 'County', 'Road'],
      dtype='object')

In [25]:
# drop unnecessary columns
df.drop(labels=['Latitude', 'Longitude'], axis=1, inplace=True)

In [26]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,County,Road
813,3.7262,36.0,5.014235,0.964413,1002.0,3.565836,1.649,Alameda County,Utica Street
611,3.851,52.0,5.269091,1.012727,1338.0,2.432727,1.835,Alameda County,Hollister Court
403,7.8864,52.0,6.690972,0.96875,705.0,2.447917,3.573,Alameda County,Wildcat Canyon Road
349,2.3281,44.0,4.795276,0.976378,763.0,3.003937,0.888,Alameda County,Plymouth Street
49,1.775,40.0,2.6875,1.065341,700.0,1.988636,1.125,Alameda County,Grove Shafter Freeway


In [27]:
df.isnull().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Target         0
County         0
Road          14
dtype: int64

### Using classification algorithm to fill the missing categorical values ###

In [28]:
missing_idx = []

for i in range(df.shape[0]):
    if df["Road"][i] is None:
        missing_idx.append(i)

In [29]:
missing_idx

[284, 288, 289, 395, 571, 573, 575, 644, 923, 935, 959, 987, 991, 992]

In [30]:
# apply logistic reggression to fill missing values

# Independent Parameters
missing_road_X_train = np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i not in missing_idx ])

# Dependent Parameters
missing_road_y_train = np.array([ df['Road'][i] for i in range(df.shape[0]) if i not in missing_idx ])

missing_road_X_test = np.array([ [df['MedInc'][i], df['AveRooms'][i], df['AveBedrms'][i]] for i in range(df.shape[0]) if i in missing_idx ])

In [31]:
from sklearn.linear_model import SGDClassifier

# model initialization
model_1 = SGDClassifier()

# model training
model_1.fit(missing_road_X_train, missing_road_y_train)

missing_road_y_pred = model_1.predict(missing_road_X_test)

In [35]:
np.unique(missing_road_y_pred)

array(['11th Avenue', 'Glendora Avenue', 'Greenly Drive', 'High Street',
       'Meadowlark Drive', 'Ridgewood Road'], dtype='<U45')

In [36]:
# add the model back to the dataframe

for n,i in enumerate(missing_idx):
    df['Road'][i] = missing_road_y_pred[n]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Road'][i] = missing_road_y_pred[n]


The warning message you received is a pandas warning called SettingWithCopyWarning. It is triggered when you are assigning a value to a slice of a DataFrame that could be a copy of the original DataFrame instead of a view. This warning is meant to alert you to a potential pitfall where the assignment may not have the desired effect.

In your code snippet, the line df['Road'][i] = missing_road_y_pred[n] is attempting to assign a value to a specific element in the 'Road' column of the DataFrame df. However, this line of code can trigger the SettingWithCopyWarning if df is a result of a previous slicing operation on another DataFrame.

To address this warning, you can use the .loc accessor to explicitly indicate that you want to modify the original DataFrame instead of working on a copy. Here's an updated version of your code snippet that uses the .loc accessor:

for n, i in enumerate(missing_idx):
    df.loc[i, 'Road'] = missing_road_y_pred[n]
    
By using df.loc[i, 'Road'], you are explicitly referencing the original DataFrame df and modifying the specific element at index i in the 'Road' column.

Using the .loc accessor ensures that the assignment is done on the original DataFrame and helps avoid the SettingWithCopyWarning.

However, please note that this warning can sometimes be triggered even when the assignment is not problematic. It is always a good practice to double-check your code and make sure the assignments are behaving as intended.

In [37]:
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Target        0
County        0
Road          0
dtype: int64

In [39]:
# Label Encording
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['Road'] = le.fit_transform(df['Road'])

In [40]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,County,Road
813,3.7262,36.0,5.014235,0.964413,1002.0,3.565836,1.649,Alameda County,351
611,3.851,52.0,5.269091,1.012727,1338.0,2.432727,1.835,Alameda County,177
403,7.8864,52.0,6.690972,0.96875,705.0,2.447917,3.573,Alameda County,373
349,2.3281,44.0,4.795276,0.976378,763.0,3.003937,0.888,Alameda County,281
49,1.775,40.0,2.6875,1.065341,700.0,1.988636,1.125,Alameda County,163


In [41]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['County'] = le.fit_transform(df['County'])

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,County,Road
813,3.7262,36.0,5.014235,0.964413,1002.0,3.565836,1.649,0,351
611,3.851,52.0,5.269091,1.012727,1338.0,2.432727,1.835,0,177
403,7.8864,52.0,6.690972,0.96875,705.0,2.447917,3.573,0,373
349,2.3281,44.0,4.795276,0.976378,763.0,3.003937,0.888,0,281
49,1.775,40.0,2.6875,1.065341,700.0,1.988636,1.125,0,163


In [44]:
df['County'].unique()

array([0, 1])

### Using Regression(Random Forest) model

In [53]:
# Dependent Values
y = df.iloc[:,-3].values

df.drop(labels=['Target'], axis=1, inplace = True)

X = df.iloc[:,:].values

In [54]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,County,Road
813,3.7262,36.0,5.014235,0.964413,1002.0,3.565836,0,351
611,3.851,52.0,5.269091,1.012727,1338.0,2.432727,0,177
403,7.8864,52.0,6.690972,0.96875,705.0,2.447917,0,373
349,2.3281,44.0,4.795276,0.976378,763.0,3.003937,0,281
49,1.775,40.0,2.6875,1.065341,700.0,1.988636,0,163


In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [59]:
# train_test_split? - Documentation

In [60]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

model.fit(X_train, y_train)

RandomForestRegressor()

In [61]:
# model prediction
y_pred = model.predict(X_test)

In [62]:
# model accuracy
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.7624565548116014

### Add our ownData

In [63]:
inp = np.array([3.7262,36.0,5.014235,0.964413,1002.0,3.565836,0,351])
inp.shape

(8,)

In [65]:
inp = inp.reshape((1,-1))

In [66]:
model.predict(inp)

array([1.64395])