## Real Estate Price Predictor

In [1]:
import pandas as pd

In [2]:
# Read the CSV file into Python
housing = pd.read_csv('data.csv')

In [3]:
# Prints the first 5 rows of housing data with column names
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Prints info about selected dataframe
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [5]:
# Selects the 'CHAS' column data from housing dataframe
housing['CHAS']

0      0
1      0
2      0
3      0
4      0
      ..
501    0
502    0
503    0
504    0
505    0
Name: CHAS, Length: 506, dtype: int64

In [6]:
# Prints the different values and their counts in the CHAS column
housing['CHAS'].value_counts()

CHAS
0    471
1     35
Name: count, dtype: int64

In [7]:
# Prints the statistical info about selected dataframe like - count, mean, min, max, etc..
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,501.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284341,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.705587,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.884,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.208,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.625,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [8]:
%matplotlib inline

In [9]:
import matplotlib.pyplot as plt

In [10]:
# Draws a histogram of 'AGE' data in housing dataframe. 
import numpy as np
# Rule of Thumb, Square Root Method to find out the bins value for given data
bins_value = int(np.sqrt(len(housing['AGE'])))

# housing['AGE'].hist(bins=bins_value, figsize=(20,15))

In [11]:
# housing.hist(bins=50, figsize=(20,15))

## Train-Test Splitting

In [12]:
# np.random.seed(42)
# # Splits the training and testing data
# def split_train_test(data, test_ratio):
#     shuffled = np.random.permutation(len(data)) # Takes a list of numbers from 0 to 506 and shuffles them 
#     test_set_size = int(len(data) * test_ratio) # Takes a ratio (e.g. 0.20) and finds that percentage of 506 (e.g 112)
#     test_indices = shuffled[:test_set_size] # A list of indices (multiple indexes) from 0 to test_set_size (e.g. 112) for test dataset
#     train_indices = shuffled[test_set_size:] # A list of indices (multiple indexes) from test_set_size (e.g. 112) to 506 for training dataset
#     return data.iloc[train_indices], data.iloc[test_indices] # Selects a list of items from given indices for training and testing data

In [13]:
# train_set, test_set = split_train_test(housing, 0.2)

In [14]:
# # We always try to keep more data in our training set and less data in our testing set
# print(f"Rows in Train Set: {len(train_set)}")
# print(f"Rows in Test Set: {len(test_set)}")

In [15]:
housing.shape

(506, 14)

In [16]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state = 42)

In [17]:
from sklearn.model_selection import StratifiedShuffleSplit
split_data = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split_data.split(housing, housing['CHAS']):
    strat_train_data = housing.loc[train_index]
    strat_test_data = housing.loc[test_index]

In [18]:
strat_train_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,404.0,404.0,404.0,404.0,404.0,399.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.279481,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609,22.509406
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.716784,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574,9.385531
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73,5.0
25%,0.086962,0.0,5.19,0.0,0.453,5.8765,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475,16.6
50%,0.286735,0.0,9.9,0.0,0.538,6.209,78.2,3.1222,5.0,337.0,19.0,390.955,11.57,21.15
75%,3.731923,12.5,18.1,0.0,0.631,6.6305,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025,25.0
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98,50.0


In [19]:
strat_test_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0
mean,3.655942,13.45098,10.312255,0.068627,0.541353,6.303353,66.733333,3.98846,8.813725,391.980392,18.385294,369.670196,12.104314,22.62549
std,10.400966,27.503241,6.761154,0.254068,0.111397,0.662996,27.772183,2.131247,8.614667,167.837379,2.310604,68.075774,6.759257,8.452344
min,0.00906,0.0,0.46,0.0,0.385,4.138,6.5,1.137,1.0,188.0,12.6,3.65,2.47,5.0
25%,0.057827,0.0,4.95,0.0,0.448,5.91275,45.85,2.22365,4.0,270.0,16.8,377.685,7.48,18.925
50%,0.17615,0.0,7.76,0.0,0.515,6.176,71.1,3.42295,5.0,307.0,19.15,393.74,10.565,21.5
75%,2.061955,0.0,18.1,0.0,0.61275,6.5395,93.5,5.609225,8.0,461.0,20.2,396.9,16.2675,25.0
max,88.9762,90.0,27.74,1.0,0.871,8.725,100.0,10.5857,24.0,711.0,22.0,396.9,37.97,50.0


In [20]:
housing = strat_train_data.copy()
housing.shape

(404, 14)

## Looking for Correlation

In [21]:
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.680857
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [22]:
# corr_matrix.plot()

In [23]:
# from pandas.plotting import scatter_matrix

In [24]:
# attributes = ['MEDV', 'RM', 'ZN', 'LSTAT']
# scatter_matrix(housing[attributes], figsize = (12,8))
# When attributes are compared against themselves, we get a histogram of their data

In [25]:
# housing.plot(kind='scatter', x='RM', y='MEDV', alpha=0.8)
# It draws the correlation between MEDV(value) and RM(no. of rooms)
# We can see that the data plotted has some descrepencies.

In [26]:
# Houses with 9 rooms are available at 50 and those with 5 rooms are also available at 50
# This is clearly not possible, pointing towards some form of data capping by the data collection team.

## Attribute Combinations

In [27]:
# housing['TAXRM'] = housing['TAX']/housing['RM']

In [28]:
# housing['TAXRM'].plot()

In [29]:
# housing['TAXRM'].describe()

In [30]:
# housing['TAX'].describe()

In [31]:
# housing['RM'].describe()

In [32]:
# To handle missing values in RM attribute, we have 3 options:
    # 1. Get rid of missing data entries
    # 2. Get rid of the attribute entirely
    # 3. Set some value for the missing data

In [33]:
# 1. We already have a small dataset, we can't remove data points
# 2. RM is an essential attribute so we can't remove it
# 3. We CAN set some value for missing data

In [34]:
# Approach 1:
# new_set1 = housing.dropna(subset=['RM'])
# new_set1.shape

In [35]:
# Approach 2:
# new_set2 = housing.drop('RM', axis=1)
# new_set2.shape

In [36]:
# Approach 3:
# median = housing['RM'].median()
# new_set3 = housing['RM'].fillna(median)
# new_set3.shape

In [37]:
housing

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
254,0.04819,80.0,3.64,0,0.392,6.108,32.0,9.2203,1,315,16.4,392.89,6.57,21.9
348,0.01501,80.0,2.01,0,0.435,6.635,29.7,8.3440,4,280,17.0,390.94,5.99,24.5
476,4.87141,0.0,18.10,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7
321,0.18159,0.0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.90,6.87,23.1
326,0.30347,0.0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.90,6.15,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,3.53501,0.0,19.58,1,0.871,6.152,82.6,1.7455,5,403,14.7,88.01,15.02,15.6
423,7.05042,0.0,18.10,0,0.614,6.103,85.1,2.0218,24,666,20.2,2.52,23.29,13.4
98,0.08187,0.0,2.89,0,0.445,7.820,36.9,3.4952,2,276,18.0,393.53,3.57,43.8
455,4.75237,0.0,18.10,0,0.713,6.525,86.5,2.4358,24,666,20.2,50.92,18.13,14.1


In [38]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")
imputer.fit(housing)

In [39]:
imputer.statistics_

array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
       6.20900e+00, 7.82000e+01, 3.12220e+00, 5.00000e+00, 3.37000e+02,
       1.90000e+01, 3.90955e+02, 1.15700e+01, 2.11500e+01])

In [40]:
X = imputer.transform(housing)

In [41]:
housing_newDF = pd.DataFrame(X, columns = housing.columns)

In [42]:
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,404.0,404.0,404.0,404.0,404.0,399.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.279481,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609,22.509406
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.716784,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574,9.385531
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73,5.0
25%,0.086962,0.0,5.19,0.0,0.453,5.8765,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475,16.6
50%,0.286735,0.0,9.9,0.0,0.538,6.209,78.2,3.1222,5.0,337.0,19.0,390.955,11.57,21.15
75%,3.731923,12.5,18.1,0.0,0.631,6.6305,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025,25.0
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98,50.0


In [43]:
housing_newDF.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.278609,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609,22.509406
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.712366,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574,9.385531
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73,5.0
25%,0.086962,0.0,5.19,0.0,0.453,5.87875,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475,16.6
50%,0.286735,0.0,9.9,0.0,0.538,6.209,78.2,3.1222,5.0,337.0,19.0,390.955,11.57,21.15
75%,3.731923,12.5,18.1,0.0,0.631,6.63,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025,25.0
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98,50.0


In [44]:
# As we can see the missing data has been filled in the RM field.

## SciKit-Learn Design


In [45]:
# 3 Types of objects:

# 1. Estimators - It estimates some parameters based on a dataset (e.g. Imputer)
    # Fit Method: Fits the dataset and calculates internal parameters
    # Transform Method: Takes input and returns output based on the learnings from fit()
    # Fit_Transform: It also has a convenience function which fits then transforms.
# 2. Transformers - 
# 3. Predictors - LinearRegression model is an example of predictor.
    # Fit: Fots into the dataset
    # Predict: Gives score function whichc will evaluate the prediction.

## Feature Scaling

In [46]:
# 2 Types:
# These are basically normalization techniques)

#     1. Standardization: (StandardScaler method)
            # z = (x - μ) / σ
            # Where:
            #     x is the original value of the feature.
            #     μ is the mean of the feature's values.
            #     σ is the standard deviation of the feature's values.

#     2. Min-Max Scaling(Normalization): (MinMaxScaler method)
            # x_scaled = (x - min) / (max - min)
            # Where:
            #     x is the original value of the feature.
            #     μ is the mean of the feature's values.
            #     σ is the standard deviation of the feature's values.

## Creating a Pipeline

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

myPipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    # Add as many as you want in the pipeline
    ('std_scalar', StandardScaler()),
])

In [48]:
housing

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
254,0.04819,80.0,3.64,0,0.392,6.108,32.0,9.2203,1,315,16.4,392.89,6.57,21.9
348,0.01501,80.0,2.01,0,0.435,6.635,29.7,8.3440,4,280,17.0,390.94,5.99,24.5
476,4.87141,0.0,18.10,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7
321,0.18159,0.0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.90,6.87,23.1
326,0.30347,0.0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.90,6.15,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,3.53501,0.0,19.58,1,0.871,6.152,82.6,1.7455,5,403,14.7,88.01,15.02,15.6
423,7.05042,0.0,18.10,0,0.614,6.103,85.1,2.0218,24,666,20.2,2.52,23.29,13.4
98,0.08187,0.0,2.89,0,0.445,7.820,36.9,3.4952,2,276,18.0,393.53,3.57,43.8
455,4.75237,0.0,18.10,0,0.713,6.525,86.5,2.4358,24,666,20.2,50.92,18.13,14.1


In [49]:
housing_num_tr = myPipeline.fit_transform(housing)

In [50]:
housing_num_tr.shape

(404, 14)

In [51]:
# housing = strat_train_data.drop('MEDV', axis=1)
housing_labels = strat_train_data['MEDV'].copy()

In [52]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(housing_num_tr, housing_labels)

In [53]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

some_data.shape

(5, 13)

In [54]:
some_labels.shape

(5,)

In [55]:
prepared_data = myPipeline.transform(some_data)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- MEDV


In [None]:
model.predict(prepared_data)