# Dragon Real Estate-Price Predictor

## Reading Data

In [1]:
import sklearn
import pandas as pd

In [2]:
housing = pd.read_csv("data.csv")

In [3]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
housing.shape

(506, 14)

In [5]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       500 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [6]:
housing.CHAS

0      0
1      0
2      0
3      0
4      0
      ..
501    0
502    0
503    0
504    0
505    0
Name: CHAS, Length: 506, dtype: int64

In [7]:
housing.CHAS.value_counts()

0    471
1     35
Name: CHAS, dtype: int64

In [8]:
# housing.describe()

In [9]:
%matplotlib inline

In [10]:
# For plotting histogram
# import matplotlib.pyplot as plt
# housing.hist(bins=50, figsize=(25,25))
# plt.show()

## Train-Test Splitting

In [11]:
import numpy as np

In [12]:
# One way of splitting training and testing data
# It is just for learning purpose
# def split_train_test(data, test_ratio):
#     np.random.seed(42) # Used to shuffle the data only once.
#     shuffled = np.random.permutation(len(data))
#     test_set_size = int(len(data)*test_ratio)
#     test_indices = shuffled[:test_set_size]
#     train_indices = shuffled[test_set_size:]
#     return data.iloc[train_indices], data.iloc[test_indices]

In [13]:
# test_ratio = 20/100

In [14]:
# train_set, test_set = split_train_test(housing, test_ratio)

In [15]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in train set: {len(test_set)}")

Rows in train set: 404
Rows in train set: 102


***
***
This split with sklearn will work just fine.
***
But..!
***
But...!
***
But....!
***
There is a problem. For example just looke at "housing.CHAS.value_counts()" this is giving counts of different values in CHAS. CHAS is having 471 zeros(0s) and 35 ones(1s).
What if all the ones(1s) go inside test_set? Then our machine will not be trained for ones(1s).
So to avoid this we have to follow below method.

In [16]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing.CHAS):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [17]:
strat_test_set.CHAS.value_counts()

0    95
1     7
Name: CHAS, dtype: int64

In [18]:
strat_train_set.CHAS.value_counts()

0    376
1     28
Name: CHAS, dtype: int64

In [19]:
housing = strat_train_set.copy()

In [20]:
# 95/7, 376/28

## Looking for Correlations

In [21]:
corr_matrix = housing.corr()
corr_matrix.MEDV.sort_values(ascending=False)

MEDV       1.000000
RM         0.679506
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [22]:
from pandas.plotting import scatter_matrix
attributes = ["MEDV", "RM","ZN", "LSTAT"]
# scatter_matrix(housing[attributes], figsize=(24,16))

In [23]:
# housing.plot(kind="scatter", x="RM", y="MEDV", alpha=0.8)

## Trying Out Attribute Combinations

In [24]:
# housing['TAXRM'] = housing.TAX/housing.RM

In [25]:
# housing.head()

In [26]:
corr_matrix = housing.corr()
corr_matrix.MEDV.sort_values(ascending=False)

MEDV       1.000000
RM         0.679506
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [27]:
# housing.plot(kind="scatter", x="TAXRM", y="MEDV", alpha=0.8)

In [28]:
housing = strat_train_set.drop("MEDV",axis=1)
housing_labels = strat_train_set["MEDV"].copy()

## Missing Attributes

### To take care of missing attributes, you have three options:
1. Get rid of the missing data points:
>Removing entire row. This we can do wherever the missing data points are 2 or 3 or less than 0.5% of the data.
2. Get rid of the whole attribute
>Removing entire column. If correlation coefficient is very less or near to zero, then we can remove entire column.
1. Set the value to some value(0, mean or median)

#### ***Please note that have to do these operations with training set and NOT with entire dataframe.***

In [29]:
# Option#1
a = housing.dropna(subset=["RM"])
a.shape
# Note that the original hosuing dataframe will remain unchanged

(398, 13)

In [30]:
# Option#2
b = housing.drop("RM", axis=1)
b.shape
# Note that the original hosuing dataframe will remain unchanged

(404, 12)

In [31]:
# Option#3-1
median = housing.RM.median()
print(median)
housing.RM.fillna(median)
# Note that the original hosuing dataframe will remain unchanged

6.2135


254    6.1080
348    6.6350
476    6.4840
321    6.3760
326    6.3120
        ...  
155    6.1520
423    6.1030
98     7.8200
455    6.5250
216    6.2135
Name: RM, Length: 404, dtype: float64

In [32]:
# housing.describe()

In [33]:
# Option#3-2 another way of filling empty cells or missing data with median
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

SimpleImputer(strategy='median')

In [34]:
imputer.statistics_.shape

(13,)

In [35]:
X = imputer.transform(housing)

In [36]:
housing_tr = pd.DataFrame(X, columns=housing.columns)

In [37]:
# housing_tr.describe()

## Scikit-learn Design

#### Primarily, there are three types of objects:
1. Estimators: It estimates some parameters based on a dataset. Eg. imputer.
>It has a fit method and transform method.<br>
>Fit method - Fits the dataset and calculates internal parameters.

2. Transformers:
>Transform method takes input and returns output based on the learnings from fit().<br>
>It also has a convenience function called fit_transform() which fits and then transforms.

3. Predictors:
>LinearRegression model is an example of predictor. fit() and predict() are two common functions.<br>
>It also gives score() function which will evaluate the predictions.

## Feature Scaling

#### Primarily, two types of feature scaling methods:
1. Min-max scaling (Normalization):
>Formula is ((value-min)/(max-min))<br>
>For this Sklearn provides a class MinMaxScaler for this.
2. Standardization(Z-score):
>Formula is ((value-mean)/std)<br>
>For this Sklearn provides a class Standard Scaler for this.

## Creating a Pipeline

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    # ..... add as many as you want in your pipeline
    ('std_scaler', StandardScaler()),
])

In [39]:
housing_num_tr = my_pipeline.fit_transform(housing)

In [40]:
housing_num_tr.shape

(404, 13)

## Selecting A Desired Model For Dragon Real Estates

In [41]:
from sklearn.linear_model import LinearRegression

fmodel = LinearRegression() #Failed Model
fmodel.fit(housing_num_tr, housing_labels) #Failed Model


from sklearn.tree import DecisionTreeRegressor

f1model = DecisionTreeRegressor() #Failed Model
f1model.fit(housing_num_tr, housing_labels) #Failed Model


from sklearn.ensemble import RandomForestRegressor

f2model = RandomForestRegressor()
f2model.fit(housing_num_tr, housing_labels)

model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)

RandomForestRegressor()

In [42]:
some_data = housing.iloc[:5]

In [43]:
some_labels = housing_labels[:5]

In [44]:
prepared_data = my_pipeline.transform(some_data)

In [45]:
fmodel.predict(prepared_data)

array([23.96407132, 27.27088316, 20.57533342, 25.06071996, 23.74779285])

In [46]:
f1model.predict(prepared_data)

array([21.9, 24.5, 16.7, 23.1, 23. ])

In [47]:
f2model.predict(prepared_data)

array([22.483, 25.234, 16.458, 23.149, 23.447])

In [48]:
model.predict(prepared_data)

array([22.483, 25.268, 16.626, 23.402, 23.569])

In [49]:
list(some_labels)

[21.9, 24.5, 16.7, 23.1, 23.0]

## Evaluating The Model

In [50]:
from sklearn.metrics import mean_squared_error

In [51]:
fhousing_predictions = fmodel.predict(housing_num_tr)
fmse = mean_squared_error(housing_labels, fhousing_predictions)
frmse = np.sqrt(fmse)

In [52]:
fmse #Not good model because of underfitting

23.367623681832963

In [53]:
frmse

4.834007000598258

In [54]:
f1housing_predictions = f1model.predict(housing_num_tr)
f1mse = mean_squared_error(housing_labels, f1housing_predictions)
f1rmse = np.sqrt(f1mse)

In [55]:
f1mse #Not good model because of overfitting

0.0

In [56]:
f1rmse

0.0

In [57]:
f2housing_predictions = f2model.predict(housing_num_tr)
f2mse = mean_squared_error(housing_labels, f2housing_predictions)
f2rmse = np.sqrt(f2mse)

In [58]:
f2rmse

1.2124652970434449

## Using Better Evaluation Technique - Cross Validaion

In [59]:
from sklearn.model_selection import cross_val_score

In [60]:
# Cross validation on DecisionTreeRegressor Model
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [61]:
rmse_scores

array([2.78979106, 2.85427192, 4.49316344, 2.64029538, 3.6593917 ,
       2.63548307, 4.91069985, 3.22687358, 3.52787389, 3.06156686])

In [62]:
# Cross validation on LinearRegression Model
fscores = cross_val_score(fmodel, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
frmse_scores = np.sqrt(-fscores)

In [63]:
frmse_scores

array([4.19313569, 4.28136187, 5.09372388, 3.84155417, 5.34978404,
       4.39564029, 7.45199139, 5.49758654, 4.15117909, 6.06963137])

In [64]:
# Cross validation on RandomForestRegressor Model
f2scores = cross_val_score(f2model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
f2rmse_scores = np.sqrt(-f2scores)

In [65]:
f2rmse_scores

array([2.96828678, 2.78830979, 4.58522299, 2.46404526, 3.59512869,
       2.59026043, 4.57882113, 3.36578778, 3.01495484, 2.98950455])

In [66]:
def print_score(scores):
    print("Scores:",scores)
    print("Mean:",scores.mean())
    print("Standard Deviation:",scores.std())

In [67]:
# Decision Tree Model Output
print_score(rmse_scores)

Scores: [2.78979106 2.85427192 4.49316344 2.64029538 3.6593917  2.63548307
 4.91069985 3.22687358 3.52787389 3.06156686]
Mean: 3.379941074730797
Standard Deviation: 0.7438517069667702


In [68]:
# Linear Regression Model Output
print_score(frmse_scores)

Scores: [4.19313569 4.28136187 5.09372388 3.84155417 5.34978404 4.39564029
 7.45199139 5.49758654 4.15117909 6.06963137]
Mean: 5.032558832604023
Standard Deviation: 1.0545004896752692


In [69]:
# Random Forest Regressor Model Output
print_score(f2rmse_scores)

Scores: [2.96828678 2.78830979 4.58522299 2.46404526 3.59512869 2.59026043
 4.57882113 3.36578778 3.01495484 2.98950455]
Mean: 3.2940322238994435
Standard Deviation: 0.7162627840666584


## Saving the Model

In [71]:
from joblib import dump, load
dump(model, 'Dragon.joblib')

['Dragon.joblib']

## Testing the Model on Test Data

In [74]:
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set.MEDV.copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [75]:
final_rmse

2.8716288487717256

In [77]:
dict(zip(Y_test, final_predictions.round(2)))

{16.5: 23.88,
 10.2: 11.78,
 30.1: 25.07,
 23.0: 21.3,
 14.4: 18.34,
 15.6: 15.01,
 19.4: 15.85,
 14.1: 14.84,
 30.3: 32.11,
 35.2: 41.53,
 23.1: 19.29,
 13.8: 11.64,
 25.0: 27.05,
 27.9: 25.63,
 19.5: 19.11,
 12.3: 10.77,
 32.2: 31.96,
 13.5: 14.3,
 23.8: 19.7,
 21.7: 21.23,
 19.2: 19.93,
 10.4: 17.24,
 23.2: 21.82,
 18.6: 18.63,
 28.5: 32.49,
 15.2: 15.91,
 32.0: 33.61,
 7.2: 9.92,
 34.6: 33.44,
 20.1: 24.58,
 20.6: 22.07,
 23.6: 31.37,
 13.1: 10.81,
 12.7: 11.33,
 43.1: 43.59,
 24.7: 24.48,
 22.2: 22.67,
 44.0: 43.84,
 28.1: 24.03,
 31.0: 29.55,
 23.4: 20.92,
 33.1: 33.03,
 41.7: 44.75,
 18.7: 20.16,
 19.9: 18.14,
 21.2: 21.07,
 13.6: 14.49,
 20.3: 20.95,
 17.8: 15.06,
 27.1: 24.89,
 31.5: 32.45,
 50.0: 46.63,
 29.1: 29.22,
 18.9: 20.01,
 20.4: 21.32,
 17.2: 18.88,
 36.2: 28.14,
 14.6: 14.62,
 33.2: 34.38,
 21.5: 20.59,
 37.3: 34.15,
 27.0: 26.24,
 22.0: 23.39,
 24.3: 21.4,
 19.8: 22.98,
 33.3: 35.06,
 7.0: 12.99,
 20.9: 20.11,
 21.1: 21.05,
 11.9: 21.0,
 11.7: 13.95,
 21.6: 23.26,


In [78]:
prepared_data[0]

array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24072966, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])

## Using Model

In [79]:
from joblib import dump, load
import numpy as np
model = load("Dragon.joblib")

features = np.array([[-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -15.24072966, -99.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -25.86091034]])
model.predict(features)

array([24.975])