## Real Estate Price Predictor

In [1]:
import pandas as pd

In [2]:
housing = pd.read_csv("/Users/sanchari/Documents/ML Project/datacopy2.csv")

In [3]:
#    1. CRIM      per capita crime rate by town
#    2. ZN        proportion of residential land zoned for lots over 
#                 25,000 sq.ft.
#    3. INDUS     proportion of non-retail business acres per town
#    4. CHAS      Charles River dummy variable (= 1 if tract bounds 
#                 river; 0 otherwise)
#    5. NOX       nitric oxides concentration (parts per 10 million)
#    6. RM        average number of rooms per dwelling
#    7. AGE       proportion of owner-occupied units built prior to 1940
#    8. DIS       weighted distances to five Boston employment centres
#    9. RAD       index of accessibility to radial highways
#    10. TAX      full-value property-tax rate per $10,000
#    11. PTRATIO  pupil-teacher ratio by town
#    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
#                 by town
#    13. LSTAT    % lower status of the population
#    14. MEDV     Median value of owner-occupied homes in $1000's

In [4]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [40]:
housing.keys()

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [7]:
housing['CHAS'].value_counts()

0    471
1     35
Name: CHAS, dtype: int64

In [8]:
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,501.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.166534,5.93915,0.06917,0.554717,6.284341,68.574911,3.696234,4.332016,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,22.99097,2.75991,0.253994,0.115855,0.705587,28.148847,1.999684,1.417166,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.01,0.0,0.385,3.561,2.9,0.5857,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,3.6675,0.0,0.449,5.884,45.025,2.073715,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.001,6.41,0.0,0.538,6.208,77.5,3.1073,4.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,8.1,0.0,0.624,6.625,94.075,5.112625,5.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,95.0,9.9,1.0,0.871,8.78,100.0,9.2229,8.0,711.0,22.0,396.9,37.97,50.0


In [9]:
%matplotlib inline

In [10]:
#import matplotlib.pyplot as plt

In [11]:
#housing.hist(bins=50, figsize=(20,15))

# #Train-Test Splitting

In [12]:
#for learning purpose
import numpy as np
def split_train_test(datacopy, test_ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(datacopy))
    print(shuffled)
    test_set_size = int(len(datacopy) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return datacopy.iloc[train_indices], datacopy.iloc[test_indices]

In [13]:
train_set, test_set = split_train_test(housing,0.2)

[173 274 491  72 452  76 316 140 471 500 218   9 414  78 323 473 124 388
 195 448 271 278  30 501 421 474  79 454 210 497 172 320 375 362 467 153
   2 336 208  73 496 307 204  68  90 390  33  70 470   0  11 281  22 101
 268 485 442 290  84 245  63  55 229  18 351 209 395  82  39 456  46 481
 444 355  77 398 104 203 381 489  69 408 255 392 312 234 460 324  93 137
 176 417 131 346 365 132 371 412 436 411  86  75 477  15 332 423  19 325
 335  56 437 409 334 181 227 434 180  25 493 238 244 250 418 117  42 322
 347 182 155 280 126 329  31 113 148 432 338  57 194  24  17 298  66 211
 404  94 154 441  23 225 433 447   5 116  45  16 468 360   3 405 185  60
 110 321 265  29 262 478  26   7 492 108  37 157 472 118 114 175 192 272
 144 373 383 356 277 220 450 141 369  67 361 168 499 394 400 193 249 109
 420 145  92 152 222 304  83 248 165 163 199 231  74 311 455 253 119 284
 302 483 357 403 228 261 237 386 476  36 196 139 368 247 287 378  59 111
  89 266   6 364 503 341 158 150 177 397 184 318  1

In [14]:
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

Rows in train set: 405
Rows in test set: 101



In [15]:
from sklearn.model_selection import train_test_split
train_set, test_set =train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

Rows in train set: 404
Rows in test set: 102



In [16]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set=  housing.loc[train_index]
    strat_test_set=  housing.loc[test_index]

In [17]:
strat_test_set.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0
mean,3.655942,13.451441,5.90049,0.068627,0.541461,6.303353,66.733343,3.890425,4.107843,391.980392,18.385294,369.670196,12.104314,22.62549
std,10.400966,27.503013,2.560881,0.254068,0.111289,0.662996,27.772166,2.05334,1.597689,167.837379,2.310604,68.075774,6.759257,8.452344
min,0.00906,0.0,0.46,0.0,0.385,4.138,6.5,0.5857,1.0,188.0,12.6,3.65,2.47,5.0
25%,0.057827,0.0,3.92,0.0,0.448,5.91275,45.85,2.2008,4.0,270.0,16.8,377.685,7.48,18.925
50%,0.17615,0.001,6.2,0.0,0.515,6.176,71.1,3.3925,4.0,307.0,19.15,393.74,10.565,21.5
75%,2.061955,0.002,8.1,0.0,0.61275,6.5395,93.5,5.46895,5.0,461.0,20.2,396.9,16.2675,25.0
max,88.9762,90.0,9.69,1.0,0.871,8.725,100.0,9.2203,8.0,711.0,22.0,396.9,37.97,50.0


In [18]:
strat_test_set['CHAS'].value_counts ()

0    95
1     7
Name: CHAS, dtype: int64

In [19]:
strat_train_set['CHAS'].value_counts ()

0    376
1     28
Name: CHAS, dtype: int64

In [20]:
95/7

13.571428571428571

In [21]:
376/28 #-->> almost same

13.428571428571429

In [22]:
housing = strat_train_set.copy()

# Looking for Correlations

In [23]:
numeric_cols = housing.select_dtypes(include=np.number)
corr_matrix = numeric_cols.corr()

In [24]:
corr_matrix = housing.corr(numeric_only=True)

In [25]:
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.680857
B          0.361761
ZN         0.335620
DIS        0.252920
CHAS       0.205066
RAD        0.103903
INDUS     -0.336047
AGE       -0.364597
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [26]:
#from pandas.plotting import scatter_matrix
#attributes = ["RM", "MEDV", "LSTAT", "ZN"]
#scatter_matrix(housing[attributes],figsize = (12,8))

In [27]:
## Standardize the dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [54]:
scaler.fit(train_set)

In [55]:
train_set = scaler.fit_transform(train_set)

In [56]:
test_set = scaler.transform(test_set)

In [57]:
import pickle
pickle.dump(scaler,open('scaling.pkl','wb'))

In [58]:
rf_regressor = RandomForestRegressor()
#rf_regressor.fit(train_set, test_set)

# Trying out attribute combinations

In [None]:
housing["TAXRM"]= housing["TAX"]/housing["RM"]

In [None]:
housing.head()

In [None]:
corr_matrix = housing.corr(numeric_only=True)

In [None]:
corr_matrix['MEDV'].sort_values(ascending=False)

In [None]:
housing.plot (kind="scatter", x="TAXRM", y="MEDV", alpha=0.8)

In [None]:
housing = strat_train_set.drop("MEDV", axis=1)
housing_num_labels = strat_train_set["MEDV"].copy()

# Missing Attributes

In [None]:
a=housing.dropna(subset=["RM"]) #Option 1
a.shape

In [None]:
housing.drop("RM", axis=1).shape #Option 2
#No RM column 

In [None]:
median = housing["RM"].median()

In [None]:
median

In [None]:
housing["RM"].fillna(median)

In [None]:
housing.shape

In [None]:
housing.describe()

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")
imputer.fit(housing)

In [None]:
imputer.statistics_

In [None]:
X = imputer.transform(housing)

In [None]:
housing_tr = pd.DataFrame(X, columns=housing.columns)

In [None]:
housing_tr.describe()

# Scikit-learn Design

Primarily, three types of objects
1. Estimators - It estimates some parameter based on a dataset. Eg. imputer. It has a fit method and transform method. Fit method - Fits the dataset and calculates internal parameters
2. Transformers - transform method takes input and returns output based on the learnings from fit(). It also has a convenience function called fit transform() which fits and then transforms.
3. Predictors - LinearRegression model is an example of predictor. fit() and predict() are two common functions. It also gives score() function which will evaluate the predictions.

# Feature Scaling

Primarily, two types of feature scaling methods:
1. Min-max scaling (Normalization)
(value - min)/ (max - min)
Sklearn provides a class called MinMaxScaler for this
2. Standardization (value - mean) /std
Sklearn provides a class called StandardScaler for this

# Creating a Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [None]:
housing_num_tr = my_pipeline.fit_transform(housing)

In [None]:
housing_num_tr.shape

# Selecting a desired model 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#model = LinearRegression()
#model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr,housing_num_labels)

In [None]:
some_data = housing.iloc[:5]

In [None]:
some_labels = housing_num_labels.iloc[:5]

In [None]:
prepared_data = my_pipeline.transform(some_data)

In [None]:
model.predict(prepared_data)

In [None]:
list(some_labels)

# Evaluating the Model

In [None]:
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_num_labels, housing_predictions)
rmse = np.sqrt(mse)

In [None]:
rmse

# Using better evaluation technique - Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr,housing_num_labels, scoring="neg_mean_squared_error")
rmse_scores = np.sqrt(-scores)

In [None]:
rmse_scores

In [None]:
def print_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())

In [None]:
print_scores(rmse_scores)

In [None]:
#Error cannot be zero, if it becomes zero then that is the case of overfitting

# Saving the Model

In [None]:
from joblib import dump, load
dump(model, 'House_Price_Prediction.joblib')

# Testing the model in test data

In [None]:
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict (X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_predictions, Y_test)

In [None]:
final_rmse

In [None]:
prepared_data[0]

# Using the Model

In [None]:
from joblib import dump, load
import numpy as np
model = load('House_Price_Prediction.joblib')

In [None]:
features = np.array([[-0.43942006,  3.20174225, -0.82244364, -0.27288841, -1.42262747,
       -0.23979304, -1.31238873,  2.81038006, -2.48705688, -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034]])
model.predict(features)

# New Data Prediction 

In [80]:
#housing.RM[0].reshape(1,-1)

In [72]:
#input_data = (housing.CHAS[0].reshape(1,-1))

In [73]:
#scaled_data = scaler.transform(input_data)

# Pickling the model file for Deployment

In [74]:
#import pickle

In [78]:
#pickle.dump(regression,open('regmodel.pkl','wb'))

In [79]:
#pickled_model=pickle.load(open('regmodel.pkl', 'rb'))