## Real Estate Price Predictor

In [17]:
import pandas as pd

In [18]:
housing = pd.read_csv("housing.csv")

In [19]:
#    1. CRIM      per capita crime rate by town
#    2. ZN        proportion of residential land zoned for lots over 
#                 25,000 sq.ft.
#    3. INDUS     proportion of non-retail business acres per town
#    4. CHAS      Charles River dummy variable (= 1 if tract bounds 
#                 river; 0 otherwise)
#    5. NOX       nitric oxides concentration (parts per 10 million)
#    6. RM        average number of rooms per dwelling
#    7. AGE       proportion of owner-occupied units built prior to 1940
#    8. DIS       weighted distances to five Boston employment centres
#    9. RAD       index of accessibility to radial highways
#    10. TAX      full-value property-tax rate per $10,000
#    11. PTRATIO  pupil-teacher ratio by town
#    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
#                 by town
#    13. LSTAT    % lower status of the population
#    14. MEDV     Median value of owner-occupied homes in $1000's

In [20]:
housing.head()

Unnamed: 0,0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 396.90 4.98 24.00
0,0.02731 0.00 7.070 0 0.4690 6.4210 78...
1,0.02729 0.00 7.070 0 0.4690 7.1850 61...
2,0.03237 0.00 2.180 0 0.4580 6.9980 45...
3,0.06905 0.00 2.180 0 0.4580 7.1470 54...
4,0.02985 0.00 2.180 0 0.4580 6.4300 58...


In [21]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 1 columns):
 #   Column                                                                                            Non-Null Count  Dtype 
---  ------                                                                                            --------------  ----- 
 0    0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00  505 non-null    object
dtypes: object(1)
memory usage: 4.1+ KB


In [22]:
housing.keys()

Index([' 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00'], dtype='object')

In [23]:
housing['CHAS'].value_counts()

In [24]:
housing.describe()

Unnamed: 0,0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 396.90 4.98 24.00
count,505
unique,505
top,0.02731 0.00 7.070 0 0.4690 6.4210 78...
freq,1


In [25]:
%matplotlib inline

In [26]:
import matplotlib.pyplot as plt

In [27]:
housing.hist(bins=50, figsize=(20,15))

# #Train-Test Splitting

In [28]:
#for learning purpose
import numpy as np
def split_train_test(datacopy, test_ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(datacopy))
    print(shuffled)
    test_set_size = int(len(datacopy) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return datacopy.iloc[train_indices], datacopy.iloc[test_indices]

In [29]:
train_set, test_set = split_train_test(housing,0.2)

[173 274 490  72 305  76 476 140 470 499 218   9 480  78 322 312 124 384
 195 447 271 278  30 500 420 444  79 318 210 496 172 453 374 358 494 153
   2 335 208  73 495 451 204  68  90 299  33  70 469   0  11 281  22 101
 268 484 441 290  84 245  63  55 229  18 410 209 394  82  39 455  46 412
 314 354  77 397 104 203 380 488  69 407 255 391 311 234 473 323  93 137
 176 336 131 502 364 132 331 411 503 349  86  75 442  15 325 388  19 324
 334  56 338 408 333 181 227 426 180  25 329 238 244 250 417 117  42 321
 346 182 155 280 126 448  31 113 148 429 400  57 194  24  17 297  66 211
 403  94 154 440  23 225 432 446   5 116  45  16 467 361   3 294 185  60
 110 320 265  29 262 477  26   7 457 108  37 157 471 118 114 175 192 272
 144 489 382 355 277 220 449 141 368  67 360 168 498 378 365 193 249 109
 433 145  92 152 222 409  83 248 165 163 199 231  74 310 454 253 119 284
 301 482 356 402 228 261 237 437 475  36 196 139 367 247 287 377  59 111
  89 266   6 456 347 340 158 150 177 396 184 317  1

In [30]:
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

Rows in train set: 404
Rows in test set: 101



In [31]:
from sklearn.model_selection import train_test_split
train_set, test_set =train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

Rows in train set: 404
Rows in test set: 101



In [32]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set=  housing.loc[train_index]
    strat_test_set=  housing.loc[test_index]

KeyError: 'CHAS'

In [None]:
strat_test_set.describe()

In [None]:
strat_test_set['CHAS'].value_counts ()

In [None]:
strat_train_set['CHAS'].value_counts ()

In [None]:
95/7

In [None]:
376/28 #-->> almost same

In [None]:
housing = strat_train_set.copy()

# Looking for Correlations

In [None]:
numeric_cols = housing.select_dtypes(include=np.number)
corr_matrix = numeric_cols.corr()

In [None]:
corr_matrix = housing.corr(numeric_only=True)

In [None]:
corr_matrix['MEDV'].sort_values(ascending=False)

In [None]:
#from pandas.plotting import scatter_matrix
#attributes = ["RM", "MEDV", "LSTAT", "ZN"]
#scatter_matrix(housing[attributes],figsize = (12,8))

In [None]:
## Standardize the dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
scaler.fit(train_set)

In [None]:
train_set = scaler.fit_transform(train_set)

In [None]:
test_set = scaler.transform(test_set)

In [None]:
import pickle
pickle.dump(scaler,open('scaling.pkl','wb'))

In [None]:
rf_regressor = RandomForestRegressor()
#rf_regressor.fit(train_set, test_set)

# Trying out attribute combinations

In [None]:
housing["TAXRM"]= housing["TAX"]/housing["RM"]

In [None]:
housing.head()

In [None]:
corr_matrix = housing.corr(numeric_only=True)

In [None]:
corr_matrix['MEDV'].sort_values(ascending=False)

In [None]:
housing.plot (kind="scatter", x="TAXRM", y="MEDV", alpha=0.8)

In [None]:
housing = strat_train_set.drop("MEDV", axis=1)
housing_num_labels = strat_train_set["MEDV"].copy()

# Missing Attributes

In [None]:
a=housing.dropna(subset=["RM"]) #Option 1
a.shape

In [None]:
housing.drop("RM", axis=1).shape #Option 2
#No RM column 

In [None]:
median = housing["RM"].median()

In [None]:
median

In [None]:
housing["RM"].fillna(median)

In [None]:
housing.shape

In [None]:
housing.describe()

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")
imputer.fit(housing)

In [None]:
imputer.statistics_

In [None]:
X = imputer.transform(housing)

In [None]:
housing_tr = pd.DataFrame(X, columns=housing.columns)

In [None]:
housing_tr.describe()

# Scikit-learn Design

Primarily, three types of objects
1. Estimators - It estimates some parameter based on a dataset. Eg. imputer. It has a fit method and transform method. Fit method - Fits the dataset and calculates internal parameters
2. Transformers - transform method takes input and returns output based on the learnings from fit(). It also has a convenience function called fit transform() which fits and then transforms.
3. Predictors - LinearRegression model is an example of predictor. fit() and predict() are two common functions. It also gives score() function which will evaluate the predictions.

# Feature Scaling

Primarily, two types of feature scaling methods:
1. Min-max scaling (Normalization)
(value - min)/ (max - min)
Sklearn provides a class called MinMaxScaler for this
2. Standardization (value - mean) /std
Sklearn provides a class called StandardScaler for this

# Creating a Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [None]:
housing_num_tr = my_pipeline.fit_transform(housing)

In [None]:
housing_num_tr.shape

# Selecting a desired model 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#model = LinearRegression()
#model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr,housing_num_labels)

In [None]:
some_data = housing.iloc[:5]

In [None]:
some_labels = housing_num_labels.iloc[:5]

In [None]:
prepared_data = my_pipeline.transform(some_data)

In [None]:
model.predict(prepared_data)

In [None]:
list(some_labels)

# Evaluating the Model

In [None]:
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_num_labels, housing_predictions)
rmse = np.sqrt(mse)

In [None]:
rmse

# Using better evaluation technique - Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr,housing_num_labels, scoring="neg_mean_squared_error")
rmse_scores = np.sqrt(-scores)

In [None]:
rmse_scores

In [None]:
def print_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())

In [None]:
print_scores(rmse_scores)

In [None]:
#Error cannot be zero, if it becomes zero then that is the case of overfitting

# Saving the Model

In [None]:
from joblib import dump, load
dump(model, 'House_Price_Prediction.joblib')

# Testing the model in test data

In [None]:
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict (X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_predictions, Y_test)

In [None]:
final_rmse

In [None]:
prepared_data[0]

# Using the Model

In [None]:
from joblib import dump, load
import numpy as np
model = load('House_Price_Prediction.joblib')

In [None]:
features = np.array([[-0.43942006,  3.20174225, -0.82244364, -0.27288841, -1.42262747,
       -0.23979304, -1.31238873,  2.81038006, -2.48705688, -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034]])
model.predict(features)

# New Data Prediction 

In [None]:
#housing.RM[0].reshape(1,-1)

In [None]:
#input_data = (housing.CHAS[0].reshape(1,-1))

In [None]:
#scaled_data = scaler.transform(input_data)

# Pickling the model file for Deployment

In [None]:
#import pickle

In [None]:
#pickle.dump(regression,open('regmodel.pkl','wb'))

In [None]:
#pickled_model=pickle.load(open('regmodel.pkl', 'rb'))