## Boston House Price Prediction

In [None]:
# We have data from XYZ Company and need to build a model that predicts the price of a property from the given features.

# This is clearly a supervised learning regression task.

# A typical performance measure for a regression task is RMSE, i.e. root mean squared error, so we will use
# it as the performance measure here.

In [None]:
import pandas as pd

In [None]:
# Location of the raw CSV. NOTE(review): this is a machine-specific absolute path; adjust it before running elsewhere.
DATA_PATH = r'C:\Users\chhay\OneDrive\Desktop\Housing Data.csv'
housing = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded data.
housing.head()

In [None]:
# Column labels of the frame.
housing.columns

In [None]:
# Summary of the frame: column dtypes and non-null counts.
housing.info()

In [None]:
import numpy as np

In [None]:
# Report the share of missing values in each column as a percentage.
for col in housing.columns:
    # isnull().mean() is a fraction between 0 and 1; scale it by 100 so the printed '%' sign is correct.
    missing_pct = housing[col].isnull().mean() * 100
    print('{} , {}%'.format(col, missing_pct))

In [None]:
# Number of missing entries in each column.
housing.isna().sum()

In [None]:
# How often each distinct CHAS value occurs in the full data.
housing['CHAS'].value_counts()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
housing.describe()

In [None]:
%matplotlib inline     

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One histogram per numeric column, 70 bins each, drawn on a 20x15 figure.
housing.hist(bins=70 , figsize=(20,15))

## Train-Test splitting

In [None]:
import numpy as np

In [None]:
def split_train_test(data , test_ratio, seed=42):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; they are selected positionally with ``.iloc``.
    test_ratio : float
        Fraction of the rows that goes to the test set. The resulting size is truncated to an int.
    seed : int, optional
        Seed for the shuffle. The default (42) reproduces the split produced by the
        previously hard-coded ``np.random.seed(42)``.

    Returns
    -------
    tuple
        ``(train_set, test_set)``.
    """
    # Without a fixed seed every run would shuffle differently. A private RandomState yields the
    # same permutation as np.random.seed(seed) + np.random.permutation, but it does not reset
    # NumPy's global random state as a side effect.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices] , data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows with the hand-written splitter.
train_set, test_set = split_train_test(data=housing, test_ratio=0.2)

In [None]:
# Report the sizes of the two sets produced by the manual split.
print(f" no of rows in training set is : {len(train_set)}\n no. of rows in testing set is : {len(test_set)}")

In [None]:
# The method above was just for learning and understanding; the easier and better way is to use scikit-learn:

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so the split is repeatable.
train_set , test_set = train_test_split(housing, test_size=0.2 , random_state=42)

In [None]:
# Report the sizes of the two sets produced by train_test_split.
print(f" no of rows in training set is : {len(train_set)}\n no. of rows in testing set is : {len(test_set)}")

In [None]:
# There is a good chance that the split above does not represent the whole population for a particular feature
# (or for all of them). For example, CHAS has 471 values of 0 and only 36 values of 1, and we do not want the
# training set to end up with only 0s and no 1s. In a situation like this we use STRATIFIED SAMPLING.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# A single stratified shuffle split that holds out 20% of the rows; random_state makes it repeatable.
split = StratifiedShuffleSplit(n_splits=1 , test_size = 0.2, random_state=42)

In [None]:
# Stratify on CHAS so both sets keep its 0/1 proportions.
# split.split yields positional row indices, so rows are selected with .iloc rather than the
# label-based .loc (the two only coincide while housing still has its default 0..n-1 index).
for train_index , test_index in split.split(housing , housing['CHAS']):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    

In [None]:
# CHAS distribution in the stratified test set.
strat_test_set['CHAS'].value_counts()

In [None]:
# CHAS distribution in the stratified training set, for comparison with the test set.
strat_train_set['CHAS'].value_counts()

## Looking for correlations

In [None]:
 #making a correlation matrix

In [None]:
# Pairwise correlation between the columns of housing.
corr_matrix = housing.corr()

In [None]:
# Correlation of every column with MEDV, largest value first.
corr_matrix['MEDV'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for MEDV and three of the features.
# The Boston housing data names the last column "LSTAT"; "STAT" is not one of its columns, so
# selecting it would raise a KeyError. NOTE(review): confirm the spelling against housing.columns.
attributes = ["MEDV", "ZN", "RM", "LSTAT"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# Scatter plot of MEDV against RM; alpha makes overlapping points slightly transparent.
housing.plot(kind = 'scatter' , x='RM', y ='MEDV', alpha=0.8)


In [None]:
import seaborn as sns

In [None]:
# Same RM-vs-MEDV relationship, drawn by seaborn with a fitted regression line.
sns.regplot(x='RM', y ='MEDV',data=housing)

In [None]:
 # Scatter plot of MEDV against CRIM.
 housing.plot(kind = 'scatter' , x='CRIM', y ='MEDV', alpha=0.8)

 ## Trying out attribute Combinations

In [None]:
 #Creating a new attribute: tax per room

In [None]:
# New column TAXRM: TAX divided by RM. This adds the column to housing in place.
housing["TAXRM"] = housing['TAX']/housing['RM']

In [None]:
# Preview the frame again to see the added TAXRM column.
housing.head()

In [None]:
# Recompute the correlations now that TAXRM exists, then list them against MEDV in ascending order.
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=True)

In [None]:
 # Scatter plot of MEDV against the derived TAXRM column.
 housing.plot(kind = 'scatter' , x='TAXRM', y ='MEDV', alpha=0.8)

In [None]:
# Separate predictors from the target. From here on, `housing` holds only the features of the
# stratified training set and `housing_labels` holds the matching MEDV values.
housing = strat_train_set.drop(columns="MEDV")
housing_labels = strat_train_set["MEDV"].copy()

## Missing Values

In [None]:
# Here I purposely deleted a few values from the RM column; let's deal with them.

In [None]:
#To take care of the missing data points we can either:
      #1. delete the affected data points (rows)
      #2. delete the whole attribute (column)
      #3. set the missing values to some value (0, mean, median)
    

In [None]:
# Option 1: drop the rows whose RM is missing. dropna returns a new frame (inplace is not set),
# so housing itself is unchanged.
a1 = housing.dropna(subset=["RM"])
a1.shape

In [None]:
# Option 2: drop the whole RM column. The result is a new frame; the original housing dataframe is unchanged.
housing.drop(columns="RM")

In [None]:
# Option 3, step 1: compute the median of RM to use as the fill value. housing is not modified here.
median = housing["RM"].median()

In [None]:
# Show the computed RM median.
median

In [None]:
# Option 3, step 2: RM with its missing values replaced by the median.
# The result is only displayed, not assigned, so housing still contains the missing values.
housing["RM"].fillna(median)

In [None]:
from sklearn.impute import SimpleImputer
# SimpleImputer fills the missing values of every column with a per-column statistic.
# The strategy is 'median' to match the median-based approach used throughout this notebook and in
# my_pipeline; it was previously 'most_frequent', which fills with the mode instead.
imputer = SimpleImputer(strategy='median')
imputer.fit(housing)

In [None]:
# Per-column fill values that imputer.fit learned from housing (one entry per column).
imputer.statistics_

In [None]:
# Apply the fitted imputer to fill in the missing values.
# A pipeline is created further down so that this filling happens automatically.
X= imputer.transform(housing)

In [None]:
# Wrap the imputed values back into a DataFrame. Passing index=housing.index keeps the original
# row labels; without it the frame gets a fresh 0..n-1 index and no longer lines up by label with
# housing_labels.
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)

In [None]:
# Descriptive statistics of the imputed frame; the counts show whether any values are still missing.
housing_tr.describe()

## Scikit-learn Design 


 In scikit-learn there are primarily three kinds of objects:
 1. Estimators: estimate some parameters based on a dataset, e.g. an imputer. The imputer has a fit() method and a transform() method.
                 The fit() method fits the dataset and calculates the internal parameters.
 2. Transformers: the transform() method takes the input and returns the output based on what was learned in fit().
                  There is also a convenience method, fit_transform(), which fits and then transforms.
 3. Predictors: e.g. LinearRegression. fit() and predict() are the two common methods. A predictor also provides a score() method, which
                evaluates the predictions.

## Feature Scaling 

It is primarily done in two ways:
1. Min-max scaling (normalization): (value - min) / (max - min), which lies between 0 and 1.
                                     scikit-learn provides a class called MinMaxScaler for this.
2. Standardization: (value - mean) / std.
                         scikit-learn provides a class called StandardScaler for this.

In [None]:
# Now we create a pipeline. A pipeline is not just about importing functions from outside; it also means that changes
# can be made easily whenever required. Pipelines are used for "automation".

 ## Creating a Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Preprocessing pipeline: median imputation first, then standardization.
my_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ]
)

In [None]:
# Fit the pipeline on the raw training features so that its own imputer learns the medians and its
# scaler learns mean/std from the original data. It was previously fitted on housing_tr, which had
# already been imputed outside the pipeline.
housing_num_tr = my_pipeline.fit_transform(housing)

In [None]:
# Show the preprocessed training features produced by the pipeline.
housing_num_tr

In [None]:
# Dimensions of the preprocessed training features.
housing_num_tr.shape

## Selecting a desired model for XYZ company

In [None]:
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Candidate models; the alternatives are kept commented out so they can be swapped in and compared.
#model= LinearRegression()
#model = DecisionTreeRegressor()
# A random forest is randomised; fixing random_state makes the fitted model and every score
# computed from it reproducible across runs.
model = RandomForestRegressor(random_state=42)
model.fit(housing_num_tr , housing_labels)

In [None]:
# First five rows of the training features, used for a quick sanity check of the model.
some_data = housing.head(5)

In [None]:
# The labels that belong to those first five rows.
some_labels = housing_labels.head(5)

In [None]:
# Run the sample rows through the already-fitted pipeline (transform only, no re-fitting).
prepared_data = my_pipeline.transform(some_data)

In [None]:
# Model predictions for the five sample rows.
model.predict(prepared_data)

In [None]:
# Actual labels of the five sample rows, to compare with the predictions.
some_labels

In [None]:
# The same labels as a plain list, without the index.
list(some_labels)

## Evaluating the model

In [None]:
from sklearn.metrics import mean_squared_error
# Score the fitted model on the same data it was trained on (training error), first as MSE and then as RMSE.
housing_prediction = model.predict(housing_num_tr)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_prediction)
lin_rmse = np.sqrt(lin_mse)

In [None]:
# Mean squared error on the training data.
lin_mse

In [None]:
# Root mean squared error on the training data.
lin_rmse

## Model Decision

With linear regression, lin_mse was about 23, which is a large error.

So we go back up and replace linear regression with a decision tree regressor.
      Output: lin_mse came out as 0, i.e. zero error on the training data. This is a case of OVERFITTING.
      
So now we will use a better evaluation technique, which is CROSS-VALIDATION.
      
      

## Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the training data. The scorer returns the negated MSE,
# so the sign is flipped before taking the square root to obtain one RMSE per fold.
scores = cross_val_score(
    estimator=model,
    X=housing_num_tr,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(-scores)

In [None]:
# RMSE of each cross-validation fold.
rmse_scores

In [None]:
# With the decision tree regressor, the RMSE scores after cross-validation are around 3.
# So I will go back up, try linear regression again, and check its RMSE.

In [None]:
# With linear regression the RMSE scores are around 4 and 5.
# Therefore the decision tree regressor seems to be the better model after cross-validation.

In [None]:
def print_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a NumPy array).
    """
    summary = (
        ("scores : ", scores),
        ("mean : ", scores.mean()),
        ("standard Deviation : ", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
# Summarise the cross-validation RMSE values.
print_scores(rmse_scores)

In [None]:
# Now we will also try another model, random forest, again by going back up and swapping the model.

## Saving the model

In [None]:
from joblib import dump,load
# Persist the trained estimator to disk.
# NOTE(review): only `model` is saved here; my_pipeline is not, so it has to be available
# separately wherever raw features need to be preprocessed before prediction.
dump(model , 'XYZ.joblib')

## Testing the model on test data

In [None]:
# Final check on the stratified test set. The pipeline is only applied here (transform), never
# re-fitted, and the RMSE is computed from the model's predictions on the prepared features.
Y_test = strat_test_set["MEDV"].copy()
X_test = strat_test_set.drop(columns="MEDV")
X_test_prepared = my_pipeline.transform(X_test)
final_prediction = model.predict(X_test_prepared)
final_mse = mean_squared_error(y_true=Y_test, y_pred=final_prediction)
final_rmse = np.sqrt(final_mse)

In [None]:
# RMSE on the test set.
final_rmse

In [None]:
# Now, if we want to predict for a given set of features, we can do:

In [None]:
# A single sample with 13 feature values, wrapped in an outer list so the array is 2-D.
# NOTE(review): the values look like already-preprocessed features - confirm.
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept because a later cell reads it.
input = np.array([[-0.5 , 3 , -1 , -0.2 , -1.4 , -0.2 , -1.3 , 2.6 , -1 , -0.5, -0.9 , 0.4 , -0.86]])

In [None]:
# Predict for the hand-written sample.
model.predict(input)

## Using the model

In [None]:
from joblib import dump,load
import numpy as np
# Reload the estimator from the joblib file.
model = load('XYZ.joblib')


# A single sample with 13 feature values, wrapped in an outer list so the array is 2-D.
# It is named `features` rather than `input` so that Python's built-in input() is not shadowed.
features = np.array([[-0.5 , 3 , -1 , -0.2 , -1.4 , -0.4 , -1.3 , 2.6 , -1 , -0.5, -0.9 , 0.4 , -0.70]])
model.predict(features)

## Model results

## Pickling The model File :
“Pickling” is the process whereby a Python object hierarchy is converted into a byte stream. Pickling is needed here because the model is going to be deployed using Docker and GitHub.

In [None]:
import pickle

In [None]:
# Serialize the trained model to model.pkl. The with-block guarantees the file handle is
# flushed and closed, even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
#pickled_model=pickle.load(open('model.pkl', 'rb')) #when pickled file needs to be loaded