In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input mount,
# then echo the paths one per line.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sb
import matplotlib.pyplot as plt

# Load the housing CSV from the Kaggle input mount; every later cell works on `housing`.
housing=pd.read_csv("/kaggle/input/boston-housing-dataset/HousingData.csv")

In [None]:
# Count the missing entries in each column of the dataset
housing.isna().sum()


In [None]:
# Print the statistics looked at for the columns that filling_null_values() fills:
# the mean for CRIM / INDUS / ZN and the median for AGE / LSTAT.
# Nothing is modified in this cell.
fill_candidates = (
    ("CRIM-", 'CRIM', 'mean'),
    ("INDUS--", 'INDUS', 'mean'),
    ("ZN--", 'ZN', 'mean'),
    ("Age", 'AGE', 'median'),
    ("LSTAT", 'LSTAT', 'median'),
)
for label, column, statistic in fill_candidates:
    print(label, getattr(housing[column], statistic)())

In [None]:
#defining and calling a function for dealing with null values
def filling_null_values(data=None, fill_values=None):
    """Replace missing values in selected columns with fixed fill values.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the module-level ``housing``.
    fill_values : dict, optional
        Mapping of column name -> replacement value. Defaults to the constants
        this notebook has always used for ZN, CRIM, INDUS, AGE and LSTAT.

    Returns
    -------
    None
        The frame is updated in place.

    Raises
    ------
    KeyError
        If a column named in ``fill_values`` is not present in ``data``.
    """
    if data is None:
        data = housing
    if fill_values is None:
        fill_values = {
            'ZN': 11.3,
            'CRIM': 3.56006,
            'INDUS': 10.98,
            'AGE': 76.7,
            'LSTAT': 11.32,
        }
    for column, value in fill_values.items():
        # Assign the filled column back instead of `data[column].fillna(..., inplace=True)`:
        # the chained in-place form works on a temporary under pandas Copy-on-Write
        # and would leave `data` unchanged.
        data[column] = data[column].fillna(value)
    
# Apply the fixed fill values to the module-level `housing` frame.
filling_null_values()

In [None]:
# Nulls that were not filled above are dropped instead.
# Rows with a missing CHAS are removed: the author treats CHAS as a two-valued column
# that cannot be derived from the other columns.

housing=housing.dropna(subset=['CHAS'],axis=0)
# Sanity check: the result is truthy if any NaN is still present anywhere in the frame
np.any(np.isnan(housing))


In [None]:
# Print the first 100 rows as plain text, then the per-column dtype / non-null summary
print(housing.head(100))
housing.info()

In [None]:
# A notebook cell only renders its last expression, so the CHAS frequency table
# has to be printed explicitly; as a bare expression it was computed and discarded.
print(housing['CHAS'].value_counts())
housing.describe()

In [None]:
# The `%matplotlib inline` magic is left commented out here.
import matplotlib.pyplot as plt
# Histogram grid for the frame's columns: 50 bins each, on a 20x15 inch figure
housing.hist(bins=50,figsize=(20,15))

In [None]:
# Compute the pairwise correlation matrix of the columns; it is plotted as a heat map in the next cell
corr=housing.corr()
corr

In [None]:
# Heat map of the correlation matrix with the coefficient written in each cell (10x10 inch figure)
plt.figure(figsize=(10,10))
sb.heatmap(corr,annot=True)

In [None]:
#defining a function for splitting the dataset
def split_train_test(data, test_ratio, seed=42):
    """Randomly partition ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to split; rows are selected positionally with ``.iloc``.
    test_ratio : float
        Fraction of rows (between 0 and 1 inclusive) that goes to the test part.
        The test size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    seed : int, optional
        Seed for the shuffle. The default of 42 reproduces the split this
        function has always produced.

    Returns
    -------
    tuple
        ``(train, test)`` selections of ``data``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        # A negative ratio would turn into a negative slice bound and silently
        # produce an inverted split.
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")
    # A private RandomState yields the same permutation as np.random.seed(seed)
    # followed by np.random.permutation, without resetting the global RNG state.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
from sklearn.model_selection import train_test_split
# Random split holding out 20% of the rows as the test set, with a fixed seed
train_set,test_set=train_test_split(housing,test_size=0.2,random_state=42)

In [None]:
# Report how many rows ended up in each part of the random split
print(f"rows in train set:{len(train_set)}\n rows in test set:{len(test_set)}\n")

In [None]:
#splitting the dataset using stratified shuffle split
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified split holding out 20% of the rows, stratified on CHAS.
# n_splits=1 means the splitter yields exactly one (train, test) index pair,
# so it is taken directly with next().
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing['CHAS']))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]
# From here on `housing` refers to a copy of the training part only.
housing = strat_train_set.copy()

In [None]:
# Correlation of every column with MEDV, sorted from highest to lowest coefficient
corr_matrix=housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

In [None]:
# Scatter-plot matrix of MEDV against three selected attributes to eyeball their relationships
from pandas.plotting import scatter_matrix
attributes=["MEDV","RM","ZN","LSTAT"]
scatter_matrix(housing[attributes],figsize=(12,8))

In [None]:
# Try a derived attribute: TAXRM = TAX / RM
# NOTE: `housing` is rebuilt from strat_train_set in a later cell, so TAXRM is only
# explored here and is not part of the data the models are trained on.
housing["TAXRM"]=housing['TAX']/housing['RM']
# Print the first rows including the new column, then recompute the correlations with MEDV
print(housing.head())
corr_matrix=housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

In [None]:
# Another scatter-plot matrix, this time including the derived TAXRM attribute
attributes=["MEDV","RM","ZN","TAXRM","LSTAT"]
scatter_matrix(housing[attributes],figsize=(12,8))

In [None]:
# Separate the stratified training set into the MEDV target and the remaining predictors.
housing_labels = strat_train_set["MEDV"].copy()
housing = strat_train_set.drop(columns="MEDV")

In [None]:
# making a pipeline 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Preprocessing pipeline: median imputation followed by standard scaling.
my_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
)

In [None]:
# Fit the pipeline on the training predictors and keep the transformed result for model training
housing_num_tr=my_pipeline.fit_transform(housing)
housing_num_tr.shape

In [None]:
# defining some functions

In [None]:
def pred_some_values(n_rows=5):
    """Spot-check the current model on the first rows of the training data.

    Reads the module-level ``housing``, ``housing_labels``, ``my_pipeline`` and
    ``model``. The selected rows are passed through ``my_pipeline.transform``
    before being handed to ``model.predict``.

    Parameters
    ----------
    n_rows : int, optional
        Number of leading rows to check (default 5).

    Returns
    -------
    tuple
        ``(predictions, actual)`` where ``predictions`` is whatever
        ``model.predict`` returns and ``actual`` is a list of the matching labels.
    """
    some_data = housing.iloc[:n_rows]
    some_labels = housing_labels.iloc[:n_rows]
    prepared_data = my_pipeline.transform(some_data)
    predictions = model.predict(prepared_data)
    actual = list(some_labels)
    # Bare expressions inside a function are never displayed, so the comparison
    # is printed explicitly and also returned to the caller.
    print("predicted :", predictions)
    print("actual    :", actual)
    return predictions, actual

In [None]:
def model_evaluation_on_train_set():
    """Evaluate the current ``model`` on the prepared training data.

    Reads the module-level ``model``, ``housing_num_tr`` and ``housing_labels``.
    Computes the RMSE of the model's predictions on the training data, then
    10-fold cross-validated RMSE scores (``neg_mean_squared_error`` scores are
    negated before the square root).

    Side effects
    ------------
    Sets the globals ``rmse_scores`` (array of per-fold RMSE values) and
    ``mean_squared_error`` (the scikit-learn metric function).

    Returns
    -------
    tuple
        ``(train_rmse, rmse_scores)``.
    """
    global rmse_scores
    # Exported as a global because code outside this function (for example
    # model_prediction_test_set) may refer to mean_squared_error by its bare name.
    global mean_squared_error
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_score

    # RMSE on the data the model was trained on
    housing_predictions = model.predict(housing_num_tr)
    train_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))

    # Cross-validation gives a less optimistic estimate than the training RMSE
    fold_scores = cross_val_score(
        model,
        housing_num_tr,
        housing_labels,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    rmse_scores = np.sqrt(-fold_scores)
    return train_rmse, rmse_scores

In [None]:
def print_scores(scores):
    """Print the scores, their mean and their standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods. Each printed
    line is followed by a line containing a single space.
    """
    report = (
        ("scores :", lambda s: s),
        ("MEAN :", lambda s: s.mean()),
        ("STANDARD DEVIATION :", lambda s: s.std()),
    )
    for heading, compute in report:
        # Each statistic is computed only when its line is about to be printed
        print(heading, compute(scores))
        print(" ")

In [None]:
def model_prediction_test_set():
    """Run the current ``model`` on the stratified test set.

    Reads the module-level ``strat_test_set``, ``my_pipeline`` and ``model``.
    The test predictors are passed through ``my_pipeline.transform`` (the
    pipeline is not re-fitted here) before prediction.

    Side effects
    ------------
    Sets the globals ``x_test`` (test predictors), ``y_test`` (MEDV labels)
    and ``final_predictions`` (model output).

    Returns
    -------
    The RMSE between ``y_test`` and ``final_predictions``.
    """
    global final_predictions
    global x_test
    global y_test
    # Imported here so this function does not depend on another function
    # having already placed mean_squared_error in the global namespace.
    from sklearn.metrics import mean_squared_error

    x_test = strat_test_set.drop("MEDV", axis=1)
    y_test = strat_test_set["MEDV"].copy()

    x_test_prepared = my_pipeline.transform(x_test)
    final_predictions = model.predict(x_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    return final_rmse

In [None]:
def show_result():
    """Print actual and predicted values side by side.

    Reads the module-level ``final_predictions`` and ``y_test`` and prints a
    two-column array: actual values first, predictions second.
    """
    predicted = np.array(final_predictions)
    actual = np.array(y_test)
    # Turn both sequences into single-column arrays so they can be joined column-wise
    actual_column = actual.reshape(len(actual), 1)
    predicted_column = predicted.reshape(len(predicted), 1)
    side_by_side = np.concatenate((actual_column, predicted_column), axis=1)
    print(side_by_side)

In [None]:
def model_selection():
    """Train, cross-validate and test three regressors in turn.

    For a decision tree, a linear regression and a random forest (in that
    order) this function:

    1. binds the estimator to the global ``model`` and fits it on
       ``housing_num_tr`` / ``housing_labels``;
    2. calls ``pred_some_values()`` as a quick prediction check;
    3. calls ``model_evaluation_on_train_set()`` and prints the resulting
       global ``rmse_scores`` with ``print_scores``;
    4. calls ``model_prediction_test_set()`` and prints actual vs. predicted
       test values with ``show_result()``.

    Side effects
    ------------
    Rebinds the global ``model``; after the call it is the fitted random forest.
    """
    global model
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor

    # (banner, estimator) pairs. The tree-based models are seeded so repeated
    # runs of the notebook give the same scores.
    candidates = (
        ("              ****    current model is DECISION TREE MODEL   ****",
         DecisionTreeRegressor(random_state=42)),
        ("       ****   current model is LINEAR REGRESSION MODEL  ****",
         LinearRegression()),
        ("        **** current model is RANDOM FOREST MODEL ****",
         RandomForestRegressor(random_state=42)),
    )

    for banner, estimator in candidates:
        print(banner)
        model = estimator
        model.fit(housing_num_tr, housing_labels)
        # quick check that the model produces predictions
        pred_some_values()
        # cross-validated RMSE on the training set (sets the global rmse_scores)
        model_evaluation_on_train_set()
        print(" ")
        print(" loss function scores for RMSE ERROR")
        print(" ")
        print_scores(rmse_scores)
        # predictions on the held-out test set
        model_prediction_test_set()
        print(" ")
        print(" Comparision between orignial values and predicted values")
        print(" ")
        show_result()
        print(" ")
        print(" ")
        print(" ")
    
    



In [None]:
# Fit, evaluate and test each candidate model in turn, printing the scores for each one
model_selection()