In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Any results you write to the current directory are saved as output.

Below is a list of various variables which we have in our dataset

* crim - per capita crime rate by town
* zn - proportion of residential land zoned for lots over 25,000 sq.ft
* indus - proportion of non-retail business acres per town
* chas - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
* nox - nitric oxides concentration (parts per 10 million)
* rm - average number of rooms per dwelling
* age - proportion of owner-occupied units built prior to 1940
* dis - weighted distances to five Boston employment centres
* rad - index of accessibility to radial highways
* tax - full-value property-tax rate per USD 10,000
* ptratio - pupil-teacher ratio by town
* black - proportion of blacks by town
* lstat - percentage of lower status of the population
* medv - median value of owner-occupied homes in USD 1000’s

We would like to know the correlation between variables to decide the features for our model. If the value is close to 1, it means that there is a strong positive correlation between the two variables. When it is close to -1, the variables have a strong negative correlation.

In [None]:
# Load the Boston housing training data from the Kaggle input directory.
TRAIN_CSV_PATH = '/kaggle/input/boston-housepredict/boston_train.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Correlation: pairwise correlations between all columns, rounded to two
# decimals and drawn as an annotated heatmap on an explicitly created axes.
correlation_matrix = df_train.corr().round(2)
fig, ax = plt.subplots(figsize=(20, 15))
sns_plot = sns.heatmap(correlation_matrix, annot=True, ax=ax)

Observations:
We found that the rm variable has a strong positive correlation with our target medv (0.69). On the other hand, lstat has a strong negative correlation with medv (-0.74).

* 'RM' is positively correlated: A higher 'RM' usually indicates more space which could be more expensive.
* 'LSTAT' is negatively correlated: People from lower class usually can't afford high housing price so when 'LSTAT' is high, the housing price is more likely to be cheap.
* 'PTRATIO' is negatively correlated: A lower 'PTRATIO' means that teachers can pay more attention to each student, which indicates better education, so housing in such areas tends to be more expensive.

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df_train.head(n=5)

In [None]:
# Separate the target column ('medv') from the remaining feature columns.
features = df_train.drop(columns=['medv'])
prices = df_train['medv']

In [None]:
# Summary statistics of the target column (prices), printed one per line.
print("BASIC STATS FOR OUR THE BOSTON HOUSING DATASET  \n")

MAX_PRICE = np.max(prices)
MIN_PRICE = np.min(prices)
MEAN_PRICE = np.mean(prices)
MEDIAN_PRICE = np.median(prices)
STD_PRICE = np.std(prices)

# Each entry pairs an output template with the statistic it reports.
price_report = (
    ("Max Price in USD 1000’s = ${:,.2f}", MAX_PRICE),
    ("Min Price in USD 1000’s = ${:,.2f}", MIN_PRICE),
    ("Mean Price in USD 1000’s = ${:,.2f}", MEAN_PRICE),
    ("Median Price in USD 1000’s = ${:,.2f}", MEDIAN_PRICE),
    ("Standard Dev Price in USD 1000’s = ${:,.2f}", STD_PRICE),
)
for template, value in price_report:
    print(template.format(value))

**Performance Metric**

R² (the coefficient of determination) typically ranges from 0 to 1 and measures the proportion of the variance in the target variable that the model explains (for a simple linear regression this equals the squared correlation between the predicted and actual values). A model with an R² of 0 is no better than a model that always predicts the mean of the target variable, whereas a model with an R² of 1 perfectly predicts the target variable. Any value between 0 and 1 indicates what fraction of the variance in the target variable can be explained by the features using this model. R² can also be negative, which means the model performs worse than always predicting the mean.

In [None]:
from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
    """Score predictions against the true target values.

    The score is whatever `r2_score(y_true, y_predict)` returns
    (the coefficient of determination, R^2).

    Args:
        y_true: Observed target values.
        y_predict: Predicted target values, aligned with `y_true`.

    Returns:
        The R^2 score for the predictions.
    """
    return r2_score(y_true, y_predict)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle
# (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    prices,
    test_size=0.2,
    random_state=0,
)

# Success
print ("Training and testing split was successful.")

**Training and Testing**

In [None]:
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import ShuffleSplit

def fit_model(X, y):
    """Grid-search 'max_depth' for a decision tree regressor on [X, y].

    Candidate depths 1 through 10 are compared with shuffle-split
    cross-validation (10 random splits, 20% of the rows held out in each),
    scored with `performance_metric`.

    Args:
        X: Feature matrix used for training.
        y: Target values aligned with the rows of `X`.

    Returns:
        The best estimator found by the grid search (a fitted
        DecisionTreeRegressor, refit on all of [X, y]).
    """
    # In sklearn.model_selection the first argument of ShuffleSplit is
    # `n_splits`. Passing X.shape[0] positionally (a leftover from the old
    # sklearn.cross_validation signature, where it was the sample count)
    # would create one split per training row and make the search
    # needlessly slow. Use the intended 10 splits explicitly.
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

    # Seed the tree so tie-breaking between equally good splits is
    # reproducible from run to run.
    regressor = DecisionTreeRegressor(random_state=0)

    # Candidate values for 'max_depth': 1 through 10 inclusive.
    params = {'max_depth': list(range(1, 11))}

    # Wrap 'performance_metric' so GridSearchCV can use it as a scorer
    # (higher is better, which is make_scorer's default).
    scoring_fnc = make_scorer(performance_metric)

    grid = GridSearchCV(
        estimator=regressor,
        param_grid=params,
        scoring=scoring_fnc,
        cv=cv_sets,
    )

    # Fit the grid search to the data to find the best 'max_depth'.
    grid.fit(X, y)

    # Return the optimal model after fitting the data.
    return grid.best_estimator_

In [None]:
# Run the grid search on the training split and keep the best tree.
reg = fit_model(X_train, y_train)

# Print the value for 'max_depth'
best_depth = reg.get_params()['max_depth']
print ("Parameter 'max_depth' is {} for the optimal model.".format(best_depth))


In [None]:
# Evaluate the model fitted in the previous cell on the held-out test set.
# `reg` is reused rather than calling fit_model again: refitting would
# repeat the entire grid search, and the score should describe the same
# model whose 'max_depth' was just reported.
pred = reg.predict(X_test)
score = performance_metric(y_test, pred)
print("R Squared Value: " + str(score))