In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import seaborn as sns
from scipy import stats
from pandas.plotting import scatter_matrix
from sklearn.model_selection import StratifiedKFold, KFold
# from pandas.tools.plotting import scatter_matrix 

# Import supplementary visualizations code visuals.py
from shutil import copyfile

# The helper module lives in the read-only input area; place a copy (with its
# .py suffix) under ../working so that the import below can pick it up.
copyfile("../input/visualspy/visuals.py", "../working/visuals.py")
import visuals as vs

# import visuals as vs
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the California housing table from the Kaggle input dataset.
housing_csv_path = '../input/california-housing-prices/housing.csv'
df = pd.read_csv(housing_csv_path, sep=',', encoding='utf8')

In [None]:
# Display summary statistics for the numeric columns of the raw table.
df.describe()

## Classes to Regression 

In [None]:
# Count how many rows fall into each ocean_proximity class.
df['ocean_proximity'].value_counts()

In [None]:
# Bar chart of the class frequencies. Uses the Series method because the
# top-level pd.value_counts function is deprecated in recent pandas releases.
df['ocean_proximity'].value_counts().plot.bar()

In [None]:
# Classes to regression: swap each ocean_proximity label for an integer code.
proximity_codes = {
    '<1H OCEAN': 1,
    'INLAND': 2,
    'NEAR OCEAN': 3,
    'NEAR BAY': 4,
    'ISLAND': 5,
}
df['ocean_proximity'] = df['ocean_proximity'].replace(proximity_codes)

In [None]:
# Re-plot the class frequencies after encoding, then show column dtypes and
# non-null counts. The Series method replaces the deprecated pd.value_counts.
df['ocean_proximity'].value_counts().plot.bar()
df.info()

## Filling missing (null) values

In [None]:
# Replace missing entries in each numeric column with that column's mean.
mean_filled_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]
for column in mean_filled_columns:
    df[column] = df[column].fillna(df[column].mean())

# The encoded proximity codes are cast to float first, then mean-filled as well.
df['ocean_proximity'] = df['ocean_proximity'].astype("float64")
df['ocean_proximity'] = df['ocean_proximity'].fillna(df['ocean_proximity'].mean())

# Show dtypes and non-null counts after the fill.
df.info()

In [None]:
# Separate the regression target from the predictor columns.
target_column = 'median_house_value'
prices = df[target_column]
features = df.drop(columns=target_column)

# Success
print("California housing dataset has {} data points with {} variables each.".format(*df.shape))

# Developing a Model

## Implementation: Define a Performance Metric

It is difficult to measure the quality of a given model without quantifying its performance over training and testing. This is typically done using some type of performance metric, whether it is through calculating some type of error, the goodness of fit, or some other useful measurement. For this project, you will be calculating the coefficient of determination, R2, to quantify your model's performance. The coefficient of determination for a model is a useful statistic in regression analysis, as it often describes how "good" that model is at making predictions.

The values for R2 typically range from 0 to 1, which captures the percentage of squared correlation between the predicted and actual values of the target variable. A model with an R2 of 0 is no better than a model that always predicts the mean of the target variable, whereas a model with an R2 of 1 perfectly predicts the target variable. Any value between 0 and 1 indicates what percentage of the target variable, using this model, can be explained by the features. A model can be given a negative R2 as well, which indicates that the model is arbitrarily worse than one that always predicts the mean of the target variable.

In [None]:
# TODO: Import 'r2_score'
from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
    """Score predictions against the true values with R^2.

    Returns the coefficient of determination that `r2_score` computes for
    the true values `y_true` and the predicted values `y_predict`.
    """
    return r2_score(y_true, y_predict)

In [None]:
# Try the metric on a small hand-written set of true and predicted values.
demo_true = [3, -0.5, 2, 7, 4.2]
demo_predicted = [2.5, 0.0, 2.1, 7.8, 5.3]
score = performance_metric(demo_true, demo_predicted)
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

## Implementation: Shuffle and Split Data

In [None]:
# TODO: Import 'train_test_split'
from sklearn.model_selection import train_test_split
# TODO: Shuffle and split the data into training and testing subsets
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X, y = features, prices
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)
# Row counts of the training and testing subsets.
print(len(X_train), "\n", len(X_test))
# Success
print("Training and testing split was successful.")

# Analyzing Model Performance

In this third section of the project, you'll take a look at several models' learning and testing performances on various subsets of training data. Additionally, you'll investigate one particular algorithm with an increasing 'max_depth' parameter on the full training set to observe how model complexity affects performance. Graphing your model's performance based on varying criteria can be beneficial in the analysis process, such as visualizing behavior that may not have been apparent from the results alone.

## Learning Curves

The following code cell produces four graphs for a decision tree model with different maximum depths. Each graph visualizes the learning curves of the model for both training and testing as the size of the training set is increased. Note that the shaded region of a learning curve denotes the uncertainty of that curve (measured as the standard deviation). The model is scored on both the training and testing sets using R2, the coefficient of determination.

In [None]:
# Draw the learning curves with the helper from visuals.py, using the full
# feature table and target (the helper's internals are not visible here).
vs.ModelLearning(features, prices)

## Complexity Curves

The following code cell produces a graph for a decision tree model that has been trained and validated on the training data using different maximum depths. The graph produces two complexity curves — one for training and one for validation. Similar to the learning curves, the shaded regions of both the complexity curves denote the uncertainty in those curves, and the model is scored on both the training and validation sets using the performance_metric function.

In [None]:
# Draw the complexity curves with the helper from visuals.py, using only the
# training split (the helper's internals are not visible here).
vs.ModelComplexity(X_train, y_train)

# Evaluating Model Performance

## Implementation: Fitting a Model (Grid Search and Cross-Validation)

In [None]:

# TODO: Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
def fit_model(X, y):
    """Grid-search the 'max_depth' of a decision tree regressor on [X, y].

    Parameters
    ----------
    X : feature data handed to `GridSearchCV.fit`.
    y : target values handed to `GridSearchCV.fit`.

    Returns
    -------
    The `best_estimator_` of the fitted grid search: the decision tree
    regressor whose 'max_depth' (searched over 1 to 10) scored best under
    `performance_metric` across the cross-validation splits.
    """

    # Create cross-validation sets from the training data.
    # With sklearn.model_selection (sklearn >= 0.18) the first positional
    # parameter of ShuffleSplit is n_splits and the sample count is no longer
    # passed; supplying X.shape[0] positionally alongside n_splits=10 raises
    # "TypeError: got multiple values for argument 'n_splits'".
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

    # Decision tree regressor with a fixed seed so repeated runs agree.
    regressor = DecisionTreeRegressor(random_state=75)

    # Candidate values for 'max_depth': 1 through 10 inclusive.
    params = {'max_depth': list(range(1, 11))}

    # Turn 'performance_metric' into a scorer object usable by GridSearchCV.
    scoring_fnc = make_scorer(performance_metric)

    # Grid search over 'params', scored with 'scoring_fnc' on the 'cv_sets' splits.
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model.
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data.
    return grid.best_estimator_

## Optimal Model

In [None]:
# Run the grid search on the training split to obtain the tuned regressor.
reg = fit_model(X_train, y_train)

# Report the 'max_depth' the search settled on.
best_depth = reg.get_params()['max_depth']
print("Parameter 'max_depth' is {} for the optimal model.".format(best_depth))

##  Predicting Selling Prices

In [None]:
# Produce a matrix for client data.
# Each row must follow the column order of `features` (presumably longitude,
# latitude, housing_median_age, total_rooms, total_bedrooms, population,
# households, median_income, ocean_proximity -- verify against features.columns).
# NOTE(review): Client 1 lists 20 total_rooms but 5000 total_bedrooms -- confirm
# these values are intended.
client_data = [[-120, 40, 18,20,5000,1750,409,4.5,4], # Client 1
               [-100, 33, 40,2000,4000,2000,500,7,3], # Client 2
               [-114, 37, 30,3148,6000,1500,300,3,2]]  # Client 3

# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names. This makes the value-to-feature pairing explicit and avoids
# sklearn's "X does not have valid feature names" warning. `client_data` itself
# stays a plain list because later cells index into it.
client_frame = pd.DataFrame(client_data, columns=features.columns)

# Show predictions
for i, price in enumerate(reg.predict(client_frame)):
    print ("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))

In [None]:
# Summary statistics of the feature columns, for comparison with the client values.
features.describe()

In [None]:
# Summary statistics of the target (median_house_value).
prices.describe()

In [None]:
# One box plot per feature column, with every client's value overlaid as a
# labelled marker. The grid is sized from the number of feature columns: a fixed
# 1x3 layout with three hard-coded y-ranges fails as soon as a fourth column is
# reached, and a range such as [3, 9] would hide longitude values like -120.
feature_columns = list(features.columns)
panels_per_row = 3
panel_rows = max(1, -(-len(feature_columns) // panels_per_row))  # ceiling division
fig, axes = plt.subplots(
    panel_rows, panels_per_row,
    figsize=(20, 5 * panel_rows),
    squeeze=False,  # always a 2-D array, so ravel() works for a single row too
)
flat_axes = axes.ravel()

for i, col in enumerate(feature_columns):
    ax = flat_axes[i]
    ax.boxplot(df[col])
    ax.set_title(col)
    # client rows are positional, so index i picks this column's value
    for j, client in enumerate(client_data):
        ax.plot(1, client[i], marker="o")
        ax.annotate('Client '+str(j+1), xy=(1, client[i]))

# Hide any panels left over when the column count is not a multiple of the row width.
for unused_ax in flat_axes[len(feature_columns):]:
    unused_ax.set_visible(False)

# Sensitivity

An optimal model is not necessarily a robust model. Sometimes, a model is either too complex or too simple to sufficiently generalize to new data. Sometimes, a model could use a learning algorithm that is not appropriate for the structure of the data given. Other times, the data itself could be too noisy or contain too few samples to allow a model to adequately capture the target variable — i.e., the model is underfitted.

Run the code cell below to run the ``fit_model`` function ten times with different training and testing sets to see how the prediction for a specific client changes with respect to the data it's trained on.

In [None]:
# Hand the full data, the fit_model function and the client rows to the
# visuals.py helper; per the notes above it refits on different train/test
# sets to show how the prediction varies (helper internals not visible here).
vs.PredictTrials(features, prices, fit_model, client_data)