In [1]:
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.utils import check_array
import numpy as np
from numba import jit, vectorize, float64, int64
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from sklearn.externals import joblib
from math import sqrt
from IPython import display
import dash
import dash_core_components as dcc
import dash_html_components as html
import warnings
import plotly.graph_objs as go
from plotly import tools
from plotly.figure_factory import create_2d_density
from plotly.graph_objs import graph_objs

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Restore the model object from disk (presumably a fitted random forest,
# judging by the file name — it is only used through .predict() below).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artefacts that come from a trusted source.
rf = joblib.load(filename='./data/rf_trained.pkl')

In [4]:
# Data Loading
data = pd.read_csv('./data/TrainAndValid.csv')
# Loaded but not referenced again in the visible part of the notebook.
test = pd.read_csv('./data/Test.csv')

# Date Features
# The last five characters of saledate are cut off before parsing
# (presumably a trailing " 0:00" time component — TODO confirm against the CSV).
# Parsing the whole column at once replaces a row-by-row strptime call.
tmp_date = pd.to_datetime(data.saledate.str[:-5], format='%m/%d/%Y')
data['sale_mon'] = tmp_date.dt.month
data['sale_dayofweek'] = tmp_date.dt.dayofweek
data['sale_dayofyear'] = tmp_date.dt.dayofyear
data['sale_year'] = tmp_date.dt.year
data = data.drop(['saledate'], axis=1)

# Taking Subset of Columns
kept_columns = [
                'YearMade',
                'sale_mon',
                'sale_dayofweek',
                'sale_dayofyear',
                'sale_year',
                'fiModelDesc',
                'fiBaseModel',
                'fiProductClassDesc',
                'state',
                'SalePrice'
               ]
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the frame that was read from disk.
data = data[kept_columns].copy()
# NOTE(review): age is derived before the YearMade clean-up further down, so
# implausible YearMade values still feed into it. Left as is because the
# pre-trained model presumably expects exactly these features — confirm.
data['age'] = data.sale_year - data.YearMade

# Encoding Class Description
# The description is split on '-' after removing commas; the first part and
# the second part become separate columns.
data['classDesc_1'] = data.fiProductClassDesc.apply(lambda x: x.replace(',', '').strip().split('-')[0])
data['classDesc_2'] = data.fiProductClassDesc.apply(lambda x: x.replace(',', '').strip().split('-')[1])
data = data.drop('fiProductClassDesc', axis=1)

for col in ['fiModelDesc', 'fiBaseModel', 'state', 'classDesc_1', 'classDesc_2']:
    lb = LabelEncoder()
    # Plain column assignment replaces the column outright, so the encoded
    # integers keep their integer dtype instead of being written into the
    # existing object-dtype column.
    data[col] = lb.fit_transform(data[col])

# Building a Random Forest Regressor
# Years before 1920 are replaced by the column median; years after 2012 are capped.
data.loc[data.YearMade < 1920, 'YearMade'] = np.median(data.YearMade)
data.loc[data.YearMade > 2012, 'YearMade'] = 2012

# A fixed seed makes the split (and every metric computed from it) repeatable
# across kernel restarts.
# NOTE(review): the model is loaded from disk rather than fitted on `train`,
# so rows in `valid` may have been seen during its training — confirm how the
# pickled model was produced before reading the validation metrics as held-out.
train, valid = train_test_split(data, test_size=.2, random_state=42)
y_train = train.SalePrice
y_valid = valid.SalePrice
# Non-inplace drops: the split results are rebound to new frames instead of
# being mutated in place.
train = train.drop('SalePrice', axis=1)
valid = valid.drop('SalePrice', axis=1)


# Regression metrics

## MAE, MSE, MSPE, MAPE, MSLE, R2 

In [5]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and two pandas Series are compared position by position rather
    than being aligned on their index labels.

    :param y_true: observed values (array-like).
    :param y_pred: predicted values (array-like), same length as ``y_true``.
    :return: ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float.
             Entries where ``y_true`` is 0 divide by zero and make the
             result ``inf`` or ``nan``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [6]:
def mean_squared_percentage_error(y_true, y_pred):
    """Return the mean squared percentage error (MSPE), scaled by 100.

    The relative error ``(y_true - y_pred) / y_true`` is squared as a whole;
    dividing the squared absolute error by ``y_true`` only once would leave
    the result in the units of the target instead of being scale-free.

    Inputs are converted to float arrays, so lists are accepted and pandas
    Series are compared position by position.

    :param y_true: observed values (array-like).
    :param y_pred: predicted values (array-like), same length as ``y_true``.
    :return: ``mean(((y_true - y_pred) / y_true) ** 2) * 100`` as a float.
             Entries where ``y_true`` is 0 divide by zero and make the
             result ``inf`` or ``nan``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    relative_error = (y_true - y_pred) / y_true
    return np.mean(relative_error ** 2) * 100

In [7]:
def _summarize_predictions(y_true, y_pred):
    """Compute the regression metrics reported for one data split.

    :param y_true: observed target values.
    :param y_pred: model predictions for the same rows.
    :return: dict with keys ``msle``, ``mae``, ``mse``, ``mape``, ``r2``,
             ``rmsle``, ``rmse`` and ``preds`` (the predictions themselves).
    """
    msle = mean_squared_log_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return {
        "msle": msle,
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": mse,
        "mape": mean_absolute_percentage_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
        # The root metrics reuse the values computed above instead of
        # evaluating the underlying metric a second time.
        "rmsle": np.sqrt(msle),
        "rmse": np.sqrt(mse),
        "preds": y_pred,
    }


def get_regression_metrics(rf, train, y_train, valid, y_valid):
    """Score an already fitted model on the validation and training splits.

    The model is not refitted here; only ``rf.predict`` is called.

    :param rf: fitted estimator exposing ``predict``.
    :param train: training features.
    :param y_train: training targets.
    :param valid: validation features.
    :param y_valid: validation targets.
    :return: ``(validation, training)`` — two dicts with the keys ``msle``,
             ``mae``, ``mse``, ``mape``, ``r2``, ``rmsle``, ``rmse`` and
             ``preds``.
    """
    validation = _summarize_predictions(y_valid, rf.predict(valid))
    training = _summarize_predictions(y_train, rf.predict(train))
    return validation, training

In [8]:
# Score the loaded model on both splits; the validation dict is the first
# element of the returned pair, the training dict the second.
valid_metrics, train_metrics = get_regression_metrics(
    rf=rf,
    train=train,
    y_train=y_train,
    valid=valid,
    y_valid=y_valid,
)

# Show the Dash app

In [9]:
def show_app(app,  # type: dash.Dash
             port=9999,
             width=700,
             height=350,
             offline=True,
             style=True,
             **dash_flask_kwargs):
    """
    Run the application inside a Jupyter notebook and show an iframe with it.

    The iframe pointing at ``http://localhost:<port>`` is emitted before the
    server is started, because starting the server is the last thing this
    function does.

    :param app: the Dash application to serve
    :param port: port the server listens on and the iframe points to
    :param width: iframe width
    :param height: iframe height
    :param offline: when true, configure the app to serve its CSS and scripts
        locally
    :param style: when true, register the external stylesheets and scripts
        listed below on the app
    :param dash_flask_kwargs: extra keyword arguments forwarded to
        ``app.run_server``; ``debug`` and ``port`` are always overridden by
        this helper
    :return: whatever ``app.run_server`` returns
    """
    url = 'http://localhost:%d' % port
    iframe = '<iframe src="{url}" width={width} height={height}></iframe>'.format(
        url=url, width=width, height=height)
    display.display_html(iframe, raw=True)

    if offline:
        app.css.config.serve_locally = True
        app.scripts.config.serve_locally = True

    if style:
        external_css = ["https://fonts.googleapis.com/css?family=Raleway:400,300,600",
                        "https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css",
                        "http://getbootstrap.com/dist/css/bootstrap.min.css", ]
        for css in external_css:
            app.css.append_css({"external_url": css})

        external_js = ["https://code.jquery.com/jquery-3.2.1.min.js",
                       "https://cdn.rawgit.com/plotly/dash-app-stylesheets/a3401de132a6d0b652ba11548736b1d1e80aa10d/dash-goldman-sachs-report-js.js",
                       "http://getbootstrap.com/dist/js/bootstrap.min.js"]
        for js in external_js:
            app.scripts.append_script({"external_url": js})

    # Caller-supplied options are passed through instead of being dropped,
    # but debug and port always take the values this helper relies on.
    run_options = dict(dash_flask_kwargs)
    run_options.update(debug=False,  # needs to be false in Jupyter
                       port=port)
    return app.run_server(**run_options)
                          #**dash_flask_kwargs)

In [10]:
# Column order of the metrics table: the split label first, then each metric.
columns = [
    "type",
    "mae", "mape", "mse", "msle",
    "rmse", "rmsle",
    "r2",
]

In [11]:
# One-row summary of the validation metrics, rounded to four decimals.
# Only the scalar metrics go into the frame: "preds" is left out up front so
# pandas does not broadcast every scalar across the whole prediction array
# just to keep the first row afterwards.
df_test = pd.DataFrame(
    [{name: value for name, value in valid_metrics.items() if name != "preds"}]
).round(4)
df_test["type"] = "validation"
df_test = df_test[columns]

In [12]:
# One-row summary of the training metrics, rounded to four decimals.
# Only the scalar metrics go into the frame: "preds" is left out up front so
# pandas does not broadcast every scalar across the whole prediction array
# just to keep the first row afterwards.
df_train = pd.DataFrame(
    [{name: value for name, value in train_metrics.items() if name != "preds"}]
).round(4)
df_train["type"] = "training"
df_train = df_train[columns]

In [13]:
# Create the Dash application object; its layout is assigned in a later cell.
app_simple = dash.Dash()

In [14]:
# Colour scale for the density contours; each entry is [position, colour].
# Position 0.0 uses the same grey as the plot background set in the layout.
colorscale = [
    [0.0, 'rgb(50,50,50)'],
    [0.001, 'rgb(5, 57, 94)'],
    [1.0, 'rgb(126, 247, 27)'],
]

In [15]:
# Number of leading points from each split that are handed to the plots.
subsample = 1_000

In [119]:
# Row count of the training features.
total_rows = train.shape[0]

In [120]:
# Tick positions for the colour bar and a percentage label for each of them.
# NOTE: both names are reassigned in the following cell, so these values
# never reach the figure.
color_bar_nums = [30, 25, 20, 15, 10, 5]
color_bar_list = [
    f"{round((total_rows * level) / float(subsample) * 100, 1)}%"
    for level in color_bar_nums
]

In [123]:
# Label only two ticks of the colour bar, with qualitative captions.
color_bar_nums, color_bar_list = [30, 5], ["max", "min"]

In [124]:
def _density_trace(predicted, actual, name, **axis_refs):
    """Build one 2-D density contour trace of true vs. predicted values.

    :param predicted: values for the x axis.
    :param actual: values for the y axis.
    :param name: trace name.
    :param axis_refs: ``xaxis`` / ``yaxis`` references selecting the panel.
    :return: the ``Histogram2dcontour`` trace.
    """
    return graph_objs.Histogram2dcontour(
        x=predicted,
        y=actual,
        name=name,
        ncontours=100,
        nbinsx=100,
        nbinsy=100,
        autocontour=False,
        contours={"showlines": False},
        colorscale=colorscale,
        colorbar={"yanchor": "top",
                  "len": .5,
                  'ticktext': color_bar_list,
                  'tickvals': color_bar_nums,
                  'tickmode': 'array'},
        **axis_refs
    )


# Panels are arranged to match the captions defined in `annotations`:
# validation on the left (x axis), training on the right (x2 axis),
# scatter plots on top (y axis), densities below (y2 axis).

# Bottom-left: density of the validation points.
trace_hist = _density_trace(valid_metrics["preds"][:subsample],
                            y_valid[:subsample],
                            "Validation set",
                            yaxis='y2')

# Bottom-right: density of the training points. This panel is captioned
# "Density of Training", so it is fed the training data.
trace_hist2 = _density_trace(train_metrics["preds"][:subsample],
                             y_train[:subsample],
                             "Training set",
                             xaxis='x2',
                             yaxis='y2')

# Top-left: validation scatter, on the default x / y axes.
trace = go.Scatter(x=valid_metrics["preds"][:subsample],
                   y=y_valid[:subsample],
                   mode='markers',
                   marker={"opacity": 0.7,
                           "color": 'rgb(22, 199, 229)'},
                   textfont=dict(family='helvetica',
                                 size=14,
                                 color='rgb(193, 192, 191)'),
                   showlegend=True,
                   name="Validation")

# Top-right: training scatter, on x2 / y.
trace2 = go.Scatter(x=train_metrics["preds"][:subsample],
                    y=y_train[:subsample],
                    mode='markers',
                    marker={"opacity": 0.7,
                            "color": 'rgb(2, 150, 255)'},
                    xaxis='x2',
                    showlegend=True,
                    name="Training")

In [128]:
# Font shared by the panel captions.
annot_font = dict(family='helvetica', size=20, color="white")

In [129]:
# Panel captions, positioned in paper coordinates. The four captions share
# the same settings and differ only in position and text.
annotations = [
    dict(x=x_pos,
         y=y_pos,
         xref='paper',
         yref='paper',
         text=caption,
         showarrow=False,
         font=annot_font)
    for x_pos, y_pos, caption in [
        (0.19, 1.04, "Validation"),
        (0.82, 1.04, "Training"),
        (0.14, .42, "Density of Validation"),
        (0.87, .42, "Density of Training"),
    ]
]
# The final entry sets neither showarrow nor font.
annotations.append(
    dict(x=0, y=-.1, xref='paper', yref='paper', text='Predicted')
)

In [130]:
# Figure layout: a fixed 900x900 dark canvas with two x axes (left / right
# halves) and two y axes (upper / lower bands), plus the panel captions.
layout = go.Layout(
    title='Predicted vs. True Values',
    titlefont=dict(color="white"),
    font={'family': 'helvetica',
          'size': 14,
          'color': 'rgb(193, 192, 191)'},
    # Left and right columns.
    xaxis=dict(title='Predicted', domain=[0, 0.45]),
    xaxis2=dict(title='Predicted', domain=[0.55, 1]),
    # Upper and lower rows.
    yaxis=dict(title='True', domain=[0.6, 1]),
    yaxis2=dict(title="True", domain=[0, .4]),
    paper_bgcolor="rgb(20,20,20)",
    plot_bgcolor="rgb(50,50,50)",
    autosize=False,
    width=900,
    height=900,
    annotations=annotations,
)

In [131]:
# Page layout: a heading, the metrics table (one header row followed by the
# validation and training rows) and the four-panel figure.
app_simple.layout = html.Div(children=[
    html.H2(children='Beta Release'),

    html.Table(
        # Header row, taken from the validation frame's columns.
        [html.Tr([html.Th(col) for col in df_test.columns])] +
        # Body rows: at most ten rows from each frame, validation first.
        [html.Tr([html.Td(frame.iloc[i][col]) for col in frame.columns])
         for frame in (df_test, df_train)
         for i in range(min(len(frame), 10))],
        style={'border': '4px solid',
               'text-align': 'center',
               # Explicit unit: a bare numeric string is not a valid CSS
               # length, so the width would depend on the renderer adding
               # "px" on its own.
               'width': '900px',
               "background-color": "#141414",
               "color": "#c1c0bf",
               'border-color': "#c1c0bf"
               }
    ),
    dcc.Graph(
        id='example-graph',
        figure={
            'data': [trace_hist, trace_hist2, trace, trace2],
            'layout': layout
        }
    )
])

In [None]:
# Serve the dashboard on port 9999 and embed it in a 1000x1000 iframe.
show_app(
    app_simple,
    port=9999,
    width=1000,
    height=1000,
    offline=True,
    style=True,
)

 * Running on http://127.0.0.1:9999/ (Press CTRL+C to quit)
127.0.0.1 - - [25/Apr/2018 19:51:34] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Apr/2018 19:51:34] "[37mGET /_dash-component-suites/dash_core_components/rc-slider@6.1.2.css?v=0.22.1 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Apr/2018 19:51:34] "[37mGET /_dash-component-suites/dash_core_components/react-select@1.0.0-rc.3.min.css?v=0.22.1 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Apr/2018 19:51:34] "[37mGET /_dash-component-suites/dash_core_components/react-virtualized@9.9.0.css?v=0.22.1 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Apr/2018 19:51:34] "[37mGET /_dash-component-suites/dash_core_components/react-virtualized-select@3.1.0.css?v=0.22.1 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Apr/2018 19:51:34] "[37mGET /_dash-component-suites/dash_core_components/react-dates@12.3.0.css?v=0.22.1 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Apr/2018 19:51:34] "[37mGET /_dash-component-suites/dash_renderer/react@15.4.2.min.js?v=0.12.1 HTTP/1.1[0m" 