In [2]:
pip install dash

Collecting dashNote: you may need to restart the kernel to use updated packages.

  Downloading dash-2.9.2-py3-none-any.whl (10.2 MB)
     --------------------------------------- 10.2/10.2 MB 94.6 kB/s eta 0:00:00
Collecting dash-table==5.0.0
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting dash-html-components==2.0.0
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Installing collected packages: dash-table, dash-html-components, dash-core-components, dash
Successfully installed dash-2.9.2 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0


In [4]:
pip install jupyter_dash

Collecting jupyter_dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting ansi2html
  Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying, ansi2html, jupyter_dash
Successfully installed ansi2html-1.8.0 jupyter_dash-0.4.2 retrying-1.3.4
Note: you may need to restart the kernel to use updated packages.


In [55]:
import pandas as pd
import plotly.express as px
from dash import Input, Output, dcc, html
from jupyter_dash import JupyterDash
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

JupyterDash.infer_jupyter_proxy_config()

n the last lesson, we built a model based on the highest-variance features in our dataset and created several visualizations to communicate our results. In this lesson, we're going to combine all of these elements into a dynamic web application that will allow users to choose their own features, build a model, and evaluate its performance through a graphic user interface. In other words, you'll create a tool that will allow anyone to build a model without code.

# Prepare Data

In [56]:
def wrangle(filepath):

    """Read SCF data file into ``DataFrame``.

    Returns only credit fearful households whose net worth is less than $2 million.

    Parameters
    ----------
    filepath : str
        Location of CSV file.
    """
    # Read CSV file
    df = pd.read_csv(filepath)
    
     # Remove NAN Value
    df.dropna(inplace=True)
    
     # Drop features with high null counts
    df.drop(columns=["ocean_proximity"], inplace=True)
    
    return df

In [57]:
wrangle?

In [58]:
df = wrangle(r"C:\Users\sanus\Desktop\DS\web\housing.csv")
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
dtypes: float64(9)
memory usage: 1.6 MB
None


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


# Application Layout

In [74]:
# Instantiate a JupyterDash application and assign it to the variable name app
app = JupyterDash(__name__)

In [89]:
app.layout = html.Div(
    [
        # Application title
        html.H1("Survey of House price"),
        # Bar chart element
        html.H2("High Variance Features"),
        # Bar chart graph
        dcc.Graph(id="bar-chart"),
        dcc.RadioItems(
            options=[
                {"label": "trimmed", "value": True},
                {"label": "not trimmed", "value": False}
            ],
            value=True,
            id="trim-button"
        ),
        html.H2("K-means Clustering"),
        html.H3("Number of Clusters (k)"),
        dcc.Slider(min=2, max=12, step=1, value=2, id="k-slider"),
        html.Div(id="metrics"),
        # PCA scatter plot
        dcc.Graph(id="pca-scatter")
    ]
)

Create a get_high_var_features function that returns the five highest-variance features in a DataFrame. Use the docstring for guidance

In [75]:
def get_high_var_features(trimmed=True, return_feat_names=True):

    """Returns the five highest-variance features of ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    return_feat_names : bool, default=False
        If ``True``, returns feature names as a ``list``. If ``False``
        returns ``Series``, where index is feature names and values are
        variances.
    """
    # calculate variance
    if trimmed:
        top_ten_var_features= (
            df.apply(trimmed_var).sort_values().tail(10)
        )
    else:
        top_ten_var_features = df.var().sort_values().tail(10)
    # Extract names
    if return_feat_names:
        top_ten_var_features = top_ten_var_features.index.tolist()
    
    return  top_ten_var_features

In [76]:
get_high_var_features(trimmed=True, return_feat_names=False)

median_income         1.191597e+00
longitude             2.887484e+00
latitude              3.041485e+00
housing_median_age    8.272390e+01
households            2.873237e+04
total_bedrooms        3.410124e+04
population            2.422167e+05
total_rooms           7.997655e+05
median_house_value    5.550559e+09
dtype: float64

Create a serve_bar_chart function that returns a plotly express bar chart of the five highest-variance features. You should use get_high_var_features as a helper function. Follow the docstring for guidance.

In [77]:
@app.callback(
    Output("bar-chart", "figure"), Input("trim-button", "value")
)
def serve_bar_chart(trimmed=True):

    """Returns a horizontal bar chart of five highest-variance features.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.
    """
    # Get features
    top_ten_features = get_high_var_features(trimmed=trimmed, return_feat_names=False)
    
    # Build bar chart
    fig = px.bar(x=top_ten_features, y=top_ten_features.index, orientation="h")
    fig.update_layout(xaxis_title="Variance", yaxis_title="Features")
    
    return fig

Create a get_model_metrics function that builds, trains, and evaluates KMeans model. Use the docstring for guidance. Note that, like the model you made in the last lesson, your model here should be a pipeline that includes a StandardScaler. Once you're done, submit your function to the grader.

In [80]:
def get_model_metrics(trimmed=True, k=2, return_metrics=False):

    """Build ``KMeans`` model based on five highest-variance features in ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False`` returns ``KMeans`` model. If ``True`` returns ``dict``
        with inertia and silhouette score.

    """
    # Get high var features
    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    # Create feature matrix
    X = df[features]
    # Build model
    model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42))
    model.fit(X)
    if return_metrics:
        # calculate inertia
        i = model.named_steps["kmeans"].inertia_
        # calculate silhouette score
        ss = silhouette_score(X, model.named_steps["kmeans"].labels_)
        # put results into dictionary
        metrics = {
            "inertia": round(i),
            "silhouette": round(ss, 3)
        }
        # return dictionary to user
        return metrics
    
    return model

In [81]:
get_model_metrics(trimmed=True, k=20, return_metrics=False)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=20, random_state=42))])

Part of what we want people to be able to do with the dashboard is see how the model's inertia and silhouette score when they move the slider around, so let's calculate those numbers...

In [82]:
@app.callback(
    Output("metrics", "children"),
    Input("trim-button", "value"),
    Input("k-slider", "value")
    
)
def serve_metrics(trimmed=True, k=2):

    """Returns list of ``H3`` elements containing inertia and silhouette score
    for ``KMeans`` model.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    # Get metrics
    metrics = get_model_metrics(trimmed=trimmed, k=k, return_metrics=True)
    
    # Add metrics to HTML elements
    text = [
        html.H3(f"Inertia: {metrics['inertia']}"),
        html.H3(f"Silhouette Score: {metrics['silhouette']}")
    ]
    
    return text

In [65]:
 serve_metrics(k=20)

[H3('Inertia: 40669'), H3('Silhouette Score: -0.249')]

# PCA Scatter Plot

We just made a slider that can change the inertia and silhouette scores, but not everyone will be able to understand what those changing numbers mean. Let's make a scatter plot to help them along.Add a Graph object to your application's layout. Be sure to give it the id "pca-scatter".

Just like with the bar chart, we need to get the five highest-variance features of the data, so let's start with that.

Create a function get_pca_labels that subsets a DataFrame to its five highest-variance features, reduces those features to two dimensions using PCA, and returns a new DataFrame with three columns: "PC1", "PC2", and "labels". This last column should be the labels determined by a KMeans model. Your function should you get_high_var_features and get_model_metrics as helpers. Refer to the docstring for guidance.

In [85]:
def get_pca_labels(trimmed=True, k=2):

    """
    ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    # Create feature matrix
    features = get_high_var_features(trimmed=trimmed, return_feat_names=True)
    X = df[features]

    # Build transformer
    transformer = PCA(n_components=2, random_state=42)

    # Transform data
    X_t = transformer.fit_transform(X)
    X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"])
    
    # Add labels
    model = get_model_metrics(trimmed=trimmed, k=k, return_metrics=False)
    X_pca["labels"] = model.named_steps["kmeans"].labels_.astype(str)
    X_pca.sort_values("labels", inplace=True)
    
    return X_pca

In [86]:
get_pca_labels(trimmed=True, k=2)

Unnamed: 0,PC1,PC2,labels
0,245730.472035,-2666.830649,0
13266,-88065.269798,-380.861344,0
13265,-86464.821651,109.959228,0
13264,-106068.780241,-1812.682069,0
13248,108130.539051,-2612.134732,0
...,...,...,...
15055,101465.804300,13386.697300,1
15057,87243.495908,2982.457736,1
15059,74744.512626,3547.153834,1
15061,31842.987579,3193.338478,1


Create a function serve_scatter_plot that creates a 2D scatter plot of the data used to train a KMeans model, along with color-coded clusters. Use get_pca_labels as a helper. Refer to the docstring for guidance.

In [87]:
@app.callback(
    Output("pca-scatter", "figure"),
    Input("trim-button", "value"),
    Input("k-slider", "value")    
)
def serve_scatter_plot(trimmed=True, k=2):

    """Build 2D scatter plot of ``df`` with ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.
    """
    fig = px.scatter(
        data_frame=get_pca_labels(trimmed=trimmed, k=k), 
        x="PC1", 
        y="PC2", 
        color="labels", 
        title="PCA Representation of Cluster"
    )
    fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")
    
    return fig

Start building the layout of your app by creating a Div object that has two child objects: an H1 header that reads "Survey of Consumer Finances" and an H2 header that reads "High Variance Features"

In [90]:
app.run_server(mode="inline", host = "localhost")

Dash is running on http://localhost:8050/

