# Interactive Visualization Assignment (Plotly + IPyWidgets)

In [20]:
import pandas as pd
import plotly.express as px
from ipywidgets import interact
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MaxAbsScaler

The following helper makes a Colab session behave more like a Jupyter notebook so that Plotly and ipywidgets objects display correctly. If you are using Jupyter, comment it out. (It needs to be called at the top of every cell that outputs a plot that uses widgets.)

```
configure_plotly_browser_state()
```

From [this StackOverflow answer](https://stackoverflow.com/a/47230966).

In [21]:
def configure_plotly_browser_state():
  """Emit the front-end setup needed to render Plotly output in the current cell.

  Displays an HTML snippet that loads require.js and registers the `base`
  and `plotly` module paths, then switches Plotly's offline notebook mode on.
  Produces display output only; returns None.
  """
  # Import from the public IPython.display module instead of relying on the
  # `display` name that an IPython kernel injects into builtins.
  from IPython.display import HTML, display
  from plotly.offline import init_notebook_mode

  # NOTE(review): the `plotly-latest` CDN alias may not track the installed
  # plotly version -- confirm before relying on newer trace types.
  requirejs_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''
  display(HTML(requirejs_setup))
  init_notebook_mode(connected=False)

### Import the housing.csv data set.

In [60]:
# Location of the housing CSV; it is downloaded each time this cell runs.
housing_csv_url = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/housing.csv'
df = pd.read_csv(housing_csv_url)

### Separate out all the numeric fields into their own data set.

Remove the Id field, as it should be categorical, and the GarageYrBlt and LotFrontage fields, which contain many nulls. Drop any remaining nulls, but only for fields that are numeric. Hint: use subset. Then create a new dataframe with only the numeric fields in it.

In [61]:
# Keep only genuinely numeric columns. Selecting with include='number' (rather
# than excluding 'object') also leaves out bool/datetime/category columns and
# any text column stored with a dedicated string dtype.
dfnum = (
    df.select_dtypes(include='number')
    # Id is an identifier; GarageYrBlt and LotFrontage are dropped up front so
    # their missing values do not discard rows in the dropna() below.
    .drop(columns=['Id', 'GarageYrBlt', 'LotFrontage'])
    # Remove every row that still has a missing value in a numeric column.
    .dropna()
)

# Feature matrix (everything except the target) and the target itself.
X = dfnum.drop(columns='SalePrice')
y = dfnum['SalePrice']

### Normalize the numeric data using Scikit-Learn's MaxAbsScaler.

In [62]:
from sklearn.preprocessing import MaxAbsScaler
# Fit the scaler on the feature matrix, then produce the scaled copy used for clustering.
mascaler = MaxAbsScaler()
X_std = mascaler.fit(X).transform(X)


### Iteratively K-Means cluster the normalized data and generate an interactive line chart showing the average silhouette score for each number of clusters (2 through 20).

In [63]:
from sklearn import metrics
from sklearn.cluster import KMeans
from ipywidgets import interact

# Candidate cluster counts: 2 through 20 inclusive (range's upper bound is exclusive).
cluster_counts = list(range(2, 21))

sscore = []
for k in cluster_counts:
    # A fixed seed keeps the silhouette curve reproducible across kernel restarts.
    km = KMeans(n_clusters=k, random_state=42)
    # fit_predict fits the model and returns the labels in a single pass,
    # so there is no need for a separate fit beforehand.
    labels = km.fit_predict(X_std)
    sscore.append(silhouette_score(X_std, labels))

# One row per candidate k with its average silhouette score.
dfclus = pd.DataFrame({'Clusters': cluster_counts, 'Score': sscore})


In [64]:
# Line chart of silhouette score against cluster count; the title and axis
# labels let the figure stand on its own when the notebook is skimmed.
fig = px.line(
    dfclus,
    x='Clusters',
    y='Score',
    title='Average silhouette score by number of clusters',
    labels={'Clusters': 'Number of clusters (k)', 'Score': 'Average silhouette score'},
)
fig.show()

### Choose a number of clusters, run KMeans with that value for k on the scaled data, and add a column to the original housing data set containing the cluster that each record is assigned to.

In [131]:
# Final model with the chosen k = 5. The fixed seed keeps cluster labels
# stable between runs, so the charts further down are reproducible.
km5 = KMeans(n_clusters=5, random_state=42)
# fit_predict fits once and returns one label per row of X_std; X_std was
# built from dfnum's rows, so the labels line up with dfnum positionally.
dfnum['Cluster'] = km5.fit_predict(X_std)
dfnum

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,Cluster
0,60,8450,7,5,2003,2003,196.0,706,0,150,...,61,0,0,0,0,0,2,2008,208500,1
1,20,9600,6,8,1976,1976,0.0,978,0,284,...,0,0,0,0,0,0,5,2007,181500,2
2,60,11250,7,5,2001,2002,162.0,486,0,434,...,42,0,0,0,0,0,9,2008,223500,1
3,70,9550,7,5,1915,1970,0.0,216,0,540,...,35,272,0,0,0,0,2,2006,140000,2
4,60,14260,8,5,2000,2000,350.0,655,0,490,...,84,0,0,0,0,0,12,2008,250000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,7917,6,5,1999,2000,0.0,0,0,953,...,40,0,0,0,0,0,8,2007,175000,1
1456,20,13175,6,6,1978,1988,119.0,790,163,589,...,0,0,0,0,0,0,2,2010,210000,2
1457,70,9042,7,9,1941,2006,0.0,275,0,877,...,60,0,0,0,0,2500,5,2010,266500,1
1458,20,9717,5,6,1950,1996,0.0,49,1029,0,...,0,112,0,0,0,0,4,2010,142125,0


In [132]:
import numpy as np
# Sanity check: list the distinct cluster labels that were assigned.
np.unique(dfnum['Cluster'].to_numpy())

array([0, 1, 2, 3, 4], dtype=int32)

### Create an interactive bar chart that shows the average SalePrice of a property by cluster.

You will need to aggregate the data by cluster and average the sale prices before generating your visualization.

In [133]:
# Average SalePrice per cluster. Grouping on the Cluster column handles
# however many clusters exist, instead of looping over a hard-coded range(5).
dfprice2cluster = (
    dfnum.groupby('Cluster', as_index=False)['SalePrice']
    .mean()
    .rename(columns={'SalePrice': 'Average Price'})
)

fig = px.bar(
    dfprice2cluster,
    x='Cluster',
    y='Average Price',
    title='Average sale price by cluster',
)

fig.show()

### Create another bar chart where the bars are broken down and color-coded by the year the property was sold.

You will need to convert the YrSold field to be categorical in order to separate the bars based on that field.

In [134]:
# Mean of every column per (year sold, cluster) pair, flattened back to
# ordinary columns. YrSold is turned into strings so it acts as a category.
agg = (
    dfnum
    .groupby(['YrSold', 'Cluster'])
    .mean()
    .reset_index()
    .assign(YrSold=lambda frame: frame['YrSold'].apply(str))
)


In [135]:
# One bar per (cluster, year sold); the bars for each cluster sit side by side.
fig = px.bar(agg, x='Cluster', y='SalePrice', color='YrSold', barmode='group')
fig.show()

### Add a drop-down widget to the multi-bar chart you created above that lets you choose between 4 numeric fields to represent on the Y axis. 

In [136]:
# Show the available column names; the drop-down widget further down offers four of them.
dfnum.columns

Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice',
       'Cluster'],
      dtype='object')

In [137]:
# Trial run of the widget logic for a single build year (1990): keep only
# houses built that year, then average every column per year sold.
# Note that this rebinds `agg`, which an earlier cell also assigns.
filtered = dfnum.query('YearBuilt == 1990')
agg = filtered.groupby(['YrSold']).mean()
agg

Unnamed: 0_level_0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,SalePrice,Cluster
YrSold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006,40.0,9609.5,7.0,5.0,1990.0,1990.5,27.0,1134.5,0.0,128.0,...,0.0,29.0,0.0,0.0,45.0,0.0,0.0,5.5,215000.0,1.5
2007,56.0,11800.4,7.0,5.0,1990.0,1990.6,160.6,727.6,0.0,279.4,...,159.0,71.2,44.8,0.0,0.0,0.0,0.0,5.2,202580.0,1.0
2008,60.0,11839.0,7.0,5.0,1990.0,1990.0,99.0,1085.0,0.0,390.0,...,192.0,121.0,0.0,0.0,0.0,0.0,0.0,5.0,262280.0,1.0
2009,40.0,12160.666667,7.0,5.666667,1990.0,1990.666667,78.0,365.333333,59.0,1165.666667,...,84.0,43.666667,0.0,169.333333,0.0,0.0,0.0,5.666667,208466.666667,2.666667
2010,60.0,12376.0,7.0,5.0,1990.0,1990.0,0.0,1470.0,0.0,203.0,...,367.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,320000.0,1.0


In [138]:
@interact(
    Metric=['TotalBsmtSF', 'LotArea', 'Fireplaces', 'SalePrice'],
    YearBuilt=(dfnum['YearBuilt'].min(), dfnum['YearBuilt'].max()),
)
def barchart(Metric, YearBuilt):
    """Draw a grouped bar chart of the mean of `Metric` per cluster and year sold.

    Metric    -- name of the dfnum column plotted on the y axis.
    YearBuilt -- only houses whose YearBuilt equals this value are included.
    """
    built_that_year = dfnum.loc[dfnum['YearBuilt'].eq(YearBuilt)]
    means = (
        built_that_year
        .groupby(['Cluster', 'YrSold'])
        .mean()
        .reset_index()
        # String years make YrSold a categorical colour rather than a numeric one.
        .assign(YrSold=lambda frame: frame['YrSold'].apply(str))
    )
    fig = px.bar(means, x='Cluster', y=Metric, color='YrSold', barmode='group')
    fig.show()


interactive(children=(Dropdown(description='Metric', options=('TotalBsmtSF', 'LotArea', 'Fireplaces', 'SalePri…

### Create a scatter plot that shows the relationship between SalePrice and LotArea, color-coded by cluster. Add a slider that filters the data by the year the property was sold.

In [146]:
@interact(YearSold=(dfnum['YrSold'].min(), dfnum['YrSold'].max()))
def scatter(YearSold):
    """Scatter SalePrice against LotArea for houses sold in `YearSold`, coloured by cluster.

    YearSold -- only rows whose YrSold equals this value are plotted.
    """
    sold_that_year = (
        dfnum.loc[dfnum['YrSold'] == YearSold]
        # assign() works on a copy, so dfnum keeps its numeric Cluster column.
        # String labels make Plotly treat clusters as discrete categories.
        .assign(Cluster=lambda frame: frame['Cluster'].astype(str))
    )
    # Referring to the column by name (rather than passing a detached Series)
    # lets the legend be titled after the column.
    fig = px.scatter(sold_that_year, x='SalePrice', y='LotArea', color='Cluster')
    # Fixed y-axis window of 0-30000; lots larger than that fall outside the view.
    fig.update_layout(yaxis=dict(range=[0, 30000]))
    fig.show()

interactive(children=(IntSlider(value=2008, description='YearSold', max=2010, min=2006), Output()), _dom_class…