<font size="+3"><strong>6.5. Small Business Owners in the United States🇺🇸</strong></font>

In [None]:
# Import libraries here

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
import wqet_grader
from IPython.display import VimeoVideo
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Prepare Data

## Import

In [None]:
# Read the gzip-compressed SCFP2019 CSV into the notebook's main DataFrame
data_path = "data/SCFP2019.csv.gz"
df = pd.read_csv(data_path)
print("df shape:", df.shape)
df.head()

In [None]:
# Fraction of rows in `df` whose "HBUS" flag equals 1
is_biz_owner = df["HBUS"] == 1
prop_biz_owners = int(is_biz_owner.sum()) / len(df)
print("proportion of business owners in df:", prop_biz_owners)

In [None]:
# Lookup from numeric "INCCAT" codes to the label strings shown in charts
inccat_dict = {
    1: "0-20",
    2: "21-39.9",
    3: "40-59.9",
    4: "60-79.9",
    5: "80-89.9",
    6: "90-100",
}

# Relabel the codes, then compute the normalized count of each label within
# every "HBUS" group and flatten the result into a tidy DataFrame.
inccat_labels = df["INCCAT"].replace(inccat_dict)
df_inccat = (
    inccat_labels
    .groupby(df["HBUS"])
    .value_counts(normalize=True)
    .rename("frequency")
    .to_frame()
    .reset_index()
)

df_inccat

In [None]:
# Create bar chart of `df_inccat`: bars grouped by "HBUS", with categories
# drawn in the order the labels appear in `inccat_dict`
ax = sns.barplot(
    data=df_inccat,
    x="INCCAT",
    y="frequency",
    hue="HBUS",
    order=inccat_dict.values(),
)
ax.set_xlabel("Income Category")
ax.set_ylabel("Frequency (%)")
ax.set_title("Income Distribution: Business Owners vs. Non-Business Owners")
# Don't delete the code below 👇
ax.figure.savefig("images/6-5-4.png", dpi=150)


In [None]:
# Plot "HOUSES" vs "DEBT" with hue=label
# "HOUSES" (home value) belongs on the y-axis and "DEBT" (household debt) on
# the x-axis so that the plotted data agree with the axis labels and the
# "Home Value vs. Household Debt" title below.
sns.scatterplot(
    x="DEBT",
    y="HOUSES",
    hue="HBUS",
    data=df,
)
plt.xlabel("Household Debt")
plt.ylabel("Home Value")
plt.title("Home Value vs. Household Debt");
# Don't delete the code below 👇
plt.savefig("images/6-5-5.png", dpi=150)


In [None]:
# Keep rows that are business owners (HBUS == 1) with INCOME below 500,000
is_owner = df["HBUS"] == 1
below_income_cap = df["INCOME"] < 500000.0
mask = is_owner & below_income_cap
df_small_biz = df[mask]
print("df_small_biz shape:", df_small_biz.shape)
df_small_biz.head()

In [None]:
# Plot histogram of "AGE" (10 bins) for the small-business subset
ax = df_small_biz["AGE"].hist(bins=10)
ax.set_xlabel("AGE")
ax.set_ylabel("Frequency (count)")
ax.set_title("Small Business Owners: Age Distribution")
# Don't delete the code below 👇
ax.figure.savefig("images/6-5-7.png", dpi=150)


In [None]:
# Variance of every column, sorted ascending; the last ten entries are the
# ten largest, with the largest last
col_variances = df_small_biz.var()
top_ten_var = col_variances.sort_values().iloc[-10:]
top_ten_var

In [None]:
# Apply `trimmed_var` to every column, sort ascending, keep the ten largest
col_trim_variances = df_small_biz.apply(trimmed_var)
top_ten_trim_var = col_trim_variances.sort_values().iloc[-10:]
top_ten_trim_var

In [None]:
# Create horizontal bar chart of `top_ten_trim_var`:
# trimmed variance on the x-axis, feature name on the y-axis
axis_titles = dict(xaxis_title="Trimmed Variance [$]", yaxis_title="Feature")
fig = px.bar(
    x=top_ten_trim_var,
    y=top_ten_trim_var.index,
    title="Small Business Owners: High Variance Features",
).update_layout(**axis_titles)
# Don't delete the code below 👇
export_opts = dict(scale=1, height=500, width=700)
fig.write_image("images/6-5-10.png", **export_opts)

fig.show()

In [None]:
# `top_ten_trim_var` is sorted ascending, so its last five index labels are
# the five features with the largest trimmed variance
high_var_cols = top_ten_trim_var.index[-5:].tolist()
high_var_cols

## Split

In [None]:
# Feature matrix: the high-variance columns of the small-business subset
X = df_small_biz.loc[:, high_var_cols]
print("X shape:", X.shape)

# Build Model

## Iterate

In [None]:
n_clusters = range(2, 13)
inertia_errors = []
silhouette_scores = []

# Train one StandardScaler + KMeans pipeline per candidate `k` and record its
# inertia and silhouette score.
for k in n_clusters:
    model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters=k, random_state=42)
    )
    model.fit(X)
    kmeans = model.named_steps["kmeans"]
    inertia_errors.append(kmeans.inertia_)
    # KMeans was fitted on the standardized features, so the silhouette score
    # must be measured in that same space. Scoring against raw `X` would use
    # distances the clustering never saw.
    X_scaled = model.named_steps["standardscaler"].transform(X)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))


print("Inertia:", inertia_errors[:11])
print()
print("Silhouette Scores:", silhouette_scores[:3])

In [None]:
# Create line plot of `inertia_errors` vs `n_clusters`
axis_titles = dict(xaxis_title="Number of Clusters", yaxis_title="Inertia")
fig = px.line(
    x=n_clusters,
    y=inertia_errors,
    title="K-Means Model: Inertia vs Number of Clusters",
).update_layout(**axis_titles)
# Don't delete the code below 👇
export_opts = dict(scale=1, height=500, width=700)
fig.write_image("images/6-5-14.png", **export_opts)

fig.show()

In [None]:
# Create a line plot of `silhouette_scores` vs `n_clusters`
fig = px.line(
    x=n_clusters,
    y=silhouette_scores,
    # The title names the metric actually plotted (silhouette score, not inertia)
    title="K-Means Model: Silhouette Score vs Number of Clusters"
)
fig.update_layout(xaxis_title="Number of Clusters", yaxis_title="Silhouette Score")
# Don't delete the code below 👇
fig.write_image("images/6-5-15.png", scale=1, height=500, width=700)

fig.show()

# Communicate

In [None]:
# No earlier cell defines `final_model`, so build and fit it here to keep the
# notebook runnable from a fresh kernel. The number of clusters is the
# candidate `k` with the highest silhouette score from the search above.
# NOTE(review): override `best_k` if the inertia elbow suggests a different k.
best_k = n_clusters[silhouette_scores.index(max(silhouette_scores))]
final_model = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=best_k, random_state=42)
)
final_model.fit(X)

# Cluster assignment for each row of `X`, then the per-cluster feature means
labels = final_model.named_steps["kmeans"].labels_
xgb = X.groupby(labels).mean()
xgb

In [None]:
# Create side-by-side bar chart of `xgb`: one group of bars per cluster
axis_titles = dict(xaxis_title="Cluster", yaxis_title="Value [$]")
fig = px.bar(
    data_frame=xgb,
    barmode="group",
    title="Small Business Owner Finances by Cluster",
).update_layout(**axis_titles)
# Don't delete the code below 👇
export_opts = dict(scale=1, height=500, width=700)
fig.write_image("images/6-5-18.png", **export_opts)

fig.show()

In [None]:
# Column names for the two principal components
pc_names = ["PC1", "PC2"]

# Instantiate a transformer with one component per name
pca = PCA(n_components=len(pc_names), random_state=42)

# Fit on `X` and project it onto the components
X_t = pca.fit_transform(X)

# Wrap the projected array in a DataFrame
X_pca = pd.DataFrame(data=X_t, columns=pc_names)

print("X_pca shape:", X_pca.shape)
X_pca.head()

In [None]:
# Create scatter plot of `PC2` vs `PC1`
fig = px.scatter(
    data_frame=X_pca,
    x="PC1",
    y="PC2",
    # Cluster ids are categories, not quantities. Casting them to strings
    # makes plotly use discrete colors instead of a continuous color scale.
    color=labels.astype(str),
    title="PCA Representation of Clusters"
)
fig.update_layout(xaxis_title="PCA1", yaxis_title="PCA2")
# Don't delete the code below 👇
fig.write_image("images/6-5-20.png", scale=1, height=500, width=700)

fig.show()