# Unsupervised Learning with Scikit-Learn
* Week 7 - Day 3

> We will explore k-means with scikit-learn using our census data. We first load the data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
census = pd.read_csv('../data/acs2015_county_data.csv')
census.head()

Unnamed: 0,CensusId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001,Alabama,Autauga,55221,26745,28476,2.6,75.8,18.5,0.4,...,0.5,1.3,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6
1,1003,Alabama,Baldwin,195121,95314,99807,4.5,83.1,9.5,0.6,...,1.0,1.4,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5
2,1005,Alabama,Barbour,26932,14497,12435,4.6,46.2,46.7,0.2,...,1.8,1.5,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6
3,1007,Alabama,Bibb,22604,12073,10531,2.2,74.5,21.4,0.4,...,0.6,1.5,0.7,28.8,8294,76.8,16.1,6.7,0.4,8.3
4,1009,Alabama,Blount,57710,28512,29198,8.6,87.9,1.5,0.3,...,0.9,0.4,2.3,34.9,22189,82.0,13.5,4.2,0.4,7.7


In [3]:
census.describe()

Unnamed: 0,CensusId,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
count,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,...,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0,3220.0
mean,31393.60528,99409.35,48896.94,50512.41,11.011522,75.428789,8.665497,1.723509,1.229068,0.082733,...,3.323509,1.612733,4.63177,23.278758,45593.52,74.219348,17.56087,7.931801,0.288106,8.094441
std,16292.078954,319305.5,156681.3,162662.0,19.24138,22.93289,14.279122,7.253115,2.633079,0.734931,...,3.756096,1.670988,3.178772,5.600466,149699.5,7.863188,6.510354,3.914974,0.455137,4.096114
min,1001.0,85.0,42.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.9,62.0,25.0,5.8,0.0,0.0,0.0
25%,19032.5,11218.0,5637.25,5572.0,1.9,64.1,0.5,0.1,0.2,0.0,...,1.4,0.9,2.7,19.5,4550.75,70.5,13.1,5.4,0.1,5.5
50%,30024.0,26035.0,12932.0,13057.0,3.9,84.1,1.9,0.3,0.5,0.0,...,2.4,1.3,3.9,23.0,10508.0,75.7,16.2,6.9,0.2,7.6
75%,46105.5,66430.5,32992.75,33487.5,9.825,93.2,9.6,0.6,1.2,0.0,...,4.0,1.9,5.6,26.8,28632.75,79.7,20.5,9.4,0.3,9.9
max,72153.0,10038390.0,4945351.0,5093037.0,99.9,99.8,85.9,92.1,41.6,35.3,...,71.2,39.1,37.2,44.0,4635465.0,88.3,66.2,36.6,9.8,36.5


> Before using our algorithm, we need to do some munging. Our first step should be to check for missing data and based on the amount of missing data decide on a strategy.

In [4]:
# Finding missing values
census.isnull().sum()

CensusId           0
State              0
County             0
TotalPop           0
Men                0
Women              0
Hispanic           0
White              0
Black              0
Native             0
Asian              0
Pacific            0
Citizen            0
Income             1
IncomeErr          1
IncomePerCap       0
IncomePerCapErr    0
Poverty            0
ChildPoverty       1
Professional       0
Service            0
Office             0
Construction       0
Production         0
Drive              0
Carpool            0
Transit            0
Walk               0
OtherTransp        0
WorkAtHome         0
MeanCommute        0
Employed           0
PrivateWork        0
PublicWork         0
SelfEmployed       0
FamilyWork         0
Unemployment       0
dtype: int64

In [5]:
#census_missing.info()

In [6]:
#census_missing.select_dtypes(exclude='number').columns

> There are only a few columns with missing data and each one of them does not have more than one missing observation. Therefore, the simplest strategy would be to remove the missing data.

In [7]:
census_missing = census.dropna()
census_missing

Unnamed: 0,CensusId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001,Alabama,Autauga,55221,26745,28476,2.6,75.8,18.5,0.4,...,0.5,1.3,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6
1,1003,Alabama,Baldwin,195121,95314,99807,4.5,83.1,9.5,0.6,...,1.0,1.4,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5
2,1005,Alabama,Barbour,26932,14497,12435,4.6,46.2,46.7,0.2,...,1.8,1.5,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6
3,1007,Alabama,Bibb,22604,12073,10531,2.2,74.5,21.4,0.4,...,0.6,1.5,0.7,28.8,8294,76.8,16.1,6.7,0.4,8.3
4,1009,Alabama,Blount,57710,28512,29198,8.6,87.9,1.5,0.3,...,0.9,0.4,2.3,34.9,22189,82.0,13.5,4.2,0.4,7.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,72145,Puerto Rico,Vega Baja,56858,27379,29479,96.4,3.4,0.1,0.0,...,1.2,1.3,0.3,32.0,13660,78.3,17.6,4.1,0.0,15.2
3216,72147,Puerto Rico,Vieques,9130,4585,4545,96.7,2.9,0.0,0.0,...,10.8,0.0,1.4,14.0,2860,44.5,41.6,13.6,0.3,12.2
3217,72149,Puerto Rico,Villalba,24685,12086,12599,99.7,0.0,0.0,0.0,...,3.2,0.0,3.3,26.9,6795,59.2,27.5,13.1,0.2,25.9
3218,72151,Puerto Rico,Yabucoa,36279,17648,18631,99.8,0.2,0.0,0.0,...,2.3,2.3,1.5,29.5,8083,65.1,27.6,7.3,0.0,24.3


>Additionally, we should only be clustering using columns that contain actual information about the data. Therefore, we should probably remove the State and County columns. We should also remove the CensusId column because it contains no information about the each county.

In [8]:
census_columns = [ col for col in census.columns.values if col not in ['CensusId', 'State', 'County']]
census_columns

['TotalPop',
 'Men',
 'Women',
 'Hispanic',
 'White',
 'Black',
 'Native',
 'Asian',
 'Pacific',
 'Citizen',
 'Income',
 'IncomeErr',
 'IncomePerCap',
 'IncomePerCapErr',
 'Poverty',
 'ChildPoverty',
 'Professional',
 'Service',
 'Office',
 'Construction',
 'Production',
 'Drive',
 'Carpool',
 'Transit',
 'Walk',
 'OtherTransp',
 'WorkAtHome',
 'MeanCommute',
 'Employed',
 'PrivateWork',
 'PublicWork',
 'SelfEmployed',
 'FamilyWork',
 'Unemployment']

> Now let's import Kmeans from scikit-learn:

In [9]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)

>We define a k-means object with 4 clusters and then fit our data

In [10]:
census_clusters = kmeans.fit(census_missing[census_columns])

In [11]:
census_clusters.cluster_centers_

array([[7.14995000e+05, 3.48954639e+05, 3.66040361e+05, 1.67854430e+01,
        6.02544304e+01, 1.41050633e+01, 3.38607595e-01, 5.71392405e+00,
        1.68354430e-01, 4.95983386e+05, 6.12043038e+04, 8.54474684e+02,
        3.16039367e+04, 3.98056962e+02, 1.41791139e+01, 1.99753165e+01,
        3.92006329e+01, 1.76835443e+01, 2.45822785e+01, 8.00569620e+00,
        1.05272152e+01, 7.64436709e+01, 9.11392405e+00, 5.50126582e+00,
        2.59430380e+00, 1.83607595e+00, 4.50379747e+00, 2.63639241e+01,
        3.41134006e+05, 8.07563291e+01, 1.36443038e+01, 5.46392405e+00,
        1.39240506e-01, 8.22974684e+00],
       [2.59497053e+06, 1.27467168e+06, 1.32029884e+06, 3.28736842e+01,
        3.98684211e+01, 1.44368421e+01, 3.31578947e-01, 9.75789474e+00,
        1.89473684e-01, 1.65020984e+06, 5.85898947e+04, 4.97473684e+02,
        3.08353158e+04, 2.48105263e+02, 1.68368421e+01, 2.35842105e+01,
        3.71842105e+01, 1.95105263e+01, 2.45473684e+01, 8.02631579e+00,
        1.07368421e+01,

> The cluster centers contain the 4 centroids. Since the data contains 34 columns describing each county, each centroid is in a 34 dimensional plane.

> Using fit_predict, we can assign a cluster to each observation and then add this information back to our dataset.

In [12]:
census_missing['Cluster'] =census_clusters.fit_predict(census_missing[census_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
census_missing.head()

Unnamed: 0,CensusId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,Cluster
0,1001,Alabama,Autauga,55221,26745,28476,2.6,75.8,18.5,0.4,...,1.3,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6,0
1,1003,Alabama,Baldwin,195121,95314,99807,4.5,83.1,9.5,0.6,...,1.4,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5,0
2,1005,Alabama,Barbour,26932,14497,12435,4.6,46.2,46.7,0.2,...,1.5,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6,0
3,1007,Alabama,Bibb,22604,12073,10531,2.2,74.5,21.4,0.4,...,1.5,0.7,28.8,8294,76.8,16.1,6.7,0.4,8.3,0
4,1009,Alabama,Blount,57710,28512,29198,8.6,87.9,1.5,0.3,...,0.4,2.3,34.9,22189,82.0,13.5,4.2,0.4,7.7,0


> Let's look at the counts of counties in each cluster:

In [14]:
census_missing.Cluster.value_counts()

0    3040
2     158
1      19
3       1
Name: Cluster, dtype: int64

> The majority of the data is in the first cluster, while cluster 2 has only one obervation.

> Plotting the data will not provide us with a great deal of meaningful information. This is because the data has 34 dimensions. Therefore, creating a two dimensional plot will only capture some of the information and might not show completely separable clusters. However, it is interesting to look at some summary statistics for our clusters.