In [1]:
import io
import os

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Display the installed SageMaker SDK version
sagemaker.__version__

'1.72.0'

In [2]:
# DATA: download a CSV object from an S3 bucket and load it into a DataFrame

s3_client = boto3.client('s3')
bucket_name='aws-ml-blog-sagemaker-census-segmentation'

# List the objects in the bucket. The response only carries a 'Contents'
# entry when at least one object was returned, so default to an empty list
# instead of failing with a KeyError.
obj_list = s3_client.list_objects(Bucket=bucket_name)
files = [contents['Key'] for contents in obj_list.get('Contents', [])]

# Fail early with a readable message rather than an IndexError on files[0].
if not files:
    raise RuntimeError("No objects found in S3 bucket '{}'".format(bucket_name))

# NOTE(review): assumes the first listed key is the census CSV — confirm.
file_name = files[0]

# Download the object and buffer its bytes in memory so pandas can parse them.
data_object = s3_client.get_object(Bucket=bucket_name, Key=file_name)
data_body = data_object["Body"].read()
data_stream = io.BytesIO(data_body)

# Parse the CSV (first row is the header) and preview the first rows.
counties_df = pd.read_csv(data_stream, header=0, delimiter=",")
counties_df.head()

Unnamed: 0,CensusId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001,Alabama,Autauga,55221,26745,28476,2.6,75.8,18.5,0.4,...,0.5,1.3,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6
1,1003,Alabama,Baldwin,195121,95314,99807,4.5,83.1,9.5,0.6,...,1.0,1.4,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5
2,1005,Alabama,Barbour,26932,14497,12435,4.6,46.2,46.7,0.2,...,1.8,1.5,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6
3,1007,Alabama,Bibb,22604,12073,10531,2.2,74.5,21.4,0.4,...,0.6,1.5,0.7,28.8,8294,76.8,16.1,6.7,0.4,8.3
4,1009,Alabama,Blount,57710,28512,29198,8.6,87.9,1.5,0.3,...,0.9,0.4,2.3,34.9,22189,82.0,13.5,4.2,0.4,7.7


In [3]:
# Preprocessing I

# Keep only the rows that have no missing values.
complete_rows = counties_df.dropna()

# Build one "State-County" label per remaining row to use as the row index.
county_labels = complete_rows["State"] + "-" + complete_rows["County"]

# Drop the first three columns by position, then attach the labels as index.
clean_counties_df = complete_rows.iloc[:, 3:].copy()
clean_counties_df.index = county_labels

clean_counties_df.head(3)

Unnamed: 0,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,Citizen,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
Alabama-Autauga,55221,26745,28476,2.6,75.8,18.5,0.4,1.0,0.0,40725,...,0.5,1.3,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6
Alabama-Baldwin,195121,95314,99807,4.5,83.1,9.5,0.6,0.7,0.0,147695,...,1.0,1.4,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5
Alabama-Barbour,26932,14497,12435,4.6,46.2,46.7,0.2,0.4,0.0,20714,...,1.8,1.5,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6


In [4]:
# Preprocessing II: min-max scale every feature column

# The scaler needs numeric input (rows with N/As were removed before).
# Accept any numeric dtype and name the offending columns if the check fails.
non_numeric_columns = clean_counties_df.select_dtypes(exclude='number').columns.tolist()
assert not non_numeric_columns, "Non-numeric feature columns: {}".format(non_numeric_columns)

scaler = MinMaxScaler()

# fit_transform fits the scaler on the data and returns the scaled values as
# a plain array, so re-wrap them with the original row index and column names.
counties_scaled = pd.DataFrame(scaler.fit_transform(clean_counties_df),
                               index=clean_counties_df.index,
                               columns=clean_counties_df.columns)
counties_scaled.head(3)

Unnamed: 0,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,Citizen,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
Alabama-Autauga,0.005475,0.005381,0.005566,0.026026,0.759519,0.215367,0.004343,0.024038,0.0,0.006702,...,0.007022,0.033248,0.048387,0.55243,0.005139,0.75,0.25,0.150273,0.0,0.208219
Alabama-Baldwin,0.019411,0.019246,0.019572,0.045045,0.832665,0.110594,0.006515,0.016827,0.0,0.024393,...,0.014045,0.035806,0.104839,0.549872,0.018507,0.884354,0.107616,0.15847,0.040816,0.205479
Alabama-Barbour,0.002656,0.002904,0.002416,0.046046,0.462926,0.543655,0.002172,0.009615,0.0,0.003393,...,0.025281,0.038363,0.043011,0.491049,0.001819,0.719388,0.248344,0.199454,0.010204,0.482192


In [6]:
# Turn the scaled DataFrame into a float32 NumPy array
train_data_np = counties_scaled.to_numpy(dtype=np.float32)
train_data_np

array([[0.00547453, 0.00538076, 0.00556558, ..., 0.15027322, 0.        ,
        0.20821919],
       [0.0194114 , 0.01924648, 0.01957154, ..., 0.15846995, 0.04081633,
        0.20547946],
       [0.00265637, 0.00290402, 0.00241591, ..., 0.19945355, 0.01020408,
        0.48219177],
       ...,
       [0.00243253, 0.00241648, 0.00244811, ..., 0.3579235 , 0.02040816,
        0.70958906],
       [0.00358752, 0.0035412 , 0.0036325 , ..., 0.19945355, 0.        ,
        0.6657534 ],
       [0.00390581, 0.0038241 , 0.00398515, ..., 0.12021858, 0.        ,
        0.74246573]], dtype=float32)

In [39]:
# Dimensions of the training array as (rows, columns)
train_data_np.shape

(3218, 34)

In [40]:
# Fit a PCA that keeps one component fewer than the number of input features.
n_features = train_data_np.shape[1]
pca = PCA(n_components=n_features - 1)

# fit() returns the estimator itself, so fit_pca and pca are the same object.
fit_pca = pca.fit(train_data_np)

# Project the training data onto the fitted components.
reduced = fit_pca.transform(train_data_np)

reduced.shape

(3218, 33)

In [18]:
# Wrap the fitted PCA's component matrix (v) and singular values (s) in
# DataFrames so they can be sliced and displayed as tables.
v = pd.DataFrame(fit_pca.components_)
s = pd.DataFrame(fit_pca.singular_values_)

In [33]:
# How many of the first principal components to inspect
n_principal_components = 5

# Show the first n_principal_components rows of the singular-value table
s.head(n_principal_components)

Unnamed: 0,0
0,19.592178
1,13.03598
2,11.718249
3,10.180058
4,7.991314


In [38]:
# Summed explained-variance ratio of the first n_principal_components components
np.sum(fit_pca.explained_variance_ratio_[:n_principal_components])

0.7179824