## Cluster zip codes using k-means

In [1]:
# clustering
from sklearn.cluster import KMeans
import pandas as pd
from typing import List
import numpy as np

### Filter data

In [2]:
def new_filter_signal(data_path: str, signal_strength: List[int]) -> pd.DataFrame:
    """Build a zip-code-by-station 0/1 matrix from a station coverage CSV.

    The CSV is reduced to the minimum ``signal_strength`` per
    (zipcode, callsign) pair and pivoted so that every callsign becomes a
    column; pairs that do not occur in the file are filled with 0. Cell values
    1-4 are then replaced by 1 when the value is listed in ``signal_strength``
    and by 0 otherwise. Any other cell value is left as it is.

    Rows with a missing callsign are grouped under a NaN column
    (``dropna=False``) so that their zip codes still appear as rows; the NaN
    column itself is discarded.

    Parameters
    ----------
    data_path : str
        Path to a CSV containing at least the columns ``zipcode``,
        ``callsign`` and ``signal_strength``.
    signal_strength : List[int]
        Strength codes (out of 1, 2, 3, 4) that should be mapped to 1.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``zipcode`` (read as ``str``) with one column per
        callsign.
    """
    df = pd.read_csv(
        data_path,
        usecols=['zipcode', 'callsign', 'signal_strength'],
        dtype={'zipcode': str},
    )

    # The string alias 'min' avoids the pandas FutureWarning raised when the
    # Python builtin ``min`` is passed to ``agg``.
    cdf = (
        df.groupby(['zipcode', 'callsign'], dropna=False)
        .agg({'signal_strength': 'min'})
        .unstack(fill_value=0)
    )

    # unstack leaves ('signal_strength', callsign) column tuples; keep only the
    # callsign level as plain labels.
    cdf.columns = cdf.columns.get_level_values(1).tolist()

    # Discard the NaN-callsign column when there is one. Filtering with a mask
    # (rather than ``del cdf[np.nan]``) does not raise KeyError for files in
    # which every row has a callsign.
    cdf = cdf.loc[:, pd.notna(cdf.columns)]

    to_replace = {ss: int(ss in signal_strength) for ss in (1, 2, 3, 4)}
    return cdf.replace(to_replace)

In [3]:
# Input CSV and the signal-strength codes that are mapped to 1 by the filter.
data_path = "./stations.csv"
signal_strength = [1, 2]

# One row per zip code, one column per station callsign.
df = new_filter_signal(data_path=data_path, signal_strength=signal_strength)

In [4]:
# Preview the first ten rows of the filtered zip-code-by-station frame.
df.head(10)

Unnamed: 0_level_0,K04QP-D,K09YZ-D,K10OG-D,K17ED-D,K17GD-D,K17JI-D,K19GH-D,K20DN-D,K20JX-D,K21DO-D,...,WZDX,WZMQ,WZPX-TV,WZRA-CD,WZRB,WZTV,WZVI,WZVN-TV,WZXZ-CD,WZZM
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
603,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
616,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
617,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
print(df.shape)
# Take out rows with all zeroes (zip codes whose row sums to 0).
has_signal = df.sum(axis=1) != 0
# .copy() makes df an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df = df.loc[has_signal].copy()
print(df.shape)

(36094, 2027)
(33790, 2027)


In [6]:
# Build the feature matrix from the station columns, selected by name.
# The previous positional slice (iloc[:, :-1]) dropped the last station column,
# and on a re-run after the label columns were added it would have fed
# k_means_cluster back in as a feature.
feature_frame = df.drop(columns=['k_means_cluster', 'zipcode'], errors='ignore')
X = feature_frame.values

# Fixed seed so the cluster assignment is reproducible across kernel restarts.
model = KMeans(n_clusters=200, random_state=0)
model.fit(X)
yhat = model.predict(X)

In [7]:
# yhat holds one label per row of df, in row order, so it can be assigned directly.
df['k_means_cluster'] = yhat
# Duplicate the index as a regular column so the zip code is stored as data too.
df['zipcode'] = df.index

In [8]:
# Preview the frame again; k_means_cluster and zipcode are now the last two columns.
df.head(10)

Unnamed: 0_level_0,K04QP-D,K09YZ-D,K10OG-D,K17ED-D,K17GD-D,K17JI-D,K19GH-D,K20DN-D,K20JX-D,K21DO-D,...,WZPX-TV,WZRA-CD,WZRB,WZTV,WZVI,WZVN-TV,WZXZ-CD,WZZM,k_means_cluster,zipcode
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,134,501
544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,134,544
601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,601
602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,602
603,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,603
606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,606
610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,610
612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,65,612
616,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,616
617,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,65,617


In [9]:
# Cast the zipcode column to str before the frame is saved.
df["zipcode"] = df["zipcode"].astype(str)

In [12]:
# Persist as an xz-compressed pickle: saving to CSV was causing zipcode to
# come back up as numeric when the file was read again.
df.to_pickle("k_means_200.pkl", compression = "xz")

In [14]:
# Round-trip check: reload the saved pickle and preview its first rows.
pd.read_pickle("k_means_200.pkl", compression = "xz").head()

Unnamed: 0_level_0,K04QP-D,K09YZ-D,K10OG-D,K17ED-D,K17GD-D,K17JI-D,K19GH-D,K20DN-D,K20JX-D,K21DO-D,...,WZPX-TV,WZRA-CD,WZRB,WZTV,WZVI,WZVN-TV,WZXZ-CD,WZZM,k_means_cluster,zipcode
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,134,501
544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,134,544
601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,601
602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,602
603,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,603
