In [0]:
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth

#Reads file with merged city data along with employment prediction information
#Includes original data, min-max normalized data, and z-scores
combo = pd.read_csv("merged_city_data_normalized_with_employment.csv", encoding = 'ISO-8859-1')
#Keeps a single row per GeoID (the first occurrence); the original index labels are retained
combo = combo.drop_duplicates(subset=['GeoID'])

#Gets the name and GeoID of each city
names = combo[['GeoID', 'Place Name']]

#Gets a list of the affiliate cities
aff = pd.read_csv("Affiliate-City-to-Id2.csv", encoding = 'ISO-8859-1')
#rename() returns a new frame, so the column added below is not written to a slice of the raw file
aff = aff[['Geoid']].rename(columns={'Geoid': 'GeoID'})
#Every row of this file is an affiliate, so flag them all with a scalar.
#(Previously a 160-element Series was used, which left NaN on any row past the 160th.)
aff['Affiliate'] = 'Yes'


In [0]:
#Fetches the min-max normalized data (column positions 27 through 43) from the table and stores into data
#.copy() gives an independent frame, so adding a column below does not write into a
#slice of combo (avoids pandas' SettingWithCopyWarning and view-vs-copy ambiguity)
data = combo.iloc[:,27:44].copy()
data['Employment Rate Prediction min_max_normalized'] = combo['Employment Rate Prediction min_max_normalized']

#Fills NaN values with 0's
#NOTE(review): on min-max scaled features 0 presumably means "minimum observed value" - confirm this imputation is intended
data = data.fillna(0)

#Stores into numpy array (one row per city, in the same row order as combo)
arr = data.values

In [10]:
#Hand-picked bandwidth that creates a good number of clusters
bandwidth = 0.35

#Fits mean-shift on the feature array; fit() hands back the estimator itself
ms = MeanShift(bin_seeding=True, bandwidth=bandwidth).fit(arr)
labels, cluster_centers = ms.labels_, ms.cluster_centers_

#Tallies how many rows were assigned to each cluster label
labels_unique, counts = np.unique(labels, return_counts=True)
n_clusters_ = len(labels_unique)
countClusters = {label: size for label, size in zip(labels_unique, counts)}

print(countClusters)

{0: 1872, 1: 1703, 2: 1299, 3: 114, 4: 14, 5: 14, 6: 32, 7: 35, 8: 3, 9: 1, 10: 8, 11: 1, 12: 29, 13: 1, 14: 20, 15: 79, 16: 9, 17: 10, 18: 31, 19: 7, 20: 12, 21: 1, 22: 116, 23: 3, 24: 36, 25: 8, 26: 2, 27: 121, 28: 1, 29: 1, 30: 1, 31: 128, 32: 1, 33: 2, 34: 15, 35: 41, 36: 1, 37: 11}


In [0]:
#Creates a new dataframe to store the results (one row per city, same order as combo)
res = combo[['GeoID']].copy()

#Adds cluster labels, city names, and affiliate status to dataframe
#labels is in the same row order as combo (arr was built from combo), so it is assigned
#positionally as a plain array. Wrapping it in pd.Series would align its fresh 0..n-1 index
#against combo's index, which has gaps after drop_duplicates, and could attach labels to the
#wrong cities or leave NaN.
res['Cluster'] = labels
res = res.merge(names, on = 'GeoID')
#Left merge keeps every city; non-affiliates get NaN in the Affiliate column
res = res.merge(aff, how = 'left', on = 'GeoID')

#Outputs result as a csv
res.to_csv('clusters.csv')

#Creates a dataframe with all of the cluster centers and outputs as a csv
#Row i is the center of cluster label i; columns are unnamed feature positions
cc = pd.DataFrame(cluster_centers)
cc.to_csv("cluster_center.csv")

In [0]:
def _cluster_city_stats(cluster_label):
    """Return the rows of `combo` for every city that `res` assigns to `cluster_label`.

    The GeoIDs of the matching rows in `res` are left-merged onto `combo`, so the
    result has one row per selected GeoID and carries all of combo's columns.
    """
    geoids = res.loc[res['Cluster'] == cluster_label, ['GeoID']]
    return geoids.merge(combo, on = 'GeoID', how = 'left')

#Gets city statistics for each of the three major clusters (labels 0, 1 and 2)
cluster1 = _cluster_city_stats(0)
cluster2 = _cluster_city_stats(1)
cluster3 = _cluster_city_stats(2)

#Gets means and standard deviations for the cities in each of the three major clusters
#numeric_only=True skips text columns such as 'Place Name'; without it, pandas 2.x raises
#a TypeError, and older versions dropped such columns with a deprecation warning
#NOTE: the Cluster 1 mean column keeps its original header (0), and the existing
#"Standard Dev" spelling is preserved, so the csv layout is unchanged
df3 = pd.DataFrame(cluster1.mean(numeric_only=True))
df3["Cluster 1 Standard Dev"] = cluster1.std(numeric_only=True)
df3["Cluster 2 Mean"] = cluster2.mean(numeric_only=True)
df3["Cluster 2 Standard Deviation"] = cluster2.std(numeric_only=True)
df3["Cluster 3 Mean"] = cluster3.mean(numeric_only=True)
df3["Cluster 3 Standard Deviation"] = cluster3.std(numeric_only=True)

#Outputs cluster statistics as a csv
df3.to_csv('cluster_stats.csv')


In [13]:
#Keeps only the rows flagged as affiliate cities (non-affiliates hold NaN and compare False)
is_affiliate = res['Affiliate'].eq('Yes')
affCluster = res[is_affiliate]

#Number of affiliate cities per cluster label, most populated cluster first
counts = affCluster['Cluster'].value_counts()

#Prints out the number of affiliate cities in each of the clusters
print(counts)
#Prints out cluster assignment for each of the affiliate cities
print(affCluster)


1     98
2     29
15    11
3      7
12     5
24     2
7      2
35     1
20     1
18     1
9      1
6      1
0      1
Name: Cluster, dtype: int64
        GeoID  Cluster                             Place Name Affiliate
55     150000        1                   Mobile city, Alabama       Yes
100    203000        1         Anchorage municipality, Alaska       Yes
168    455000        1                  Phoenix city, Arizona       Yes
194    477000        1                   Tucson city, Arizona       Yes
253    566080        2              Springdale city, Arkansas       Yes
274    602000        1               Anaheim city, California       Yes
458    629000        1          Garden Grove city, California       Yes
460    630000        1              Glendale city, California       Yes
563    644000        1           Los Angeles city, California       Yes
565    644112        1             Los Gatos town, California       Yes
596    648354        1               Modesto city, California  