# Validation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from statsmodels.graphics.gofplots import qqplot
import scipy.stats as stats

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
cdc = pd.read_csv('500_Cities_CDC.csv')

Convert Geolocation to latitude and longitude

In [3]:
lat_lon = cdc['Geolocation'].str.split(', ', expand= True)
lat_lon[0] = lat_lon[0].str.replace('(', '')
lat_lon[1] = lat_lon[1].str.replace(')', '')
lat_lon = lat_lon.astype('double')

In [4]:
cdc['Lat'] = lat_lon[0]
cdc['Lon'] = lat_lon[1]

cdc = cdc.drop(columns=['Geolocation', 'StateAbbr', 'PlaceName', 'PlaceFIPS', 'BINGE_AdjPrev'])

Filter to only the age-adjusted columns

In [5]:
age_cols = list(cdc.filter(like='AdjPrev'))

In [6]:
cdc_age = cdc[age_cols]

Scale the data

In [7]:
scaler = StandardScaler()

cdc_age_scaled = pd.DataFrame(scaler.fit_transform(cdc_age), index = cdc_age.index, columns = cdc_age.columns)

Sample 80% of the data to train the model on

In [8]:
X = cdc_age_scaled.sample(400, random_state = 25)
X_index = X.index
y = cdc_age_scaled.drop(X_index, axis = 0)

Fit the model

In [9]:
k =4 
kmeans_age = KMeans(n_clusters = k)
kmeans_age.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

Add labels to X

In [10]:
X['label'] = kmeans_age.labels_

Predict labels for y

In [12]:
y_labels = kmeans_age.predict(y)

# Fit the model to all of the data

In [13]:
k =4 
kmeans_age_all = KMeans(n_clusters = k)
kmeans_age_all.fit(cdc_age_scaled)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

Add labels

In [14]:
cdc_age_scaled['label'] = kmeans_age_all.labels_

Find the true labels of y 

In [16]:
y_true = cdc_age_scaled.drop(X_index, axis = 0)
y_true_labels = y_true['label']

Since KMeans randomly assigns labels to groups, I need to relabel the X and y labels to match the true labels. To do this, I will match the means of each group.

In [17]:
true_mean = cdc_age_scaled.groupby('label').mean().mean(axis = 1)

label
0    0.359656
1   -0.129224
2    0.805004
3   -0.402923
dtype: float64

In [18]:
X_mean = X.groupby('label').mean().mean(axis = 1)

label
0    0.360174
1   -0.399148
2   -0.120882
3    0.800104
dtype: float64

In [19]:
dist_map = {}
for label in range(len(X_mean)):
    winner = None
    smallest_dist = 9999
    for j in range(len(true_mean)):
        dist = np.abs(X_mean[label] - true_mean[j])
        if dist < smallest_dist:
            winner = j
            smallest_dist = dist
    dist_map[label] = winner  
    

{0: 0, 1: 3, 2: 1, 3: 2}


Assign the correct labels to X and y

In [20]:
X['label'] = X['label'].replace(dist_map)

y_relabeled = [dist_map[i] for i in y_labels]


What percent of labels in X match the true labels?

In [21]:
(y_relabeled == y_true_labels).mean()

1.0

What percent of labels in y match the true labels?

In [22]:
X_test = cdc_age_scaled.loc[X_index]

(X_test['label'] == X['label']).mean()

0.99