In [1]:
import os, sys

import numpy as np
import pandas as pd

from os.path import join

from library import SBKMeans, ErrorChecker
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

In [2]:
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
need_normalization = False
need_standardization = True
need_pca = True
filename = 'Air_Traffic_Passenger_Statistics.csv'
datapath = join(data_dir, filename)
num_comps = 2
num_clusters = 5
num_iterations = 50

In [3]:
df = pd.read_csv(datapath, low_memory=False)

In [4]:
print(len(df))

15007


In [5]:
print(df.head())

   Activity Period Operating Airline Operating Airline IATA Code  \
0           200507      ATA Airlines                          TZ   
1           200507      ATA Airlines                          TZ   
2           200507      ATA Airlines                          TZ   
3           200507       Air Canada                           AC   
4           200507       Air Canada                           AC   

  Published Airline Published Airline IATA Code    GEO Summary GEO Region  \
0      ATA Airlines                          TZ       Domestic         US   
1      ATA Airlines                          TZ       Domestic         US   
2      ATA Airlines                          TZ       Domestic         US   
3       Air Canada                           AC  International     Canada   
4       Air Canada                           AC  International     Canada   

  Activity Type Code Price Category Code    Terminal Boarding Area  \
0           Deplaned            Low Fare  Terminal 1      

In [6]:
print(df.dtypes)

Activity Period                 int64
Operating Airline              object
Operating Airline IATA Code    object
Published Airline              object
Published Airline IATA Code    object
GEO Summary                    object
GEO Region                     object
Activity Type Code             object
Price Category Code            object
Terminal                       object
Boarding Area                  object
Passenger Count                 int64
Adjusted Activity Type Code    object
Adjusted Passenger Count        int64
Year                            int64
Month                          object
dtype: object


In [7]:
columns = df.columns.tolist()
cols = columns
# cols = columns[1:]
# cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Fare']
# cols = columns[:-1]
cols = ['Passenger Count', 'Adjusted Passenger Count']
# cols = columns[1:]
df = df[cols]
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)
df = df.replace([-np.inf, np.inf], np.nan)
df = df.dropna()
df = df.loc[:, (df != 0).any(axis=0)]
print(len(df))
print(df.dtypes)

15007
Passenger Count             int64
Adjusted Passenger Count    int64
Year                        int64
dtype: object


In [8]:
df = df.sample(
    frac=1.0,
    random_state=1,
)
df = df.reset_index(drop=True)

In [9]:
if need_normalization is True:
    normalizer = StandardScaler()
    tmp = normalizer.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)
elif need_standardization is True:
    scaler = MinMaxScaler()
    tmp = scaler.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)

In [10]:
X = df
if need_pca is True:
    pca = PCA(
        n_components=num_comps,
        svd_solver='auto',
    )
    X = pca.fit_transform(X)
else:
    X = X.to_numpy()

In [11]:
for x in X:
    for a in x:
        if np.isinf(a):
            print(x)

In [12]:
data = X
print(data[:5])
print(data.shape)

[[ 0.23605488 -0.05504286]
 [ 0.23622824 -0.04967031]
 [ 0.14473161 -0.06641   ]
 [-0.12742121 -0.04421066]
 [ 0.17152318  0.7638784 ]]
(15007, 2)


In [13]:
sbkmeans = SBKMeans(
    n_clusters=num_clusters,
    n_iters=num_iterations,
)

In [14]:
centers = sbkmeans.fit(data)

Initial centers:
[[-0.12688613 -0.02762816]
 [ 0.23707562 -0.02340951]
 [ 0.05980131  0.12034121]
 [-0.29974314  0.25301764]
 [-0.03632888 -0.03999845]]


In [15]:
print(sbkmeans.centers)

[[-0.28478301 -0.02232993]
 [ 0.25901423 -0.02684961]
 [ 0.15709621  0.38476194]
 [-0.15977654  0.22904207]
 [-0.03654413 -0.046668  ]]


In [16]:
labels = sbkmeans.predict(data)

In [17]:
print(labels[:5])

[1, 1, 1, 4, 2]


In [18]:
error_checker = ErrorChecker(
    X=data,
    centers=centers,
    labels=labels,
)

In [19]:
dist_total = error_checker.potential_function()

In [20]:
print(dist_total)

13333.200489232195


In [21]:
kmeans = KMeans(
    n_clusters=num_clusters,
    max_iter=num_iterations,
    init='k-means++',
    algorithm='auto',
#     n_init=50,
)

In [22]:
labels = kmeans.fit_predict(data)

In [23]:
error_checker = ErrorChecker(
    data,
    kmeans.cluster_centers_,
    kmeans.labels_,
)

In [24]:
dist_total = error_checker.potential_function()

In [25]:
print(dist_total)

19274.475970690422
