# Clustering International Flight Routes from San Francisco Airport

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import csv

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)


## Load Data

In [None]:
# Load the SFO air-travel export into a DataFrame and preview the first rows.
flights_csv = '../input/air-travel-data-set-of-sans-francisco-airport/air travel dataset.xlsx - Air travel data.csv'
df = pd.read_csv(flights_csv)
df.head()

## Data Cleansing
Remove rows that have missing values in the coordinate columns (`Geometry Coordinates 1 0` and `Geometry Coordinates 1 1`).

In [None]:
# Drop, in place, every row where either coordinate column is missing.
coordinate_columns = ['Geometry Coordinates 1 0', 'Geometry Coordinates 1 1']
df.dropna(subset=coordinate_columns, how='any', axis=0, inplace=True)

## Data Enrichment
Add more information to the data, e.g. split each flight route into its origin and destination, and remove flights inbound to SFO.

In [None]:
# Load the airport lookup table; its "IATA" and "Country" columns are used
# later to find the country of each airport.
airports_csv = '../input/airport-codes/airports.csv'
df_iata = pd.read_csv(airports_csv)
df_iata.head()

Split the `Route` value (e.g. `SFO-NRT`) into two columns: `ff_from` (`SFO`) and `ff_to` (`NRT`).

In [None]:
# Split "AAA-BBB" route strings into origin (ff_from) and destination (ff_to).
# The vectorised str.split replaces a per-row apply(pd.Series). n=1 caps the
# result at two parts, so a route containing an extra "-" can no longer
# produce a third column and break the two-column assignment.
route_parts = df['Route'].astype(str).str.split('-', n=1, expand=True)
# Guarantee exactly two columns even if no route contains a "-".
route_parts = route_parts.reindex(columns=[0, 1])
df[['ff_from', 'ff_to']] = route_parts

In [None]:
# The combined Route column is redundant once it has been split.
df.drop(columns='Route', inplace=True)
df.head()

In [None]:
# Keep only the columns used downstream: the two coordinate columns, the
# actual timestamp, and the origin/destination airport codes.
feature_columns = [
    'Geometry Coordinates 1 0',
    'Geometry Coordinates 1 1',
    'Properties Flysfo Actual Timestamp',
    'ff_from',
    'ff_to',
]
x = df[feature_columns]
x.head()

Hapus flight data yang bukan dari SFO dan memiliki tujuan SFO. Menyisakan hanya data penerbangan dari SFO dan bukan ke SFO.

In [None]:
# Keep flights that depart from SFO and do not also arrive at SFO.
departs_sfo = x['ff_from'] == 'SFO'
arrives_elsewhere = x['ff_to'] != 'SFO'
x = x[departs_sfo & arrives_elsewhere]
x.info()

Hapus flight yang bukan penerbangan internasional.

In [None]:
# Build a one-row-per-code lookup from IATA code to country. Without the
# de-duplication, a repeated IATA code in the airports table would silently
# multiply the matching flight rows when joined.
country_by_iata = (
    df_iata.dropna(subset=['IATA'])
    .drop_duplicates(subset='IATA')
    .set_index('IATA')['Country']
)

# Attach the origin and destination countries; codes missing from the lookup
# become NaN, exactly as they did with the previous left merge.
x = x.assign(
    ff_from_country=x['ff_from'].map(country_by_iata),
    ff_to_country=x['ff_to'].map(country_by_iata),
)

# flight_type is 1 for a domestic flight (same country at both ends) and 0
# otherwise. NaN never compares equal, so unknown airports get 0.
x['flight_type'] = (x['ff_to_country'] == x['ff_from_country']).astype(int)

In [None]:
# Discard domestic flights (flight_type == 1), then remove the helper columns
# that were only needed to classify the flights.
helper_columns = ['ff_from', 'ff_from_country', 'flight_type', 'ff_to']
x = x[x['flight_type'] != 1].drop(columns=helper_columns)
x.head()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the filtered frame.
x.info()

Membersihkan data tersebut menyisakan kita dengan data penerbangan internasional dari SFO sebanyak 6256 penerbangan yang terjadi selama bulan Maret 2020

## The Elbow Method
https://skillplus.web.id/elbow-method/

Elbow method adalah metoda yang sering dipakai untuk menentukan jumlah cluster yang akan digunakan pada k-means clustering.

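Seperti yang sudah dibahas sebelumnya, k-means clustering meminimumkan jarak antara setiap data point dan centroid cluster-nya serta, secara tidak langsung, memaksimumkan jarak antar-centroid. Kerapatan di dalam cluster diukur dengan within-cluster sum of squares (WCSS), yaitu jumlah kuadrat jarak setiap data point ke centroid cluster-nya.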

Tujuannya adalah mendapatkan WCSS sekecil mungkin dengan jumlah cluster yang tetap kecil agar hasilnya masih bisa diinterpretasikan.

1. WCSS = 0, berarti semua data point berada pada cluster yang berbeda, contoh 6 data points, jumlah cluster 6. Tidak terjadi clustering alias clustering tidak berguna.
2. WCSS maksimum (sama dengan total sum of squares), berarti semua data point berada dalam satu cluster. Sama seperti di atas, clustering tidak berguna.

In [None]:
# Replace each destination country name with an integer label.
enc = LabelEncoder()
x['ff_to_country'] = enc.fit_transform(x['ff_to_country'])
x.head()

In [None]:
# The destination country is not used as a clustering feature; drop it.
x.drop(columns='ff_to_country', inplace=True)

Mengubah waktu keberangkatan menjadi jam desimal dalam rentang 0–24 (misalnya 13:30 menjadi 13,5).

In [None]:
# Convert the Unix timestamp (seconds) into a fractional hour of day, e.g.
# 13:30 -> 13.5. The whole column is parsed once instead of parsing every
# row twice inside a row-wise apply; the arithmetic is unchanged.
# NOTE(review): epoch seconds parse to UTC wall-clock time, so these hours
# are presumably UTC rather than San Francisco local time - confirm that is
# the intended meaning of "departure hour".
departure_time = pd.to_datetime(
    x['Properties Flysfo Actual Timestamp'], unit='s', origin='unix'
)
x['flight_hour'] = (departure_time.dt.hour * 60 + departure_time.dt.minute) / 60
del x['Properties Flysfo Actual Timestamp']
x.head()

In [None]:
from geopy import distance

def getJarak(lat, lon):
    """Return the distance in kilometres between SFO and the given point.

    NOTE: the parameter names are swapped relative to how the values are
    used. The SFO reference below is a (latitude, longitude) pair and the
    target point is assembled as ``(lon, lat)``, so callers must pass the
    longitude first and the latitude second. The names are kept unchanged
    so existing calls continue to work.

    Args:
        lat: Used as the point's longitude, in decimal degrees.
        lon: Used as the point's latitude, in decimal degrees.

    Returns:
        The distance computed by ``geopy.distance.distance``, in km.
    """
    sfo_lat_lon = (37.61799, -122.370943)
    target_lat_lon = (lon, lat)
    return distance.distance(sfo_lat_lon, target_lat_lon).km

In [None]:
# Sanity check: distance from SFO to longitude -99.080804, latitude 19.431302
# (presumably Mexico City). getJarak takes the longitude first even though
# its first parameter is named `lat`.
print(getJarak(-99.080804, 19.431302))

In [None]:
# Distance from SFO for every flight, in km. The coordinate columns are
# passed to getJarak in the order (column "1 0", column "1 1").
x['distance'] = [
    getJarak(coord_0, coord_1)
    for coord_0, coord_1 in zip(
        x['Geometry Coordinates 1 0'], x['Geometry Coordinates 1 1']
    )
]

In [None]:
# Preview the frame with the new distance column.
x.head()

In [None]:
# The raw coordinates are no longer needed once the distance is computed.
x.drop(
    columns=['Geometry Coordinates 1 0', 'Geometry Coordinates 1 1'],
    inplace=True,
)

In [None]:
# Preview the final feature frame before converting it to an array.
x.head()

In [None]:
# Convert the feature frame to a NumPy array for clustering. The remaining
# columns are, in order, flight_hour (column 0) and distance (column 1).
x = x.to_numpy()

In [None]:
# Elbow method: fit KMeans for k = 1..9 and record the within-cluster sum of
# squares (inertia_) for each k. The models use the same init and
# random_state as the final model so the curve is reproducible between runs.
number_of_cluster = range(1, 10)

wcss = [
    KMeans(n_clusters=k, init='k-means++', random_state=42).fit(x).inertia_
    for k in number_of_cluster
]

plt.plot(number_of_cluster, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Score')
plt.show()

In [None]:
# Fit the final 3-cluster model and keep the cluster label of every flight.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42).fit(x)
y_kmeans = kmeans.labels_

In [None]:
# Scatter plot of the clustered flights: departure hour (column 0 of `x`) on
# the horizontal axis and distance from SFO (column 1) on the vertical axis.
plt.figure(figsize=(15, 7))

centroids = kmeans.cluster_centers_

# KMeans numbers its clusters arbitrarily, so cluster 0 is not guaranteed to
# be the long-range one. Rank the clusters by the distance coordinate of
# their centroid (ascending) and pair them with the range labels in order.
clusters_by_distance = np.argsort(centroids[:, 1])
range_styles = [
    ('green', 'Short Range'),
    ('blue', 'Medium Range'),
    ('yellow', 'Long Range'),
]
for cluster_id, (colour, label) in zip(clusters_by_distance, range_styles):
    members = x[y_kmeans == cluster_id]
    # Pass the vectors by keyword: seaborn >= 0.12 no longer accepts
    # positional x/y arguments.
    sns.scatterplot(x=members[:, 0], y=members[:, 1], color=colour,
                    label=label, s=50)

sns.scatterplot(x=centroids[:, 0], y=centroids[:, 1], color='red',
                label='Centroids', s=300, marker=',')
plt.grid(False)
plt.title('Clusters of flights')
plt.ylabel('Jarak')
plt.xlabel('Jam Keberangkatan')
plt.xlim(0, 24)
plt.legend()
plt.show()