# Dask Implementation

In [1]:
import numpy as np
import dask
import dask.dataframe as dd
from dask.distributed import Client
import joblib
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVC

SEED = 42

In [2]:
client = Client('172.31.88.97:8786')
client = client.restart()
print(client)

<Client: 'tcp://172.31.88.97:8786' processes=0 threads=0, memory=0 B>


## Read data

In [3]:
full_data = dd.read_csv('/home/ubuntu/data/cleaned_data.csv', sample=2**30, assume_missing=False)

In [7]:
print(len(full_data.index))

39435236


In [8]:
full_data.head(5)

Unnamed: 0,legId,searchDate,flightDate,destinationAirport,baseFare,seatsRemaining,totalTravelDistance,durationSeconds,dateDelta
0,222cfd6d1b0d5732602a3e82ad7730c3,0,45,BOS,65.48,4,947,17400,45
1,71cf5163f5efbd007c87aeef85e0c2cc,0,45,BOS,161.86,9,947,9420,45
2,141ef83862caac6be402158433b55c1f,0,45,BOS,161.86,2,947,9660,45
3,5921ef14d28a822fc25eb9d7879134a9,0,45,BOS,158.14,3,956,21780,45
4,3ce43c14cd65f1d4303a4b90093f328c,0,45,BOS,151.63,9,947,29580,45


### Create different sized dataframes to test speed and size scaling

In [4]:
small_data = full_data.sample(frac=0.4, random_state=SEED)

In [9]:
medium_data = full_data.sample(frac=0.5, random_state=SEED)

In [4]:
large_data = full_data.sample(frac=0.7, random_state=SEED)

In [7]:
# Old data format issues
#data = data.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1)

In [8]:
# Old data management code
#data['dateDelta'] = data['flightDate'] - data['searchDate']

### Split into train and test DO WE NEED SEEDING HERE FOR REPLICATION ACROSS NOTEBOOKS?

In [5]:
# Split data into X and y
X_reg = np.asarray(small_data[['dateDelta', 'seatsRemaining', 'totalTravelDistance',
                  'durationSeconds']])

y_reg = np.asarray(small_data['baseFare'])

X_class = np.asarray(small_data[['dateDelta', 'seatsRemaining', 'totalTravelDistance',
                  'durationSeconds', 'baseFare']])

y_class = np.asarray(small_data['destinationAirport'])

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=5110)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=5110)

## KNN Model

### Timing Code

In [6]:
%%timeit
#KNN model
with joblib.parallel_backend('dask'):
    knn_model = KNeighborsRegressor(n_neighbors=5)
    knn_model.fit(X_train_reg, y_train_reg)
    acc = knn_model.score(X_test_reg, y_test_reg)

TypeError: 'CancelledError' object is not iterable

### Accuracy Code

In [11]:
# Accuracy code
#KNN model
with joblib.parallel_backend('dask'):
    knn_model = KNeighborsRegressor(n_neighbors=5)
    knn_model.fit(X_train_reg, y_train_reg)
    acc = knn_model.score(X_test_reg, y_test_reg)

In [9]:
acc

0.6335177287143425

## K-Means Model

### Timing Code

In [18]:
%%timeit
#K-Means
with joblib.parallel_backend('dask'):
    kmeans_model = KMeans(n_clusters=2, random_state=5110, n_init="auto")
    kmeans_model.fit(X_train_reg)

5.07 s ± 341 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Accuracy Code for K-MEANS N/A

In [25]:
# Hehe

## Random Forest Model

### Timing Code

In [19]:
%%timeit
#Random Forest
with joblib.parallel_backend('dask'):
    rf_model = RandomForestRegressor(max_depth=2, random_state=5110)
    rf_model.fit(X_train_reg, y_train_reg)
   
    y_pred = rf_model.predict(X_test_reg)
    mse = mean_squared_error(y_test_reg, y_pred)

2min 34s ± 2.09 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Accuracy Code

In [28]:
#Random Forest
with joblib.parallel_backend('dask'):
    rf_model = RandomForestRegressor(max_depth=2, random_state=5110)
    rf_model.fit(X_train_reg, y_train_reg)
   
    y_pred = rf_model.predict(X_test_reg)
    mse = mean_squared_error(y_test_reg, y_pred)

In [29]:
np.sqrt(mse)

155.4640757066198

In [16]:
data.baseFare.describe()

count    500000.000000
mean        322.205953
std         214.183514
min          18.020000
25%         180.470000
50%         292.090000
75%         413.020000
max        2968.370000
Name: baseFare, dtype: float64

In [None]:
#SVM
with joblib.parallel_backend('dask'):
    svm_model = LinearSVC(random_state=5110, multi_class='ovr')
    svm_model.fit(X_train_class, y_train_class)
    acc = svm_model.score(X_test_class, y_test_class)

