In [60]:
import pandas as pd
import fastparquet

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings
warnings.filterwarnings("ignore")

In [2]:
src_dest_df = pd.read_parquet('src_dest_df.parquet')
src_dest_df = src_dest_df.loc[src_dest_df.cloud_geo_iso1 != 'AWS.ap-east-1'] # removing this from training so that you can use it for testing/validation later 
src_dest_df.timestamp = pd.to_datetime(src_dest_df.timestamp)
src_dest_df.shape
src_dest_df.dtypes
src_dest_df.head()

sorted(src_dest_df.cloud_geo_iso1.unique())
sorted(src_dest_df.cloud_geo_iso2.unique())

aws_region_df = pd.read_pickle('region_geo_lookup_aws.pkl')
aws_region_df.shape
aws_region_df.dtypes
aws_region_df

aws_region_df.region_name.unique()

array(['AWS.us-east-1', 'AWS.us-east-2', 'AWS.us-west-1', 'AWS.us-west-2',
       'AWS.eu-west-1', 'AWS.eu-west-2', 'AWS.eu-west-3',
       'AWS.eu-central-1', 'AWS.sa-east-1', 'AWS.ap-southeast-1',
       'AWS.ap-southeast-2', 'AWS.ap-northeast-1', 'AWS.ap-northeast-2',
       'AWS.ap-south-1', 'AWS.ca-central-1', 'AWS.eu-north-1',
       'AWS.me-south-1', 'AWS.ap-east-1'], dtype=object)

In [3]:
l1 = (src_dest_df.cloud_geo_iso1.unique())

In [4]:
l2 = aws_region_df.region_name.values

In [5]:
for i in l2:
    if i not in l1:
        print (i)

AWS.me-south-1
AWS.ap-east-1


In [6]:
src_dest_df.to_csv("src.csv")

In [7]:
aws_region_df

Unnamed: 0,region_name,cloud_service_provider,city_name,lat,lon,timestamp
0,AWS.us-east-1,AWS,Virginia,38.13,-78.45,2020-03-30
1,AWS.us-east-2,AWS,Ohio,39.96,-83.0,2020-03-30
2,AWS.us-west-1,AWS,California,37.35,-121.96,2020-03-30
3,AWS.us-west-2,AWS,Oregon,46.15,-123.88,2020-03-30
4,AWS.eu-west-1,AWS,Ireland,53.0,-8.0,2020-03-30
5,AWS.eu-west-2,AWS,London,51.0,-0.1,2020-03-30
6,AWS.eu-west-3,AWS,Paris,48.86,2.35,2020-03-30
7,AWS.eu-central-1,AWS,Frankfurt,50.0,8.0,2020-03-30
8,AWS.sa-east-1,AWS,Sao Paulo,-23.34,-46.38,2020-03-30
9,AWS.ap-southeast-1,AWS,Singapore,1.37,103.8,2020-03-30


In [8]:
full_df = aws_region_df.drop("timestamp",axis=1).merge(aws_region_df.drop("timestamp",axis=1),on="cloud_service_provider")

In [9]:
full_df=full_df[full_df["region_name_x"]!=full_df["region_name_y"]]

In [10]:
full_df

Unnamed: 0,region_name_x,cloud_service_provider,city_name_x,lat_x,lon_x,region_name_y,city_name_y,lat_y,lon_y
1,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.us-east-2,Ohio,39.96,-83.0
2,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.us-west-1,California,37.35,-121.96
3,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.us-west-2,Oregon,46.15,-123.88
4,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.eu-west-1,Ireland,53.0,-8.0
5,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.eu-west-2,London,51.0,-0.1
6,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.eu-west-3,Paris,48.86,2.35
7,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.eu-central-1,Frankfurt,50.0,8.0
8,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.sa-east-1,Sao Paulo,-23.34,-46.38
9,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.ap-southeast-1,Singapore,1.37,103.8
10,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.ap-southeast-2,Sydney,-33.86,151.2


In [11]:
from math import sin, cos, sqrt, atan2, radians

# approximate radius of earth in km
R = 6373.0

def find_dist(row):

    lat1 = radians(row["lat_x"])
    lon1 = radians(row["lon_x"])
    lat2 = radians(row["lat_y"])
    lon2 = radians(row["lon_y"])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance


In [12]:
full_df["distance"] = full_df.apply(find_dist,axis=1)

In [13]:
full_df

Unnamed: 0,region_name_x,cloud_service_provider,city_name_x,lat_x,lon_x,region_name_y,city_name_y,lat_y,lon_y,distance
1,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.us-east-2,Ohio,39.96,-83.0,442.558152
2,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.us-west-1,California,37.35,-121.96,3792.425821
3,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.us-west-2,Oregon,46.15,-123.88,3796.95731
4,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.eu-west-1,Ireland,53.0,-8.0,5488.427843
5,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.eu-west-2,London,51.0,-0.1,6069.01336
6,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.eu-west-3,Paris,48.86,2.35,6316.483932
7,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.eu-central-1,Frankfurt,50.0,8.0,6642.097305
8,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.sa-east-1,Sao Paulo,-23.34,-46.38,7613.055101
9,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.ap-southeast-1,Singapore,1.37,103.8,15621.720672
10,AWS.us-east-1,AWS,Virginia,38.13,-78.45,AWS.ap-southeast-2,Sydney,-33.86,151.2,15581.069536


In [14]:
src_dest_df[src_dest_df["cloud_geo_iso1"]=="AWS.ap-east-1"]
src_dest_df[src_dest_df["cloud_geo_iso2"]=="AWS.ap-east-1"]

Unnamed: 0,timestamp,cloud_geo_iso1,cloud_geo_iso2,latency_ms,packet_loss_percent


In [15]:
src_dest_df.columns

Index(['timestamp', 'cloud_geo_iso1', 'cloud_geo_iso2', 'latency_ms', 'packet_loss_percent'], dtype='object')

In [16]:
combined_df = pd.merge(src_dest_df,full_df[["region_name_x","region_name_y","distance"]],right_on = ["region_name_x","region_name_y"]\
        , left_on = ['cloud_geo_iso1', 'cloud_geo_iso2'])\
[['timestamp', 'cloud_geo_iso1', 'cloud_geo_iso2', 'latency_ms',"distance"]]

In [17]:
combined_df

Unnamed: 0,timestamp,cloud_geo_iso1,cloud_geo_iso2,latency_ms,distance
0,2020-01-01 11:00:00+00:00,AWS.eu-west-1,AWS.ap-northeast-2,124.20,9053.553379
1,2020-01-08 04:00:00+00:00,AWS.eu-west-1,AWS.ap-northeast-2,123.75,9053.553379
2,2020-01-02 14:00:00+00:00,AWS.eu-west-1,AWS.ap-northeast-2,123.80,9053.553379
3,2020-02-18 02:00:00+00:00,AWS.eu-west-1,AWS.ap-northeast-2,123.80,9053.553379
4,2020-02-26 07:00:00+00:00,AWS.eu-west-1,AWS.ap-northeast-2,123.00,9053.553379
...,...,...,...,...,...
495010,2020-03-02 22:00:00+00:00,AWS.ap-southeast-2,AWS.us-west-1,71.45,11963.407650
495011,2020-01-01 09:00:00+00:00,AWS.ap-southeast-2,AWS.us-west-1,70.70,11963.407650
495012,2020-01-07 10:00:00+00:00,AWS.ap-southeast-2,AWS.us-west-1,70.15,11963.407650
495013,2020-02-23 18:00:00+00:00,AWS.ap-southeast-2,AWS.us-west-1,70.55,11963.407650


In [18]:
sources  = combined_df["cloud_geo_iso1"].unique()

In [19]:
len(sources)

16

In [20]:
dest = combined_df["cloud_geo_iso2"].unique()

In [21]:
len(dest)

16

In [22]:
#combined_df["cloud_geo_iso1"] = pd.Categorical(combined_df["cloud_geo_iso1"], categories = list(src_dest_df.cloud_geo_iso1.unique())+ ["AWS.ap-east-1"])

#pd.get_dummies(car_type)

In [23]:
combined_df.describe()

Unnamed: 0,latency_ms,distance
count,495015.0,495015.0
mean,76.876143,8523.35124
std,39.447914,4379.297018
min,4.9,295.653308
25%,47.8,5885.545257
50%,74.6,8366.343668
75%,100.8,10897.673245
max,368.0,18566.678075


In [24]:
combined_df_new= pd.get_dummies(combined_df)

In [25]:
combined_df_new.columns

Index(['timestamp', 'latency_ms', 'distance', 'cloud_geo_iso1_AWS.ap-northeast-1', 'cloud_geo_iso1_AWS.ap-northeast-2', 'cloud_geo_iso1_AWS.ap-south-1', 'cloud_geo_iso1_AWS.ap-southeast-1', 'cloud_geo_iso1_AWS.ap-southeast-2', 'cloud_geo_iso1_AWS.ca-central-1', 'cloud_geo_iso1_AWS.eu-central-1', 'cloud_geo_iso1_AWS.eu-north-1', 'cloud_geo_iso1_AWS.eu-west-1', 'cloud_geo_iso1_AWS.eu-west-2', 'cloud_geo_iso1_AWS.eu-west-3', 'cloud_geo_iso1_AWS.sa-east-1', 'cloud_geo_iso1_AWS.us-east-1', 'cloud_geo_iso1_AWS.us-east-2', 'cloud_geo_iso1_AWS.us-west-1', 'cloud_geo_iso1_AWS.us-west-2', 'cloud_geo_iso2_AWS.ap-northeast-1', 'cloud_geo_iso2_AWS.ap-northeast-2', 'cloud_geo_iso2_AWS.ap-south-1', 'cloud_geo_iso2_AWS.ap-southeast-1', 'cloud_geo_iso2_AWS.ap-southeast-2', 'cloud_geo_iso2_AWS.ca-central-1', 'cloud_geo_iso2_AWS.eu-central-1', 'cloud_geo_iso2_AWS.eu-north-1', 'cloud_geo_iso2_AWS.eu-west-1', 'cloud_geo_iso2_AWS.eu-west-2', 'cloud_geo_iso2_AWS.eu-west-3', 'cloud_geo_iso2_AWS.sa-east-1',
  

In [26]:
combined_df_new['timestamp'] = pd.to_datetime(combined_df_new['timestamp'])
#combined_df_new['month']=combined_df_new['timestamp'].dt.month
#combined_df_new['day']=combined_df_new['timestamp'].dt.day
combined_df_new['hour']=combined_df_new['timestamp'].dt.hour
#combined_df_new['day_of_week']=combined_df_new['timestamp'].dt.dayofweek
combined_df_new['cloud_geo_iso1_AWS.ap-east-1'] = 0

In [27]:
combined_df_new.head()

Unnamed: 0,timestamp,latency_ms,distance,cloud_geo_iso1_AWS.ap-northeast-1,cloud_geo_iso1_AWS.ap-northeast-2,cloud_geo_iso1_AWS.ap-south-1,cloud_geo_iso1_AWS.ap-southeast-1,cloud_geo_iso1_AWS.ap-southeast-2,cloud_geo_iso1_AWS.ca-central-1,cloud_geo_iso1_AWS.eu-central-1,cloud_geo_iso1_AWS.eu-north-1,cloud_geo_iso1_AWS.eu-west-1,cloud_geo_iso1_AWS.eu-west-2,cloud_geo_iso1_AWS.eu-west-3,cloud_geo_iso1_AWS.sa-east-1,cloud_geo_iso1_AWS.us-east-1,cloud_geo_iso1_AWS.us-east-2,cloud_geo_iso1_AWS.us-west-1,cloud_geo_iso1_AWS.us-west-2,cloud_geo_iso2_AWS.ap-northeast-1,cloud_geo_iso2_AWS.ap-northeast-2,cloud_geo_iso2_AWS.ap-south-1,cloud_geo_iso2_AWS.ap-southeast-1,cloud_geo_iso2_AWS.ap-southeast-2,cloud_geo_iso2_AWS.ca-central-1,cloud_geo_iso2_AWS.eu-central-1,cloud_geo_iso2_AWS.eu-north-1,cloud_geo_iso2_AWS.eu-west-1,cloud_geo_iso2_AWS.eu-west-2,cloud_geo_iso2_AWS.eu-west-3,cloud_geo_iso2_AWS.sa-east-1,cloud_geo_iso2_AWS.us-east-1,cloud_geo_iso2_AWS.us-east-2,cloud_geo_iso2_AWS.us-west-1,cloud_geo_iso2_AWS.us-west-2,hour,day_of_week,cloud_geo_iso1_AWS.ap-east-1
0,2020-01-01 11:00:00+00:00,124.2,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,2,0
1,2020-01-08 04:00:00+00:00,123.75,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,0
2,2020-01-02 14:00:00+00:00,123.8,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,3,0
3,2020-02-18 02:00:00+00:00,123.8,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0
4,2020-02-26 07:00:00+00:00,123.0,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,2,0


### Training Data

In [28]:
X_train = combined_df_new.drop(["latency_ms","timestamp"],axis=1)
y_train = combined_df_new["latency_ms"]

In [29]:
X_train.head()

Unnamed: 0,distance,cloud_geo_iso1_AWS.ap-northeast-1,cloud_geo_iso1_AWS.ap-northeast-2,cloud_geo_iso1_AWS.ap-south-1,cloud_geo_iso1_AWS.ap-southeast-1,cloud_geo_iso1_AWS.ap-southeast-2,cloud_geo_iso1_AWS.ca-central-1,cloud_geo_iso1_AWS.eu-central-1,cloud_geo_iso1_AWS.eu-north-1,cloud_geo_iso1_AWS.eu-west-1,cloud_geo_iso1_AWS.eu-west-2,cloud_geo_iso1_AWS.eu-west-3,cloud_geo_iso1_AWS.sa-east-1,cloud_geo_iso1_AWS.us-east-1,cloud_geo_iso1_AWS.us-east-2,cloud_geo_iso1_AWS.us-west-1,cloud_geo_iso1_AWS.us-west-2,cloud_geo_iso2_AWS.ap-northeast-1,cloud_geo_iso2_AWS.ap-northeast-2,cloud_geo_iso2_AWS.ap-south-1,cloud_geo_iso2_AWS.ap-southeast-1,cloud_geo_iso2_AWS.ap-southeast-2,cloud_geo_iso2_AWS.ca-central-1,cloud_geo_iso2_AWS.eu-central-1,cloud_geo_iso2_AWS.eu-north-1,cloud_geo_iso2_AWS.eu-west-1,cloud_geo_iso2_AWS.eu-west-2,cloud_geo_iso2_AWS.eu-west-3,cloud_geo_iso2_AWS.sa-east-1,cloud_geo_iso2_AWS.us-east-1,cloud_geo_iso2_AWS.us-east-2,cloud_geo_iso2_AWS.us-west-1,cloud_geo_iso2_AWS.us-west-2,hour,day_of_week,cloud_geo_iso1_AWS.ap-east-1
0,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,2,0
1,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,0
2,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,3,0
3,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0
4,9053.553379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,2,0


In [59]:
X_train.hour.unique()

array([11,  4, 14,  2,  7,  3, 13,  0, 19, 17,  1, 22, 15,  9, 21,  5, 20,
        6, 12, 10,  8, 16, 18, 23], dtype=int64)

In [30]:
X_train.columns

Index(['distance', 'cloud_geo_iso1_AWS.ap-northeast-1', 'cloud_geo_iso1_AWS.ap-northeast-2', 'cloud_geo_iso1_AWS.ap-south-1', 'cloud_geo_iso1_AWS.ap-southeast-1', 'cloud_geo_iso1_AWS.ap-southeast-2', 'cloud_geo_iso1_AWS.ca-central-1', 'cloud_geo_iso1_AWS.eu-central-1', 'cloud_geo_iso1_AWS.eu-north-1', 'cloud_geo_iso1_AWS.eu-west-1', 'cloud_geo_iso1_AWS.eu-west-2', 'cloud_geo_iso1_AWS.eu-west-3', 'cloud_geo_iso1_AWS.sa-east-1', 'cloud_geo_iso1_AWS.us-east-1', 'cloud_geo_iso1_AWS.us-east-2', 'cloud_geo_iso1_AWS.us-west-1', 'cloud_geo_iso1_AWS.us-west-2', 'cloud_geo_iso2_AWS.ap-northeast-1', 'cloud_geo_iso2_AWS.ap-northeast-2', 'cloud_geo_iso2_AWS.ap-south-1', 'cloud_geo_iso2_AWS.ap-southeast-1', 'cloud_geo_iso2_AWS.ap-southeast-2', 'cloud_geo_iso2_AWS.ca-central-1', 'cloud_geo_iso2_AWS.eu-central-1', 'cloud_geo_iso2_AWS.eu-north-1', 'cloud_geo_iso2_AWS.eu-west-1', 'cloud_geo_iso2_AWS.eu-west-2', 'cloud_geo_iso2_AWS.eu-west-3', 'cloud_geo_iso2_AWS.sa-east-1',
       'cloud_geo_iso2_AWS.us

In [31]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
import xgboost as xgb 
model = xgb.XGBRegressor()

In [32]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train[['distance']])
#X_test_scaled = scaler.transform(X_test[['distance']])

In [33]:
"""from sklearn.model_selection import GridSearchCV

parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05], #so called `eta` value
  
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [100,200],
             'lambda':[0.1]}

xgb_grid = GridSearchCV(model,
                        parameters,
                        cv = 3,
                        n_jobs = 5,
                        verbose=True)

xgb_grid.fit(X_train,
         y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)"""

"from sklearn.model_selection import GridSearchCV\n\nparameters = {'nthread':[4], #when use hyperthread, xgboost may become slower\n              'objective':['reg:linear'],\n              'learning_rate': [.03, 0.05], #so called `eta` value\n  \n              'silent': [1],\n              'subsample': [0.7],\n              'colsample_bytree': [0.7],\n              'n_estimators': [100,200],\n             'lambda':[0.1]}\n\nxgb_grid = GridSearchCV(model,\n                        parameters,\n                        cv = 3,\n                        n_jobs = 5,\n                        verbose=True)\n\nxgb_grid.fit(X_train,\n         y_train)\n\nprint(xgb_grid.best_score_)\nprint(xgb_grid.best_params_)"

In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

model = xgb.XGBRegressor()

model.fit(X_train,y_train)
kfold = KFold(n_splits=5)
scores = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=-1)

In [35]:
scores

array([0.96936402, 0.82048043, 0.77877818, 0.88352234, 0.6827014 ])

### Creating test data

In [36]:
import itertools

In [37]:
hour = [i for i in range(1,25)]
day_of_week = [i for i in range(1,8)]

In [38]:
#group = [i for i in range(1,17)]
#day = [i for i in range(1,32)]

In [39]:
sources

array(['AWS.eu-west-1', 'AWS.eu-west-2', 'AWS.eu-west-3', 'AWS.sa-east-1',
       'AWS.us-east-1', 'AWS.us-east-2', 'AWS.us-west-1', 'AWS.us-west-2',
       'AWS.ap-south-1', 'AWS.eu-north-1', 'AWS.ca-central-1',
       'AWS.eu-central-1', 'AWS.ap-northeast-1', 'AWS.ap-northeast-2',
       'AWS.ap-southeast-1', 'AWS.ap-southeast-2'], dtype=object)

In [40]:
dest

array(['AWS.ap-northeast-2', 'AWS.ap-southeast-2', 'AWS.eu-west-2',
       'AWS.us-west-1', 'AWS.eu-north-1', 'AWS.eu-central-1',
       'AWS.us-east-1', 'AWS.ca-central-1', 'AWS.eu-west-3',
       'AWS.ap-south-1', 'AWS.us-east-2', 'AWS.ap-northeast-1',
       'AWS.sa-east-1', 'AWS.ap-southeast-1', 'AWS.us-west-2',
       'AWS.eu-west-1'], dtype=object)

In [41]:
a = [hour,dest]

In [42]:

final = pd.DataFrame(list(itertools.product(*a)),columns = ["hour","cloud_geo_iso2"])

In [43]:
len(final)

2688

In [44]:
final = pd.concat(
    [final, pd.DataFrame(
            [[0 for i in range(0,16)]], index= final.index,
            columns=['cloud_geo_iso1_AWS.ap-northeast-1',
       'cloud_geo_iso1_AWS.ap-northeast-2', 'cloud_geo_iso1_AWS.ap-south-1',
       'cloud_geo_iso1_AWS.ap-southeast-1',
       'cloud_geo_iso1_AWS.ap-southeast-2', 'cloud_geo_iso1_AWS.ca-central-1',
       'cloud_geo_iso1_AWS.eu-central-1', 'cloud_geo_iso1_AWS.eu-north-1',
       'cloud_geo_iso1_AWS.eu-west-1', 'cloud_geo_iso1_AWS.eu-west-2',
       'cloud_geo_iso1_AWS.eu-west-3', 'cloud_geo_iso1_AWS.sa-east-1',
       'cloud_geo_iso1_AWS.us-east-1', 'cloud_geo_iso1_AWS.us-east-2',
       'cloud_geo_iso1_AWS.us-west-1', 'cloud_geo_iso1_AWS.us-west-2'
       ]) ], axis=1
)

In [45]:
final.head()

Unnamed: 0,hour,cloud_geo_iso2,day_of_week,cloud_geo_iso1_AWS.ap-northeast-1,cloud_geo_iso1_AWS.ap-northeast-2,cloud_geo_iso1_AWS.ap-south-1,cloud_geo_iso1_AWS.ap-southeast-1,cloud_geo_iso1_AWS.ap-southeast-2,cloud_geo_iso1_AWS.ca-central-1,cloud_geo_iso1_AWS.eu-central-1,cloud_geo_iso1_AWS.eu-north-1,cloud_geo_iso1_AWS.eu-west-1,cloud_geo_iso1_AWS.eu-west-2,cloud_geo_iso1_AWS.eu-west-3,cloud_geo_iso1_AWS.sa-east-1,cloud_geo_iso1_AWS.us-east-1,cloud_geo_iso1_AWS.us-east-2,cloud_geo_iso1_AWS.us-west-1,cloud_geo_iso1_AWS.us-west-2
0,1,AWS.ap-northeast-2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,AWS.ap-northeast-2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,AWS.ap-northeast-2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,AWS.ap-northeast-2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,AWS.ap-northeast-2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
final = final.merge(full_df[full_df["region_name_x"]=="AWS.ap-east-1"][["distance","region_name_y"]],left_on="cloud_geo_iso2",right_on= "region_name_y" )

In [47]:
final.drop("region_name_y",axis=1,inplace=True)

In [48]:
final['cloud_geo_iso1_AWS.ap-east-1'] = 1

In [61]:
final

Unnamed: 0,distance,cloud_geo_iso1_AWS.ap-northeast-1,cloud_geo_iso1_AWS.ap-northeast-2,cloud_geo_iso1_AWS.ap-south-1,cloud_geo_iso1_AWS.ap-southeast-1,cloud_geo_iso1_AWS.ap-southeast-2,cloud_geo_iso1_AWS.ca-central-1,cloud_geo_iso1_AWS.eu-central-1,cloud_geo_iso1_AWS.eu-north-1,cloud_geo_iso1_AWS.eu-west-1,cloud_geo_iso1_AWS.eu-west-2,cloud_geo_iso1_AWS.eu-west-3,cloud_geo_iso1_AWS.sa-east-1,cloud_geo_iso1_AWS.us-east-1,cloud_geo_iso1_AWS.us-east-2,cloud_geo_iso1_AWS.us-west-1,cloud_geo_iso1_AWS.us-west-2,cloud_geo_iso2_AWS.ap-northeast-1,cloud_geo_iso2_AWS.ap-northeast-2,cloud_geo_iso2_AWS.ap-south-1,cloud_geo_iso2_AWS.ap-southeast-1,cloud_geo_iso2_AWS.ap-southeast-2,cloud_geo_iso2_AWS.ca-central-1,cloud_geo_iso2_AWS.eu-central-1,cloud_geo_iso2_AWS.eu-north-1,cloud_geo_iso2_AWS.eu-west-1,cloud_geo_iso2_AWS.eu-west-2,cloud_geo_iso2_AWS.eu-west-3,cloud_geo_iso2_AWS.sa-east-1,cloud_geo_iso2_AWS.us-east-1,cloud_geo_iso2_AWS.us-east-2,cloud_geo_iso2_AWS.us-west-1,cloud_geo_iso2_AWS.us-west-2,hour,day_of_week,cloud_geo_iso1_AWS.ap-east-1,Latency
0,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,24.371948
1,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1,24.437647
2,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,24.351404
3,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,1,24.336605
4,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5,1,24.34063
5,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,1,24.337002
6,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,7,1,24.337002
7,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,24.393953
8,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,24.459652
9,2096.661238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,1,24.373409


In [50]:
final = pd.get_dummies(final,columns=["cloud_geo_iso2"])

In [51]:
final = final[X_train.columns]

In [52]:
final["Latency"]  = model.predict(final)

In [53]:
src_dest_df = pd.read_parquet('src_dest_df.parquet')

In [58]:
final[final["cloud_geo_iso2_AWS.ap-southeast-1"]==1]

Unnamed: 0,distance,cloud_geo_iso1_AWS.ap-northeast-1,cloud_geo_iso1_AWS.ap-northeast-2,cloud_geo_iso1_AWS.ap-south-1,cloud_geo_iso1_AWS.ap-southeast-1,cloud_geo_iso1_AWS.ap-southeast-2,cloud_geo_iso1_AWS.ca-central-1,cloud_geo_iso1_AWS.eu-central-1,cloud_geo_iso1_AWS.eu-north-1,cloud_geo_iso1_AWS.eu-west-1,cloud_geo_iso1_AWS.eu-west-2,cloud_geo_iso1_AWS.eu-west-3,cloud_geo_iso1_AWS.sa-east-1,cloud_geo_iso1_AWS.us-east-1,cloud_geo_iso1_AWS.us-east-2,cloud_geo_iso1_AWS.us-west-1,cloud_geo_iso1_AWS.us-west-2,cloud_geo_iso2_AWS.ap-northeast-1,cloud_geo_iso2_AWS.ap-northeast-2,cloud_geo_iso2_AWS.ap-south-1,cloud_geo_iso2_AWS.ap-southeast-1,cloud_geo_iso2_AWS.ap-southeast-2,cloud_geo_iso2_AWS.ca-central-1,cloud_geo_iso2_AWS.eu-central-1,cloud_geo_iso2_AWS.eu-north-1,cloud_geo_iso2_AWS.eu-west-1,cloud_geo_iso2_AWS.eu-west-2,cloud_geo_iso2_AWS.eu-west-3,cloud_geo_iso2_AWS.sa-east-1,cloud_geo_iso2_AWS.us-east-1,cloud_geo_iso2_AWS.us-east-2,cloud_geo_iso2_AWS.us-west-1,cloud_geo_iso2_AWS.us-west-2,hour,day_of_week,cloud_geo_iso1_AWS.ap-east-1,Latency
2184,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,28.840708
2185,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1,28.906406
2186,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,28.823902
2187,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,4,1,28.834045
2188,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,5,1,28.84539
2189,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,6,1,28.841763
2190,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,7,1,28.841763
2191,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,28.840708
2192,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,28.906406
2193,2581.801742,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,3,1,28.823902


In [55]:
full_df[full_df["region_name_x"]=="AWS.ap-east-1"]

Unnamed: 0,region_name_x,cloud_service_provider,city_name_x,lat_x,lon_x,region_name_y,city_name_y,lat_y,lon_y,distance
306,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.us-east-1,Virginia,38.13,-78.45,13174.037253
307,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.us-east-2,Ohio,39.96,-83.0,12873.086462
308,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.us-west-1,California,37.35,-121.96,11164.285307
309,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.us-west-2,Oregon,46.15,-123.88,10430.93643
310,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.eu-west-1,Ireland,53.0,-8.0,9970.315975
311,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.eu-west-2,London,51.0,-0.1,9657.842993
312,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.eu-west-3,Paris,48.86,2.35,9632.490105
313,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.eu-central-1,Frankfurt,50.0,8.0,9212.927633
314,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.sa-east-1,Sao Paulo,-23.34,-46.38,18024.407402
315,AWS.ap-east-1,AWS,Asia Pacific (Hong Kong),22.279328,114.162813,AWS.ap-southeast-1,Singapore,1.37,103.8,2581.801742


In [56]:
src_dest_df.loc[src_dest_df.cloud_geo_iso1 == 'AWS.ap-east-1']

Unnamed: 0,timestamp,cloud_geo_iso1,cloud_geo_iso2,latency_ms,packet_loss_percent
0,2020-03-31 08:00:00+00:00,AWS.ap-east-1,AWS.us-west-2,156.55,0.0
1,2020-03-30 17:00:00+00:00,AWS.ap-east-1,AWS.us-west-2,154.6,0.0
2,2020-03-31 05:00:00+00:00,AWS.ap-east-1,AWS.us-west-2,153.85,0.0
3,2020-03-31 09:00:00+00:00,AWS.ap-east-1,AWS.us-west-2,155.6,0.0
4,2020-03-31 22:00:00+00:00,AWS.ap-east-1,AWS.us-west-2,155.0,0.0
5,2020-03-30 16:00:00+00:00,AWS.ap-east-1,AWS.eu-west-1,115.9,0.0
6,2020-03-31 15:00:00+00:00,AWS.ap-east-1,AWS.us-west-2,155.4,0.0
7,2020-03-31 03:00:00+00:00,AWS.ap-east-1,AWS.us-west-2,155.1,0.0
8,2020-03-31 00:00:00+00:00,AWS.ap-east-1,AWS.us-west-2,154.05,0.0
9,2020-03-30 23:00:00+00:00,AWS.ap-east-1,AWS.eu-west-1,115.8,0.0
