**Loading WSDREAM files**

* userlist.txt: contains the data pertaining to the users
* wslist.txt: contains the data pertaining to the services
* rtMatrix.txt: contains the data pertaining to the Response Time QoS observations
* tpMatrix.txt: contains the data pertaining to the Throughput QoS observations

In [24]:
import numpy as np
import pandas as pd

# Load the four WSDREAM text files into DataFrames. All of them are parsed as
# tab-separated, latin-1 encoded text.
# The user and service lists take their column names from the first line of the file.
usersData = pd.read_csv(
    "userlist.txt",
    sep='\t',
    encoding='latin-1',
)
servicesData = pd.read_csv(
    "wslist.txt",
    sep='\t',
    encoding='latin-1',
)

# The two QoS matrices are read with header=None: every line is treated as data
# and the columns receive default integer labels.
rtData = pd.read_csv(
    "rtMatrix.txt",
    sep='\t',
    encoding='latin-1',
    header=None,
)
tpData = pd.read_csv(
    "tpMatrix.txt",
    sep='\t',
    encoding='latin-1',
    header=None,
)

In [25]:
# Remove the right-most column of each QoS matrix; it is treated as carrying no data
# (presumably an artifact of a trailing tab on every line -- TODO confirm).
# Caution: the drop happens in place, so each re-run of this cell removes one more column.
rtData.drop(columns=rtData.columns[-1], inplace=True)
tpData.drop(columns=tpData.columns[-1], inplace=True)

In [31]:
# Missing observations are stored as -1 in both matrices. They are turned into NaN
# here; no imputation (median/mean fill) is performed in this cell, so the missing
# entries stay NaN.
#
# NOTE(review): all-NaN columns are dropped independently for each matrix and the
# remaining columns are then renumbered 0..n-1. After this step a column label no
# longer equals the original service position, and the same label may refer to
# different services in rtData and tpData -- confirm this is acceptable.

# Response time matrix
rtData.replace(to_replace=-1, value=np.nan, inplace=True)
rtData.dropna(axis='columns', how='all', inplace=True)  # drop columns with no observation at all
rtData.columns = range(len(rtData.columns))             # contiguous integer labels again

# Throughput matrix
tpData.replace(to_replace=-1, value=np.nan, inplace=True)
tpData.dropna(axis='columns', how='all', inplace=True)
tpData.columns = range(len(tpData.columns))

In [32]:
# Strip the descriptive columns that are not used as context features.
# Users lose their IP columns; services additionally lose the WSDL address and the
# provider name. Country, AS, Latitude and Longitude are kept in both frames.
# NOTE(review): the IP-number column is spelled 'IP_No.' for users but 'IP No.' for
# services -- presumably this mirrors the headers of the two input files; verify.
usersData.drop(
    columns=['IP Address', 'IP_No.'],
    inplace=True,
)
servicesData.drop(
    columns=['WSDL Address', 'Service Provider', 'IP Address', 'IP No.'],
    inplace=True,
)

In [33]:
# Encode Country and AS as pandas categoricals and store the integer category code
# of every row in an extra '<name>_catcod' column.
# Codes are assigned per frame, so the same Country/AS value may get different codes
# in usersData and servicesData.

# Users
usersData['Country'] = usersData['Country'].astype('category')
usersData['AS'] = usersData['AS'].astype('category')
usersData['Country_catcod'] = usersData['Country'].cat.codes
usersData['AS_catcod'] = usersData['AS'].cat.codes
usersData.head()  # not rendered: only the last expression of a cell is displayed

# Services
servicesData['Country'] = servicesData['Country'].astype('category')
servicesData['AS'] = servicesData['AS'].astype('category')
servicesData['Country_catcod'] = servicesData['Country'].cat.codes
servicesData['AS_catcod'] = servicesData['AS'].cat.codes
servicesData.head()

Unnamed: 0,Service_ID,Country,AS,Latitude,Longitude,Country_catcod,AS_catcod
0,0,United States,AS3356 Level 3 Communications,38.0,-97.0,70,571
1,1,United States,AS3356 Level 3 Communications,38.0,-97.0,70,571
2,2,United States,AS3356 Level 3 Communications,38.0,-97.0,70,571
3,3,United States,AS3356 Level 3 Communications,38.0,-97.0,70,571
4,4,United States,AS3356 Level 3 Communications,38.0,-97.0,70,570


In [34]:
def _append_user_context(qosData, users,
                         contextCols=('Country_catcod', 'AS_catcod', 'Latitude', 'Longitude')):
    """Return qosData with the users' context columns appended on the right.

    Parameters
    ----------
    qosData : DataFrame
        QoS matrix whose columns are labelled 0..n-1 (the notebook renumbers the
        columns that way after removing the empty ones).
    users : DataFrame
        User table; must contain every column named in contextCols.
    contextCols : sequence of str
        Columns of `users` to append, in order.

    Returns
    -------
    DataFrame
        A new frame: the QoS columns followed by the context columns, which are
        relabelled n, n+1, ... so that all column labels stay contiguous integers.
    """
    combined = pd.concat([qosData, users[list(contextCols)]], axis=1, sort=False)
    # Derive the new labels from the matrix width instead of hard-coding them, so
    # they cannot collide with (or leave a gap after) the existing QoS columns.
    firstLabel = qosData.shape[1]
    newLabels = {col: firstLabel + i for i, col in enumerate(contextCols)}
    return combined.rename(columns=newLabels)


# Response time matrix + user context.
# Fix: this frame was previously built from tpData (copy-paste slip), which made the
# "response time" matrix a duplicate of the throughput one.
context_rtData = _append_user_context(rtData, usersData)

# Throughput matrix + user context.
context_tpData = _append_user_context(tpData, usersData)

# Only the last expression of a cell is rendered.
context_tpData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5797,5798,5799,5800,5801,5802,5803,5804,5805,5806
0,0.334,17.543,25.316,9.049,9.009,5.692,4.415,189.045,5.181,4.622,...,1.381,0.74,0.616,0.762,0.596,0.59,29,115,38.0,-97.0
1,0.938,15.267,21.978,7.968,7.874,7.025,3.067,166.666,4.555,4.166,...,10.714,14.814,13.937,1.733,15.748,15.209,29,115,38.0464,-122.23
2,2.341,10.928,15.957,5.602,5.586,26.086,3.081,109.183,3.115,2.473,...,4.918,,,0.382,,,15,94,35.685,139.7514
3,2.886,17.699,25.751,9.09,9.132,8.72,2.614,134.93,5.39,5.434,...,16.949,23.952,28.169,18.433,23.529,23.121,29,28,40.4249,-86.9162
4,2.309,17.621,25.751,9.09,9.049,8.196,2.392,123.986,5.376,4.846,...,32.967,7.042,43.478,24.39,44.444,42.105,29,28,40.4249,-86.9162


In [35]:
# Persist both context-augmented matrices as CSV, floats formatted with 3 decimals.

# Variant 1: bare values only -- no header line and no index column.
context_rtData.to_csv(
    r'responseTimeWithContext.csv',
    float_format='%.3f',
    header=False,
    index=False,
)
context_tpData.to_csv(
    r'throughputWithContext.csv',
    float_format='%.3f',
    header=False,
    index=False,
)

# Variant 2 ("H" prefix): header line included. `index` is left at its default here,
# so these files also contain the row index as their first column.
context_rtData.to_csv(
    r'HresponseTimeWithContext.csv',
    float_format='%.3f',
    header=True,
)
context_tpData.to_csv(
    r'HthroughputWithContext.csv',
    float_format='%.3f',
    header=True,
)