# **Library imports**

In [1]:
import pandas as pd
import glob

# **File imports**

In [2]:
# Glob pattern matching every .txt file inside the dataset folder
folder = 'qws1_dataset/*.txt'

# glob.glob returns its matches in an arbitrary, filesystem-dependent order;
# sorting keeps the listing (and anything later built from `files`) reproducible
files = sorted(glob.glob(folder))

# Print a 1-based listing of the files that were found
for count, file in enumerate(files, start=1):
    print(f'File number {count} - {file}')

# Make an empty match visible instead of printing nothing at all
if not files:
    print(f'No files matched {folder}')

File number 1 - qws1_dataset\qws1.txt


## File listing

- List the total number of files, convert each one into a DataFrame, and append them to a Python list;
- Verify the first few rows and data types

In [3]:
# Read every listed file into its own DataFrame. A file that cannot be read
# or parsed is reported and skipped so one bad file does not abort the load.
df_list = []

for file in files:
    try:
        df_list.append(pd.read_csv(file, sep=','))
    except Exception as e:
        print(f"Skipping {file}: {e}")

# Fail fast with a clear message. Without this guard an empty `df_list` leaves
# `df` undefined (NameError on `df.head()` in a fresh kernel) or, worse, lets
# the rest of the notebook run against a stale `df` from an earlier execution.
if not df_list:
    raise RuntimeError(
        f"No valid files: none of the {len(files)} candidate file(s) could be loaded"
    )

# Stack all frames into a single one, renumbering the rows from 0
df = pd.concat(df_list, ignore_index=True)
print(df.shape)

df.head()

(364, 13)


Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,WsRF: Web Service Relevancy Function (%),Class: levels representing service offering qualities (1 through 4),Service Name,WSDL Address
0,45.0,83,27.2,50,97.4,89,91,43.0,58,100,1,DictionaryService,http://www.mindswap.org/2002/services/Dictiona...
1,71.75,100,14.6,88,85.5,78,80,64.42,86,93,1,MyService,http://mydispatch.com/myservice.asmx?wsdl
2,117.0,100,23.4,83,88.0,100,87,111.0,59,90,1,aba,http://www.quentinsagerconsulting.com/wsdl/aba...
3,70.0,100,5.4,83,79.3,100,75,63.0,91,90,1,AlexaWebSearch,http://wsearch.amazonaws.com/doc/2007-03-15/We...
4,105.2,100,18.2,80,92.2,78,84,104.6,91,90,1,ErrorMailer,http://www.errormail.net/EM/ErrorMailer.asmx?wsdl


In [4]:
# Print a concise summary of df: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 13 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   Response Time                                                        364 non-null    float64
 1   Availability                                                         364 non-null    int64  
 2   Throughput                                                           364 non-null    float64
 3   Successability                                                       364 non-null    int64  
 4   Reliability                                                          364 non-null    float64
 5   Compliance                                                           364 non-null    int64  
 6   Best Practices                                                       364 non-null    int64  
 7   Latency 

# **DataFrame Exploration**

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df
df.describe()

Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,WsRF: Web Service Relevancy Function (%),Class: levels representing service offering qualities (1 through 4)
count,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0
mean,840.28303,84.752747,7.28489,64.436813,61.319505,83.678571,80.68956,763.480049,47.53022,66.653846,2.782967
std,2764.317553,20.451498,6.458114,21.172878,21.199886,8.771572,6.691123,2755.557745,36.400613,11.508577,0.981726
min,45.0,14.0,0.1,7.0,5.9,67.0,58.0,31.5,1.0,30.0,1.0
25%,136.785,74.75,2.175,50.0,49.9,78.0,77.0,121.875,9.0,59.0,2.0
50%,236.65,96.0,5.6,67.0,64.2,78.0,83.0,180.93,39.0,67.5,3.0
75%,480.0625,100.0,10.625,80.25,78.0,89.0,84.0,409.2725,89.0,74.25,4.0
max,30781.0,100.0,29.5,99.0,97.7,100.0,95.0,30781.0,97.0,100.0,4.0


### DataFrame Uniqueness

In [6]:
# Map each column name to the list of distinct values found in that column
unique_dict = {col: df[col].unique().tolist() for col in df.columns}

# Wrap every list in a Series so that columns with fewer distinct values are
# padded with NaN when the lists are assembled side by side in one DataFrame
unique_df = pd.DataFrame(
    {name: pd.Series(values) for name, values in unique_dict.items()}
)
unique_df

Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,WsRF: Web Service Relevancy Function (%),Class: levels representing service offering qualities (1 through 4),Service Name,WSDL Address
0,45.00,83.0,27.2,50.0,97.4,89.0,91.0,43.00,58.0,100.0,1.0,DictionaryService,http://www.mindswap.org/2002/services/Dictiona...
1,71.75,100.0,14.6,88.0,85.5,78.0,80.0,64.42,86.0,93.0,2.0,MyService,http://mydispatch.com/myservice.asmx?wsdl
2,117.00,86.0,23.4,83.0,88.0,100.0,87.0,111.00,59.0,90.0,3.0,aba,http://www.quentinsagerconsulting.com/wsdl/aba...
3,70.00,98.0,5.4,80.0,79.3,67.0,75.0,63.00,91.0,89.0,4.0,AlexaWebSearch,http://wsearch.amazonaws.com/doc/2007-03-15/We...
4,105.20,79.0,18.2,87.0,92.2,88.0,84.0,104.60,88.0,88.0,,ErrorMailer,http://www.errormail.net/EM/ErrorMailer.asmx?wsdl
...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,,,,,,,,,,,,,http://seqhound.blueprint.org/wsdl/bind.wsdl
360,,,,,,,,,,,,,http://www.spreadshirt.net/services.php?wsdl
361,,,,,,,,,,,,,https://api.betfair.com/betex-api-public-ws/v2...
362,,,,,,,,,,,,,http://nbii-thesaurus.ornl.gov/ws/services/SKO...


### DataFrame missing values and duplicated values

In [7]:
# Per-column count of null entries
missing = df.isnull().sum()

# Number of rows that exactly repeat an earlier row
duplicates = df.duplicated().sum()

# One row per column (name + null count); the duplicate-row count is a single
# frame-wide scalar, so the same value is repeated on every row of the table
df_nulls = (
    missing
    .rename_axis('Column')
    .reset_index(name='Missing Values')
    .assign(**{'Duplicated Rows': duplicates})
)

df_nulls

Unnamed: 0,Column,Missing Values,Duplicated Rows
0,Response Time,0,0
1,Availability,0,0
2,Throughput,0,0
3,Successability,0,0
4,Reliability,0,0
5,Compliance,0,0
6,Best Practices,0,0
7,Latency,0,0
8,Documentation,0,0
9,WsRF: Web Service Relevancy Function (%),0,0


### Split data into numerical and categorical columns

In [8]:
# Partition the columns by dtype: numeric columns go to df_numerical,
# object-dtype columns go to df_categorical
df_numerical, df_categorical = (
    df.select_dtypes(include=dtype_group) for dtype_group in ('number', 'object')
)

## Outlier inspection

In [9]:
# Show the 25% and 75% quantiles of every numeric column, separated by a rule
print(
    df_numerical.quantile(0.25),
    '-' * 79,
    df_numerical.quantile(0.75),
    sep='\n',
)

Response Time                                                          136.785
Availability                                                            74.750
Throughput                                                               2.175
Successability                                                          50.000
Reliability                                                             49.900
Compliance                                                              78.000
Best Practices                                                          77.000
Latency                                                                121.875
Documentation                                                            9.000
WsRF: Web Service Relevancy Function (%)                                59.000
Class: levels representing service offering qualities (1 through 4)      2.000
Name: 0.25, dtype: float64
-------------------------------------------------------------------------------
Response Time           

In [10]:
# NOTE: despite its name, `q1` holds the interquartile range of each numeric
# column (75% quantile minus 25% quantile), not the first quartile
q1 = df_numerical.quantile(0.75).sub(df_numerical.quantile(0.25))
print(q1)

Response Time                                                          343.2775
Availability                                                            25.2500
Throughput                                                               8.4500
Successability                                                          30.2500
Reliability                                                             28.1000
Compliance                                                              11.0000
Best Practices                                                           7.0000
Latency                                                                287.3975
Documentation                                                           80.0000
WsRF: Web Service Relevancy Function (%)                                15.2500
Class: levels representing service offering qualities (1 through 4)      2.0000
dtype: float64


#### A) DataFrame without outliers

In [11]:
# IQR rule: a value is flagged when it lies more than 1.5 * IQR below the
# 25% quantile or more than 1.5 * IQR above the 75% quantile of its column
Q1 = df_numerical.quantile(0.25)
Q3 = df_numerical.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is dropped as soon as any one of its numeric values is flagged
has_outlier = (df_numerical.lt(lower_fence) | df_numerical.gt(upper_fence)).any(axis=1)
df_no_outliers = df[~has_outlier]

print(df_no_outliers.shape)
df_no_outliers

(267, 13)


Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,WsRF: Web Service Relevancy Function (%),Class: levels representing service offering qualities (1 through 4),Service Name,WSDL Address
1,71.75,100,14.6,88,85.5,78,80,64.42,86,93,1,MyService,http://mydispatch.com/myservice.asmx?wsdl
3,70.00,100,5.4,83,79.3,100,75,63.00,91,90,1,AlexaWebSearch,http://wsearch.amazonaws.com/doc/2007-03-15/We...
4,105.20,100,18.2,80,92.2,78,84,104.60,91,90,1,ErrorMailer,http://www.errormail.net/EM/ErrorMailer.asmx?wsdl
6,99.20,100,13.7,80,76.3,78,83,62.40,89,89,1,States_x0020__x0026__x0020_Provinces,http://www.synapticdigital.com/webservice/publ...
7,108.20,100,16.8,80,90.7,78,77,108.00,94,88,1,XigniteRetirement,http://www.xignite.com/xRetirement.asmx?wsdl
...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,424.54,59,4.3,51,11.9,78,80,381.27,34,48,4,emSoapService,https://www.emsoap.net/smsservice.asmx?wsdl
346,450.25,56,6.0,33,40.2,78,88,440.50,7,47,4,SyndicationService,http://www.douglasp.com/SyndicationService.asm...
352,762.23,54,4.2,15,17.2,89,84,726.59,32,43,4,WSMetalMaker,http://metalmaker.net/metalmaker.asmx?wsdl
355,393.57,45,2.0,25,33.4,89,75,375.43,1,40,4,ESVService,http://www.gnpcb.org/esv/share/soap/index.php?...


#### B) DataFrame outliers only

In [12]:
# Complement of the outlier-free frame: rows where at least one numeric value
# falls outside the 1.5 * IQR fences built from Q1, Q3 and IQR
below_fence = df_numerical.lt(Q1 - 1.5 * IQR)
above_fence = df_numerical.gt(Q3 + 1.5 * IQR)
outliers = df.loc[(below_fence | above_fence).any(axis=1)]
print(outliers.shape)
outliers

(97, 13)


Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,WsRF: Web Service Relevancy Function (%),Class: levels representing service offering qualities (1 through 4),Service Name,WSDL Address
0,45.00,83,27.2,50,97.4,89,91,43.00,58,100,1,DictionaryService,http://www.mindswap.org/2002/services/Dictiona...
2,117.00,100,23.4,83,88.0,100,87,111.00,59,90,1,aba,http://www.quentinsagerconsulting.com/wsdl/aba...
5,224.00,100,24.6,83,80.0,100,87,223.00,88,90,1,getJoke,http://www.interpressfact.net/webservices/getJ...
12,129.00,100,29.5,83,95.5,100,84,111.00,11,87,1,ConvertCSharp2VBService,http://www.kamalpatel.net/ConvertCSharp2VBServ...
14,114.00,100,27.5,50,92.5,78,84,108.00,64,86,1,ssn,http://www.quentinsagerconsulting.com/wsdl/ssn...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,2987.90,56,0.2,17,38.8,78,82,2987.90,8,39,4,EsfingeService,http://acdc.linguateca.pt/Esfinge/EsfingeWS.wsdl
360,502.00,24,3.4,9,21.8,78,85,499.28,12,35,4,sss_spreadshop_servicesService,http://www.spreadshirt.net/services.php?wsdl
361,256.31,14,1.0,10,13.0,100,75,237.62,1,34,4,BFServiceV2,https://api.betfair.com/betex-api-public-ws/v2...
362,6962.85,28,0.5,21,30.0,78,69,6915.08,5,32,4,SKOSThesaurusService,http://nbii-thesaurus.ornl.gov/ws/services/SKO...
