# Data analysis of the 2nd experiment

We ran the upgrade pipeline for the two versions of the Classification Service application and collected two kinds of metrics: the physical resources (CPU and memory) consumed on the Jenkins VM, and the execution time of stage II, in which the image is built and delivered to Amazon ECR. We chose these metrics to show the impact of packages on the time and resource costs of the application update process.

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import scipy.stats as st

## Utils

In [2]:
def get_usage_list(data_frame, initial_time, column, samples=30, window_minutes=10):
    """Return the peak value of ``column`` for each consecutive time window.

    Starting at ``initial_time``, the frame is sliced into ``samples``
    back-to-back, half-open windows ``[start, start + window_minutes)`` and
    the maximum of ``column`` inside each window is collected.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with a datetime-typed ``'Time'`` column and the column named
        by ``column``.
    initial_time : str
        Start of the first window, formatted as ``'%Y-%m-%d %H:%M:%S'``.
    column : str
        Name of the column whose per-window maximum is wanted.
    samples : int, optional
        Number of consecutive windows to evaluate (default 30).
    window_minutes : int or float, optional
        Width of each window in minutes (default 10).

    Returns
    -------
    list
        One maximum per window, in chronological order. For numeric columns
        a window that contains no rows contributes NaN (pandas' result for
        the maximum of an empty selection).

    Raises
    ------
    ValueError
        If ``initial_time`` does not match the expected format.
    """
    start_time = datetime.strptime(initial_time, '%Y-%m-%d %H:%M:%S')
    window = timedelta(minutes=window_minutes)

    # Compare the datetime column directly instead of materialising Python
    # datetimes with the deprecated ``Series.dt.to_pydatetime()`` on every pass.
    timestamps = data_frame['Time']

    usage_max = []
    for _ in range(samples):
        end_time = start_time + window
        in_window = (timestamps >= start_time) & (timestamps < end_time)
        usage_max.append(data_frame.loc[in_window, column].max())
        start_time = end_time
    return usage_max

In [3]:
def get_statistics(usage_list):
    """Print descriptive statistics for a list of numeric samples.

    The report contains the samples themselves, their mean, median, mode,
    minimum and maximum, and the bounds of the 95% confidence interval of
    the mean computed from Student's t distribution with ``len - 1`` degrees
    of freedom and the standard error of the mean as scale.

    Parameters
    ----------
    usage_list : list of numbers
        Samples to summarise.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    mean = np.mean(usage_list)

    # The confidence level is passed positionally: its keyword was renamed
    # from ``alpha`` to ``confidence`` in newer SciPy releases, while the
    # positional form is accepted by both.
    lower, upper = st.t.interval(.95, df=len(usage_list)-1, loc=mean, scale=st.sem(usage_list))

    # ``st.mode`` yields a length-1 array on older SciPy and a scalar on newer
    # releases; ``np.atleast_1d`` makes the element lookup valid for both.
    mode = np.atleast_1d(st.mode(usage_list)[0])[0]

    print(f"Samples: {usage_list}"
          f"\n\nMean: {mean} \n\nMedian: {np.median(usage_list)}"
          f"\n\nMode: {mode}"
          f"\n\nMimimum: {min(usage_list)} \n\nMaximum: {max(usage_list)}"
          f"\n\nValues for confidence interval of 95%: \n\n"
          f"  - lower limit: {lower} \n\n  - upper limit: {upper}")

## TensorFlow

### CPU

In [4]:
# Load the CPU samples of the TensorFlow run, then show the frame's schema
# and its first two rows.
cdata = pd.read_csv(filepath_or_buffer='tensorflow/cpu.csv')
cdata.info()
cdata.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    895 non-null    int64  
 1   User    895 non-null    float64
 2   System  895 non-null    float64
 3   IoWait  895 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 28.1 KB


Unnamed: 0,Time,User,System,IoWait
0,1654782540000,0.266684,0.125008,0.01665
1,1654782560000,0.183267,0.116625,0.0


In [5]:
# 'Time' is loaded as integer epoch milliseconds; turn it into datetime64 so
# the samples can be sliced by wall-clock window.
cdata['Time'] = pd.to_datetime(arg=cdata['Time'], unit='ms')
cdata.info()
cdata.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Time    895 non-null    datetime64[ns]
 1   User    895 non-null    float64       
 2   System  895 non-null    float64       
 3   IoWait  895 non-null    float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 28.1 KB


Unnamed: 0,Time,User,System,IoWait
0,2022-06-09 13:49:00,0.266684,0.125008,0.01665
1,2022-06-09 13:49:20,0.183267,0.116625,0.0


In [6]:
# Per-window peaks of the 'User' CPU column, starting at the first sample of
# the TensorFlow run, followed by their summary statistics.
cpu_usage = get_usage_list(
    data_frame=cdata,
    initial_time='2022-06-09 13:49:00',
    column='User',
)
print("= CPU usage (%) =\n")
get_statistics(cpu_usage)

= CPU usage (%) =

Samples: [60.265800681147375, 61.52824097028779, 66.6638387721068, 60.487679092218364, 59.969987921219285, 68.3682977085205, 60.28643011587285, 60.03338605147368, 65.99368489798832, 58.63095371792033, 62.20331059025961, 57.81131436028259, 61.71881621843576, 68.68497581401762, 68.15947048460673, 58.357030292758914, 61.09690974794511, 64.85229427175803, 59.93722421481103, 68.89206648990717, 59.1159476260284, 68.08949323100892, 63.05933384748687, 61.804161597699874, 61.25020140049795, 67.25353747842341, 58.43682957316918, 58.13566119983628, 68.59641678895359, 66.87354239255698]

Mean: 62.88522791830664 

Median: 61.62352859436177

Mode: 57.81131436028259

Mimimum: 57.81131436028259 

Maximum: 68.89206648990717

Values for confidence interval of 95%: 

  - lower limit: 61.4533544566452 

  - upper limit: 64.31710137996808


### Memory

In [7]:
# Load the memory samples of the TensorFlow run, then show the frame's schema
# and its first two rows.
mdata = pd.read_csv(filepath_or_buffer='tensorflow/mem.csv')
mdata.info()
mdata.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597 entries, 0 to 596
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    597 non-null    int64  
 1   used    597 non-null    float64
 2   total   597 non-null    int64  
dtypes: float64(1), int64(2)
memory usage: 14.1 KB


Unnamed: 0,Time,used,total
0,1654782540000,1790644000.0,4053430272
1,1654782570000,1791594000.0,4053430272


In [8]:
# 'Time' is loaded as integer epoch milliseconds; turn it into datetime64 so
# the samples can be sliced by wall-clock window.
mdata['Time'] = pd.to_datetime(arg=mdata['Time'], unit='ms')
mdata.info()
mdata.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597 entries, 0 to 596
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Time    597 non-null    datetime64[ns]
 1   used    597 non-null    float64       
 2   total   597 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 14.1 KB


Unnamed: 0,Time,used,total
0,2022-06-09 13:49:00,1790644000.0,4053430272
1,2022-06-09 13:49:30,1791594000.0,4053430272


In [9]:
# Per-window peaks of the 'used' memory column for the TensorFlow run.
mem_usage = get_usage_list(
    data_frame=mdata,
    initial_time='2022-06-09 13:49:00',
    column='used',
)
# Scale by 1e9 for the GB report (presumably 'used' is in bytes; verify against the collector).
mem_usage_gb = [peak / 1_000_000_000 for peak in mem_usage]
print("= Memory usage (GB) =\n")
get_statistics(mem_usage_gb)

= Memory usage (GB) =

Samples: [1.8914645333333333, 1.904214016, 1.8985437866666668, 1.9129903786666667, 1.91236096, 1.915219968, 1.8964056746666667, 1.9112413866666667, 1.9181909333333333, 1.8967647573333333, 1.8953065813333332, 1.9183793493333332, 1.902493696, 1.8975320746666668, 1.9166303573333332, 1.9080942933333334, 1.907712, 1.9202717013333332, 1.927327744, 1.9141236053333333, 1.935228928, 1.9185104213333333, 1.907355648, 1.9221640533333333, 1.9280431786666667, 1.921753088, 1.9110884693333332, 1.9096070826666667, 1.934446592, 1.930752]

Mean: 1.9128072419555557 

Median: 1.9126756693333333

Mode: 1.8914645333333333

Mimimum: 1.8914645333333333 

Maximum: 1.935228928

Values for confidence interval of 95%: 

  - lower limit: 1.9083820102946165 

  - upper limit: 1.917232473616495


### Build and delivery (Stage II) time

In [10]:
# Read the TensorFlow Stage II measurements: one integer per line.
build_time = []
with open('tensorflow/build_time.txt') as f:
    build_time.extend(int(line.rstrip()) for line in f)

In [11]:
# Summary statistics of the TensorFlow Stage II times, in the same report
# layout that get_statistics prints.
mean_val = np.mean(build_time)

# The confidence level is passed positionally: its keyword was renamed from
# ``alpha`` to ``confidence`` in newer SciPy releases, while the positional
# form is accepted by both.
lower, upper = st.t.interval(.95, df=len(build_time)-1, loc=mean_val, scale=st.sem(build_time))

print("= Stage II time (s) =\n")

# ``st.mode`` yields a length-1 array on older SciPy and a scalar on newer
# releases; ``np.atleast_1d`` makes the element lookup valid for both.
print(f"Samples: {build_time}"
      f"\n\nMean: {mean_val} \n\nMedian: {np.median(build_time)}"
      f"\n\nMode: {np.atleast_1d(st.mode(build_time)[0])[0]}"
      f"\n\nMimimum: {min(build_time)} \n\nMaximum: {max(build_time)}"
      f"\n\nValues for confidence interval of 95%: \n\n"
      f"  - lower limit: {lower} \n\n  - upper limit: {upper}")

= Stage II time (s) =

Samples: [172, 177, 171, 178, 176, 170, 176, 177, 171, 175, 177, 176, 177, 171, 172, 176, 171, 172, 177, 171, 177, 172, 178, 177, 177, 172, 177, 175, 170, 171]

Mean: 174.3 

Median: 175.5

Mode: 177

Mimimum: 170 

Maximum: 178

Values for confidence interval of 95%: 

  - lower limit: 173.23862562915448 

  - upper limit: 175.36137437084554


## TensorFlow Lite

### CPU

In [12]:
# Load the CPU samples of the TensorFlow Lite run, then show the frame's
# schema and its first two rows.
cdata2 = pd.read_csv(filepath_or_buffer='tensorflow-lite/cpu.csv')
cdata2.info()
cdata2.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    895 non-null    int64  
 1   User    895 non-null    float64
 2   System  895 non-null    float64
 3   IoWait  895 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 28.1 KB


Unnamed: 0,Time,User,System,IoWait
0,1654800660000,0.275084,0.108396,0.025021
1,1654800680000,0.533709,0.249859,0.024975


In [13]:
# 'Time' is loaded as integer epoch milliseconds; turn it into datetime64 so
# the samples can be sliced by wall-clock window.
cdata2['Time'] = pd.to_datetime(arg=cdata2['Time'], unit='ms')
cdata2.info()
cdata2.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Time    895 non-null    datetime64[ns]
 1   User    895 non-null    float64       
 2   System  895 non-null    float64       
 3   IoWait  895 non-null    float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 28.1 KB


Unnamed: 0,Time,User,System,IoWait
0,2022-06-09 18:51:00,0.275084,0.108396,0.025021
1,2022-06-09 18:51:20,0.533709,0.249859,0.024975


In [14]:
# Per-window peaks of the 'User' CPU column, starting at the first sample of
# the TensorFlow Lite run, followed by their summary statistics.
cpu_usage = get_usage_list(
    data_frame=cdata2,
    initial_time='2022-06-09 18:51:00',
    column='User',
)
print("= CPU usage (%) =\n")
get_statistics(cpu_usage)

= CPU usage (%) =

Samples: [60.71724449791424, 66.27006076102047, 64.68484243914186, 60.94026716061575, 64.69974978095216, 65.85585889512826, 63.925034317584085, 63.75271334383459, 64.4925890084877, 64.80442569784759, 63.39051522286068, 63.48227180527895, 65.75009435422687, 63.08006089442739, 63.38263663489716, 64.76406954844981, 65.91994222176432, 65.78995831921632, 64.99019724173311, 64.05935972329829, 66.26432533519868, 67.13215684612562, 66.83678530752515, 65.72041874091165, 64.65057901007884, 66.07559187857969, 64.49953568290925, 66.10523877337891, 66.09916792644948, 64.57854021497359]

Mean: 64.75714105282702 

Median: 64.73190966470099

Mode: 60.71724449791424

Mimimum: 60.71724449791424 

Maximum: 67.13215684612562

Values for confidence interval of 95%: 

  - lower limit: 64.18982583101926 

  - upper limit: 65.32445627463478


### Memory

In [15]:
# Load the memory samples of the TensorFlow Lite run, then show the frame's
# schema and its first two rows.
mdata2 = pd.read_csv(filepath_or_buffer='tensorflow-lite/mem.csv')
mdata2.info()
mdata2.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597 entries, 0 to 596
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    597 non-null    int64  
 1   used    597 non-null    float64
 2   total   597 non-null    int64  
dtypes: float64(1), int64(2)
memory usage: 14.1 KB


Unnamed: 0,Time,used,total
0,1654800660000,1843084000.0,4053430272
1,1654800690000,1842512000.0,4053430272


In [16]:
# 'Time' is loaded as integer epoch milliseconds; turn it into datetime64 so
# the samples can be sliced by wall-clock window.
mdata2['Time'] = pd.to_datetime(arg=mdata2['Time'], unit='ms')
mdata2.info()
mdata2.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597 entries, 0 to 596
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Time    597 non-null    datetime64[ns]
 1   used    597 non-null    float64       
 2   total   597 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 14.1 KB


Unnamed: 0,Time,used,total
0,2022-06-09 18:51:00,1843084000.0,4053430272
1,2022-06-09 18:51:30,1842512000.0,4053430272


In [17]:
# Per-window peaks of the 'used' memory column for the TensorFlow Lite run.
mem_usage = get_usage_list(
    data_frame=mdata2,
    initial_time='2022-06-09 18:51:00',
    column='used',
)
# Scale by 1e9 for the GB report (presumably 'used' is in bytes; verify against the collector).
mem_usage_gb = [peak / 1_000_000_000 for peak in mem_usage]
print("= Memory usage (GB) =\n")
get_statistics(mem_usage_gb)

= Memory usage (GB) =

Samples: [1.8949884586666668, 1.9036214613333333, 1.8955537066666668, 1.9088288426666666, 1.9208628906666667, 1.9046427306666667, 1.894203392, 1.9216643413333332, 1.9131706026666668, 1.9149550933333332, 1.9259050666666668, 1.90984192, 1.906077696, 1.9015857493333332, 1.9093189973333333, 1.931309056, 1.9192941226666667, 1.923547136, 1.9213407573333332, 1.92382976, 1.9362474666666667, 1.9307410773333333, 1.9462826666666668, 1.943441408, 1.918328832, 1.9451139413333332, 1.9405073066666667, 1.935839232, 1.9433035093333333, 1.941655552]

Mean: 1.9208667591111113 

Median: 1.921101824

Mode: 1.894203392

Mimimum: 1.894203392 

Maximum: 1.9462826666666668

Values for confidence interval of 95%: 

  - lower limit: 1.9148819274496398 

  - upper limit: 1.9268515907725827


### Build and delivery (Stage II) time

In [18]:
# Read the TensorFlow Lite Stage II measurements: one integer per line.
build_time2 = []
with open('tensorflow-lite/build_time.txt') as f:
    build_time2.extend(int(line.rstrip()) for line in f)

In [19]:
# Summary statistics of the TensorFlow Lite Stage II times, in the same
# report layout that get_statistics prints.
mean_val = np.mean(build_time2)

# The confidence level is passed positionally: its keyword was renamed from
# ``alpha`` to ``confidence`` in newer SciPy releases, while the positional
# form is accepted by both.
lower, upper = st.t.interval(.95, df=len(build_time2)-1, loc=mean_val, scale=st.sem(build_time2))

print("= Stage II time (s) =\n")

# ``st.mode`` yields a length-1 array on older SciPy and a scalar on newer
# releases; ``np.atleast_1d`` makes the element lookup valid for both.
print(f"Samples: {build_time2}"
      f"\n\nMean: {mean_val} \n\nMedian: {np.median(build_time2)}"
      f"\n\nMode: {np.atleast_1d(st.mode(build_time2)[0])[0]}"
      f"\n\nMimimum: {min(build_time2)} \n\nMaximum: {max(build_time2)}"
      f"\n\nValues for confidence interval of 95%: \n\n"
      f"  - lower limit: {lower} \n\n  - upper limit: {upper}")

= Stage II time (s) =

Samples: [55, 55, 55, 56, 55, 55, 55, 56, 55, 56, 56, 55, 55, 56, 56, 55, 55, 56, 55, 56, 55, 54, 54, 55, 55, 55, 55, 55, 54, 56]

Mean: 55.2 

Median: 55.0

Mode: 55

Mimimum: 54 

Maximum: 56

Values for confidence interval of 95%: 

  - lower limit: 54.972126233972666 

  - upper limit: 55.42787376602734
