# The Impact of Ambient Temperature on Server Efficiency


Hypothesis: Server power consumption increases as temperature increases, reducing server efficiency. As PUE values approach 1, an increasing portion of the data centre's power is used in the servers; therefore there is likely to be a trade-off on operating temperature, depending on the cooling infrastructure and the number of servers in the data centre.

------

Plan:

- Load in all of the SERT results avoiding any invalid ones
- Merge data as needed 
- Generate graphs showing power consumption against load and temperature
- Find a trade-off between operating temperature and number of servers. 


In [1]:
import numpy as np
import pandas as pd
import re
import os.path
from os import makedirs
import glob
import seaborn as sns
import matplotlib.pyplot as plt
from parse_results import process_results_xml
import yaml

In [13]:

# Load the optional user settings. Every value read below falls back to a
# default when settings.yaml, or the individual key, is absent.
if os.path.isfile('settings.yaml'):
    with open('settings.yaml', 'r') as f:
        # safe_load only constructs plain Python objects. Calling yaml.load()
        # without a Loader is deprecated (it raised a warning in this cell)
        # and is rejected by PyYAML >= 6.
        # An empty settings file parses to None, hence the `or {}`.
        params = yaml.safe_load(f) or {}
else:
    params = {}

# Input locations
sert_results_dir = params.get('results_dir', 'sert_results')
bios_setting_file = params.get('test_settings', 'test_settings.csv')
cpu_metrics_dir = params.get('cpu_metrics_dir', 'cpu_data')

# Where generated CSVs are cached, and whether to regenerate them
working_dir = params.get('temp_dir', 'temp_dir')
all_data_file = params.get('data_file', 'all_data.csv')
overwrite_data = params.get('overwrite_data', False)


# Echo the effective configuration so it is recorded with the notebook output
print(params)
print(sert_results_dir, bios_setting_file, cpu_metrics_dir, working_dir, all_data_file, overwrite_data)
    


{'results_dir': 'C:\\Users\\s.clement\\Techbuyer Group\\Dan Burdett- Techbuyer Europe - Results - PowerEdge', 'cpu_metrics_dir': 'C:\\Users\\s.clement\\OneDrive - Techbuyer Group\\cpu_data', 'test_settings': 'test_settings.csv', 'temp_dir': 'data', 'data_file': 'all_data.csv', 'overwrite_data': False}
C:\Users\s.clement\Techbuyer Group\Dan Burdett- Techbuyer Europe - Results - PowerEdge test_settings.csv C:\Users\s.clement\OneDrive - Techbuyer Group\cpu_data data all_data.csv False


  params = yaml.load(f)


# Generate and load the data (Run Once)

In [12]:
sert_data = pd.DataFrame()

# exist_ok makes this a no-op when the directory already exists
os.makedirs(working_dir, exist_ok=True)


# Rebuild the combined data set when there is no cached copy, or when the
# settings ask for it; otherwise load the cached CSV (else-branch below).
if not os.path.isfile(f'{working_dir}//{all_data_file}') or overwrite_data:

    # Per-result frames are collected in lists and concatenated once after the
    # loop. DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    metric_frames = []
    detail_frames = []
    score_frames = []
    invalid_results = []   # List of skipped results because they're invalid

    for f in glob.glob(f'{sert_results_dir}//**//results.xml', recursive=True):
        try:
            # Name of test directory -- sert-xxxx
            test_dir = os.path.dirname(f)
            test_name = os.path.basename(test_dir)

            # Look for the marker next to results.xml itself: the glob is
            # recursive, so the test directory is not necessarily a direct
            # child of sert_results_dir.
            if os.path.isfile(os.path.join(test_dir, 'invalid.png')):
                invalid_results.append(test_name)
                continue

            # Load the SERT result
            metrics, score, env = process_results_xml(f)

            file_df = pd.DataFrame.from_records(metrics)
            # Remove calibration runs but record the calibration score against each loadlevel to calculate actual loadlevel
            calibrations = file_df.loc[file_df['loadlevel'] == 'calibration', ['worklet', 'score']]
            calibrations = calibrations.rename(columns={'score': 'calibration-score'})

            file_df = pd.merge(file_df.drop(index=calibrations.index), calibrations, how='left', on='worklet')
            file_df['actual-load'] = file_df['score'] / file_df['calibration-score']
            file_df['test-name'] = test_name

            score_df = pd.DataFrame.from_records(score)
            score_df['test-name'] = test_name

            details_df = pd.DataFrame.from_records(env, index=[test_name])

            # Only record the result once every part of it has been built, so a
            # failure part-way through cannot leave a half-registered test.
            metric_frames.append(file_df)
            detail_frames.append(details_df)
            score_frames.append(score_df)

        except Exception as e:
            # Best effort: report the file and carry on with the others
            print(f, ': FAILED TO LOAD -- ', e, type(e))

    if len(invalid_results) > 0:
        print(f'Invalid results skipped: {invalid_results}')

    # pd.concat raises on an empty list, so fall back to empty frames
    metrics_data = pd.concat(metric_frames, ignore_index=True) if metric_frames else pd.DataFrame()
    test_details = pd.concat(detail_frames) if detail_frames else pd.DataFrame()
    scores = pd.concat(score_frames, ignore_index=True) if score_frames else pd.DataFrame()

    # Load external details for test
    if bios_setting_file != '' and os.path.isfile(bios_setting_file):
        settings = pd.read_csv(bios_setting_file, index_col=0)
    else:
        settings = pd.DataFrame()
    # The merge is an inner join on the test name. With no settings available
    # it would discard every test, so it is skipped in that case.
    if not settings.empty:
        test_details = pd.merge(test_details, settings, left_index=True, right_index=True)
    # TODO: Add pressure lookup here for tunnel tests

    # Join metrics and test details to results.
    sert_data = pd.merge(metrics_data, scores, how='left', on=['test-name', 'worklet', 'loadlevel', 'workload', 'score', 'watts-avg'])
    sert_data = pd.merge(sert_data, test_details, left_on='test-name', right_index=True)
    # Idle rows are pinned to a load of exactly zero
    sert_data.loc[sert_data['workload'] == 'Idle', 'actual-load'] = 0


    # Store generated data
    metrics_data.to_csv(f'{working_dir}//sert_metrics.csv', index=False)
    test_details.to_csv(f'{working_dir}//test_details.csv')
    scores.to_csv(f'{working_dir}//scores.csv', index=False)
    sert_data.to_csv(f'{working_dir}//{all_data_file}', index=False)

else:
    print('Loading SERT data from disk')
    sert_data = pd.read_csv(f'{working_dir}//{all_data_file}', parse_dates=['start', 'end'])

sert_data.describe()
        

Loading SERT data from disk


Unnamed: 0,score,watts-min,watts-max,watts-avg,temp-min,temp-max,temp-avg,calibration-score,actual-load,norm-score,ref-score,efficiency-score,dimm_size_mb,dimms,psu
count,3112.0,3112.0,3112.0,3112.0,3112.0,3112.0,3112.0,2736.0,2808.0,3040.0,3040.0,3040.0,3112.0,3112.0,3112.0
mean,320450.1,271.749987,282.537057,276.597605,22.910447,22.952241,22.932338,470526.8,0.609434,3.492761,90131.5035,12.44358,8192.0,8.0,750.0
std,602695.5,101.986656,104.603096,103.537904,2.051635,2.056999,2.053953,739611.4,0.296561,2.941399,136025.498985,8.793184,0.0,0.0,0.0
min,0.0,67.55,69.59,68.368525,20.0,20.0625,20.035714,192.223,0.0,0.471334,11.52,2.095409,8192.0,8.0,750.0
25%,5330.391,192.08,203.48,197.904355,21.1875,21.25,21.2375,30625.37,0.373776,1.615457,5437.5125,7.897912,8192.0,8.0,750.0
50%,46480.57,265.665,276.715,269.779194,22.5625,22.5625,22.5625,89699.57,0.50338,3.103832,15946.51,10.677639,8192.0,8.0,750.0
75%,244209.8,356.8075,371.3775,364.777328,24.0,24.0625,24.044643,359032.9,0.87695,4.391323,81279.8825,15.457764,8192.0,8.0,750.0
max,4142241.0,493.5,504.6,495.164516,29.375,29.375,29.375,2344183.0,1.048051,27.970126,354112.34,106.01377,8192.0,8.0,750.0


# Efficiency and power consumption measured by SERT

For the CPU worklets in particular, we can plot the benchmark load against the efficiency score achieved for each scenario. 

In [None]:
# Keep only the CPU worklets and the idle measurement.
# .copy() yields an independent frame, so adding the 'scenario' column below
# does not write into a slice of sert_data (SettingWithCopyWarning).
cpu = sert_data[sert_data['workload'].isin(['CPU', 'Idle'])].copy()
# One label per server configuration: "<model> - <cpu>"
cpu['scenario'] = [' - '.join(s) for s in zip(cpu['model'], cpu['cpu'])]

# Both figures use only the runs with an average temperature below 23.5
below_23_5 = cpu[cpu['temp-avg'] < 23.5]

# Second-order fits, one panel per BIOS setting, one colour per scenario
sns.lmplot(x='actual-load', y='efficiency-score', hue='scenario', col='bios',
           data=below_23_5, order=2, truncate=True, scatter=True).fig.suptitle('CPU Worklet Efficiency Scores', y=1.1)
sns.lmplot(x='actual-load', y='watts-avg', hue='scenario', col='bios',
           data=below_23_5, order=2, truncate=True, scatter=True).fig.suptitle('CPU Worklet Power Consumption', y=1.1)

A cleaner plot without the individual data points plotted for each SERT run. 

In [None]:
# Single-panel version of the efficiency fit: regression curves only
# (scatter=False), restricted to runs with an average temperature below 22.5.
sns.lmplot(
    data=cpu[cpu['temp-avg'] < 22.5],
    x='actual-load',
    y='efficiency-score',
    hue='scenario',
    order=2,
    truncate=True,
    scatter=False,
)
ax = plt.gca()
ax.set_title('CPU Worklet Efficiency Scores ( Test Temperature < 22.5C)')

The environmental conditions for the tests are as follows:

In [None]:
# Summarise the average test temperature per scenario: mean first, then variance.
temps_by_scenario = cpu.groupby('scenario')['temp-avg']
print('Mean\n', temps_by_scenario.mean())
print('\nVariance\n', temps_by_scenario.var())

# CPU Power

The CPU is usually considered the driver of most power consumption in the server (excluding any expansion cards). During the SERT tests we have also recorded low-level performance registers of the CPU like per-core frequency and also power consumption. 

Todo: 
- Determine relationship between chassis and CPU power consumption
    - Assume power = P_Idle + P_Chassis + P_CPU
    - Is P_Chassis a function of CPU power?
    
    
Read the CPU power data in and summarise for the tests

In [None]:
cpu_metrics = pd.DataFrame()

# Rebuild the combined CPU log when there is no cached copy, or when the
# settings ask for it; otherwise load the cached CSV (else-branch below).
if not os.path.exists(f'{working_dir}//cpu_metrics.csv') or overwrite_data:

    # Collect each log and concatenate once; DataFrame.append was removed in
    # pandas 2.0 and copied the accumulated frame on every call.
    sample_frames = []

    # NOTE(review): glob only treats '**' as recursive when it is a whole path
    # component, so '**.csv' matches CSVs directly inside cpu_metrics_dir only.
    # Pattern left unchanged to keep the same set of files -- confirm whether
    # sub-directories were meant to be included.
    for f in glob.glob(f'{cpu_metrics_dir}//**.csv', recursive=True):
        try:
            # The column header is read after skipping the first 8 lines of the file
            samples = pd.read_csv(f, skiprows=8, header=0, index_col=0, parse_dates=['Time'], encoding='cp1252')
            if not pd.api.types.is_datetime64_any_dtype(samples.index.dtype):
                # Final row contains "Session end:"
                samples = samples.drop('Session end:')
                samples.index = pd.to_datetime(samples.index)

            sample_frames.append(samples)
        except Exception as e:
            # Best effort: skip the file but say why. Unlike a bare `except:`,
            # this does not swallow KeyboardInterrupt.
            print(f'FAILED LOADING FILE: {f} -- {e!r}')

    # pd.concat raises on an empty list; keep the empty frame in that case
    if sample_frames:
        cpu_metrics = pd.concat(sample_frames)

    cpu_metrics.sort_index(inplace=True)
    cpu_metrics['total cpu power'] = cpu_metrics['CPU 0 Power'] + cpu_metrics['CPU 1 Power']

    cpu_metrics.to_csv(f'{working_dir}//cpu_metrics.csv')

else:
    cpu_metrics = pd.read_csv(f'{working_dir}//cpu_metrics.csv', index_col='Time', parse_dates=['Time'])
    

In [None]:
def _mean_during_test(row, series):
    """Return the mean of *series* between the row's 'start' and 'end' times.

    Timezone information is stripped from both timestamps before they are
    used to slice *series* by its index.
    """
    window_start = row['start'].tz_localize(None)
    window_end = row['end'].tz_localize(None)
    return series[window_start:window_end].mean()


# Per-sample averages across all columns whose name matches each pattern
for label, pattern in (('avg-temp', 'Temp'), ('avg-load', 'load'), ('avg-freq', 'speed')):
    cpu_metrics[label] = cpu_metrics.filter(regex=pattern).mean(axis=1, skipna=True)


# CPU power averaged over each test window; whatever remains of the measured
# wall power is attributed to the chassis.
sert_data['cpu-power'] = sert_data.apply(_mean_during_test, axis=1, args=(cpu_metrics['total cpu power'],))
sert_data['chassis-power'] = sert_data['watts-avg'] - sert_data['cpu-power']

# Remaining CPU telemetry, averaged over the same test windows
for target, source in (('cpu-temp', 'avg-temp'), ('cpu-load', 'avg-load'), ('cpu-freq', 'avg-freq')):
    sert_data[target] = sert_data.apply(_mean_during_test, axis=1, args=(cpu_metrics[source],))



Looking again at the server efficiency scores, but now using the CPU utilisation data from the OS rather than the load data calculated by SERT. SERT load is a proportion of the total score/transactions achieved during the calibration runs. OS CPU utilisation is the proportion of time the CPU is busy performing operations. 

In [None]:
# Idle and CPU worklet rows only.
# .copy() yields an independent frame, so adding the 'scenario' column below
# does not write into a slice of sert_data (SettingWithCopyWarning).
cpu_worklets = sert_data[sert_data['workload'].isin(['Idle', 'CPU'])].copy()
# One label per server configuration: "<model> - <cpu>"
cpu_worklets['scenario'] = [' - '.join(x) for x in zip(cpu_worklets['model'], cpu_worklets['cpu'])]
# Efficiency against the OS-reported CPU load, for runs with an average temperature below 22.5
sns.lmplot(x='cpu-load', y='efficiency-score', hue='scenario', col='bios', data=cpu_worklets[cpu_worklets['temp-avg'] < 22.5], order=2, truncate=True, scatter=True).fig.suptitle('Efficiency Scores ( Test Temperature < 22.5C)', y=1.1)


This is a significantly different relationship than that shown for the SERT load. 

Breaking down the performance per server and per worklet.

In [None]:
# One figure per scenario: efficiency against OS CPU load, coloured by worklet,
# with one panel per BIOS setting.
for server in cpu_worklets['scenario'].unique():
    server_rows = cpu_worklets[cpu_worklets['scenario'] == server]
    sns.lmplot(
        data=server_rows,
        x='cpu-load',
        y='efficiency-score',
        hue='worklet',
        col='bios',
        order=2,
        truncate=True,
        scatter=True,
    ).fig.suptitle(f'Efficiency Scores - {server}', y=1.1)


# CPU power consumption

In [None]:
# Reshape to long form so the three power measures share one 'value' column
# and can be drawn as separate hues against the SERT load.
plotdf = cpu_worklets.melt(
    id_vars='actual-load',
    value_vars=['watts-avg', 'cpu-power', 'chassis-power'],
)

sns.lmplot(data=plotdf, x='actual-load', y='value', hue='variable', order=2)

In [None]:
# Power breakdown per scenario against the SERT load, one panel per BIOS setting.
for server in cpu_worklets['scenario'].unique():
    server_rows = cpu_worklets[cpu_worklets['scenario'] == server]
    # Long form: one row per (load, bios, power measure)
    plotdf = server_rows.melt(
        id_vars=['actual-load', 'bios'],
        value_vars=['watts-avg', 'cpu-power', 'chassis-power'],
    )

    sns.lmplot(
        data=plotdf, x='actual-load', y='value', col='bios', hue='variable', order=2,
    ).fig.suptitle(f'Power Breakdown - {server}', y=1.1)

In [None]:
# Power breakdown per scenario against the OS-reported CPU load, one panel per BIOS setting.
for server in cpu_worklets['scenario'].unique():
    server_rows = cpu_worklets[cpu_worklets['scenario'] == server]
    # Long form: one row per (load, bios, power measure)
    plotdf = server_rows.melt(
        id_vars=['cpu-load', 'bios'],
        value_vars=['watts-avg', 'cpu-power', 'chassis-power'],
    )

    sns.lmplot(
        data=plotdf, x='cpu-load', y='value', col='bios', hue='variable', order=2,
    ).fig.suptitle(f'Power Breakdown - {server}', y=1.1)

In [None]:
# Narrow to the CPU/Idle rows recorded on the PowerEdge R620.
is_cpu_or_idle = sert_data['workload'].isin(['CPU', 'Idle'])
is_r620 = sert_data['model'] == 'PowerEdge R620'
cpu = sert_data[is_cpu_or_idle & is_r620]

# Wall power against the average test temperature, coloured by CPU model
sns.scatterplot(data=cpu, x='temp-avg', y='watts-avg', hue='cpu')

In [None]:
# Second-order fit of CPU power against the OS-reported CPU load
sns.lmplot(
    data=cpu,
    x='cpu-load',
    y='cpu-power',
    order=2,
)

In [None]:
# CPU power against CPU temperature, coloured by the average test temperature.
# `ax` holds the object returned by plt.scatter; it is passed to the colour bar explicitly.
ax = plt.scatter(cpu['cpu-temp'], cpu['cpu-power'], c=cpu['temp-avg'])
cbar = plt.colorbar(ax)
cbar.ax.set_ylabel('Ambient Temp', rotation=90)
plt.xlabel('CPU Temp')
plt.ylabel('CPU Power')
# NOTE(review): the title names a specific CPU and BIOS mode, but no active
# filter in this notebook restricts `cpu` to them -- confirm.
plt.title('R620 E5-2690, Efficiency (DPAC) Mode')

In [None]:
# CPU power against CPU temperature, coloured by the OS-reported CPU load.
# `ax` holds the object returned by plt.scatter; it is passed to the colour bar explicitly.
ax = plt.scatter(cpu['cpu-temp'], cpu['cpu-power'], c=cpu['cpu-load'])
cbar = plt.colorbar(ax)
cbar.ax.set_ylabel('CPU Load', rotation=90)
plt.xlabel('CPU Temp')
plt.ylabel('CPU Power')
# NOTE(review): the title names a specific CPU and BIOS mode, but no active
# filter in this notebook restricts `cpu` to them -- confirm.
plt.title('R620 E5-2690, Efficiency (DPAC) Mode')

In [None]:
# CPU power against CPU frequency, coloured by the OS-reported CPU load.
# `ax` holds the object returned by plt.scatter; it is passed to the colour bar explicitly.
ax = plt.scatter(cpu['cpu-freq'], cpu['cpu-power'], c=cpu['cpu-load'])
cbar = plt.colorbar(ax)
cbar.ax.set_ylabel('CPU Load', rotation=90)
plt.xlabel('CPU Freq')
plt.ylabel('CPU Power')
# NOTE(review): the title names a specific CPU and BIOS mode, but no active
# filter in this notebook restricts `cpu` to them -- confirm.
plt.title('R620 E5-2690, Efficiency (DPAC) Mode')