# Data visualization
- [Time-related behaviour: Time series](#time-related-behaviour-time-series)
- [Time-related behaviour: Heat maps](#time-related-behaviour-heat-maps)
- [Distribution behaviour: Histograms](#distribution-behaviour-histograms)
- [Distribution behaviour: Box plots](#distribution-behaviour-box-plots)
- [Relational behaviour: Scatter plots](#relational-behaviour-scatter-plot)
- [Relational behaviour: Pair plots](#relational-behaviour-pair-plots)
- [Relational behaviour: Correlation plots](#relational-behaviour-correlation-plots)

**Dataset used**
- iof_data.csv

In [None]:
# Data links
# Every data set is served from the same Google Drive download endpoint;
# only the file id differs, so the full URL is assembled per entry.
data_url = {
    name: "https://drive.google.com/uc?id=" + file_id
    for name, file_id in (
        ('iof_data_1min_csv', "1_jYVXj7mt8Zzpjn8WGI111G-kWRTbfjU"),
        ('iof_data_1min_parq', "1j5SS136UzbSPu8TqG9RRUMi6-wWF9dzq"),
        ('mixingTank', "1b5Qn5LIa6KAE03Tq4yRVdhTyUmZLxRjt"),
        ('moons', "1a9zTkPEpuHGj6LzGzuLe-JSLg_4GJef4"),
        ('open_iof_20min', "15lkhdBfWnjlpgpEx4T2XcRApKr-dmBb0"),
        ('open_iof_cleaned', "1WVbJvYsGy-iKlsW4WaDZrKy_NhK2tJLW"),
    )
}

In [None]:
# Import appropriate packages and assign shorthand aliases
import pandas as pd # pandas: for data ingestion and manipulation
import numpy as np # numpy: for mathematical and algebraic manipulation
import seaborn as sns # seaborn: for advanced plotting
import matplotlib.pyplot as plt # matplotlib.pyplot: for general plotting
import matplotlib.dates as mdates # matplotlib.dates: for adding dates to plots
from scipy.stats import zscore # function for scaled heatmap

In [None]:
# Load the 1-minute data set directly from its download link
fname = data_url['iof_data_1min_csv']
date_col = 'date'
# The date column is parsed as timestamps and becomes the row index
df = pd.read_csv(
    fname,
    parse_dates=[date_col],
    index_col=date_col,
)

In [None]:
# Summary statistics, transposed so that each variable is shown as a row
df.describe().transpose()

## Time-related behaviour: Time series

In [None]:
# Time series plot example: raw observations with a rolling median on top

# Create axis
fig, ax = plt.subplots(figsize=(10,5))
# - fig = figure object that contains canvas with axes inside
# - ax = axes object that represents single axes for plotting

# Plot time series
# - Line arguments:
# alpha: transparency i.e., 0 for fully transparent, 1 for fully opaque
# ls / linestyle: i.e., '-' for solid, '--' for dashed, ':' for dotted
# lw / linewidth: e.g., 0.5 for thin, 2 for thick (continuous)
# color: line colour e.g. name of colour 'blue', or RGB value [0,0,1] and other formats
# marker: marker style i.e., 'o', 'v', '^', 's'
# mec / markeredgecolor: colour specified as above
# mfc / markerfacecolor: colour specified as above
# ms / markersize: size of the marker (a number), e.g. 1 for small
# label: name of line to be used in legend

# Window of the rolling median; reused in the legend label so that the
# label always describes the window that is actually plotted
rolling_window = '12h'

# Raw observations: markers only, nearly transparent because there are many points
ax.plot(df['plant.flotation.sump01.amina.flow'],
        alpha=0.1, ls='none',
        marker='o',mec='blue',mfc='white',ms=1,
        label='Amina Flow [m3/h]')
# Rolling median: solid black line drawn over the raw observations
ax.plot(df['plant.flotation.sump01.amina.flow'].rolling(rolling_window).median(),
        ls = '-', lw=2, color=[0,0,0],
        label=f'Amina Flow [m3/h] - Rolling {rolling_window} median')
ax.set_xlabel('Time')
ax.set_ylabel('Amina Flow [m3/h]')
ax.set_title('Time series plot of amina flow')
ax.grid(True) # adds grid lines
ax.legend(loc='lower left') # adds legend at specific location

# Apply date formatting convention
# - define date format
ax.xaxis.set_major_formatter(mdates.DateFormatter('%y-%m-%d %H'))
# - automatic rotation of tick labels to prevent overlapping
fig.autofmt_xdate()

# Save figure
fig.savefig('my_time_series.png',dpi=300) #dpi = dots per inch

Large process datasets (many observations/rows) often result in very crowded plots, due to the large number of points and the high level of noise. To create clearer plots, options include:
- Focus on a smaller period of interest (e.g., a few weeks), limiting the number of observations to around 10000
- Downsample the observations to a longer sampling interval (e.g., from per-minute values to hourly or daily values)

In [None]:
# Time series plot example: restrict the plot to a smaller period of interest

# Create axis
fig, ax = plt.subplots(figsize=(10,5))

# Plot only the observations whose timestamp falls in month 5 (May)
ax.plot(
    df.loc[df.index.month==5,'plant.flotation.sump01.amina.flow'],
    label='Amina Flow [m3/h]',
    ls='none',
    marker='o',
    ms=1,
    mec='blue',
    mfc='white',
    alpha=0.2,
)

# Annotations
ax.set_title('Time series plot of amina flow: May 2017')
ax.set_xlabel('Time')
ax.set_ylabel('Amina Flow [m3/h]')
ax.grid(True) # grid lines
ax.legend(loc='lower left') # legend in the lower-left corner

# Date formatting convention: two-digit year, month, day and hour
date_format = mdates.DateFormatter('%y-%m-%d %H')
ax.xaxis.set_major_formatter(date_format)
fig.autofmt_xdate()

In [None]:
# Downsample: replace the observations in every 2-hour bin by their median
downsample_df = df.resample(rule='2h').median()
# Summary statistics of the downsampled data, one variable per row
downsample_df.describe().transpose()

In [None]:
# Time series plot example: same variable, using the downsampled data

# Create axis
fig, ax = plt.subplots(figsize=(10,5))

# Plot the downsampled values as small markers without a connecting line
ax.plot(
    downsample_df['plant.flotation.sump01.amina.flow'],
    label='Amina Flow [m3/h] - Downsampled (median)',
    ls='none',
    marker='o',
    ms=1,
    mec='blue',
    mfc='white',
)

# Annotations
ax.set_title('Time series plot of amina flow: Downsampled')
ax.set_xlabel('Time')
ax.set_ylabel('Amina Flow [m3/h]')
ax.grid(True) # grid lines
ax.legend(loc='lower left') # legend in the lower-left corner

# Date formatting convention: two-digit year, month, day and hour
date_format = mdates.DateFormatter('%y-%m-%d %H')
ax.xaxis.set_major_formatter(date_format)
fig.autofmt_xdate()

In [None]:
# Time series plot: Two variables on two different y-axes sharing one time axis
# Create axis
fig, ax = plt.subplots(figsize=(10,5))

# Apply date formatting convention
ax.xaxis.set_major_formatter(mdates.DateFormatter('%y-%m-%d %H'))
fig.autofmt_xdate()

# Plot time series: First set (left y-axis)
ax.plot(downsample_df['plant.flotation.sump01.amina.flow'],color='black',alpha=0.2,label='Amina Flow [m3/h] - Downsampled Median')

# Plot time series: Second set (right y-axis)
ax2 = ax.twinx() # creates a second axes 'ax2' that shares the same x-axis as 'ax'
ax2.plot(downsample_df['plant.flotation.sump01.starch.flow'],color='red',alpha=0.2,label='Starch Flow [m3/h] - Downsampled Median')

# Annotations
ax.set_xlabel('Time')
ax.set_ylabel('Amina Flow [m3/h]')
ax.set_title('Time series plot of amina and starch flows')
ax.legend(loc='lower left')

# Right axis: same unit notation as the legend label, coloured red to match its line
ax2.set_ylabel('Starch Flow [m3/h]')
ax2.legend(loc='upper right')
ax2.yaxis.label.set_color('red')
ax2.tick_params(axis='y', colors='red')

# Notice that you have to add legends separately for 'ax' and 'ax2'


In [None]:
## Time series plot (continued): shade three periods of interest
# Create axis
fig, ax = plt.subplots()

# Apply date formatting convention (month-day hour:minute)
date_format = mdates.DateFormatter('%m-%d %H:%M')
ax.xaxis.set_major_formatter(date_format)
fig.autofmt_xdate()

# Plot time series
ax.plot(
    downsample_df['plant.flotation.sump01.amina.flow'],
    label='Amina Flow [m3/h] - Downsampled Median',
    color="black",
    alpha=0.2,
)

# Start and end timestamps of each period
p1start, p1end = '2017-03-11 01:00:00', '2017-03-15 11:00:00'
p2start, p2end = '2017-04-01 12:00:00', '2017-05-02 22:00:00'
p3start, p3end = '2017-08-01 00:30:00', '2017-09-01 04:30:00'

# Shade each period as a vertical band with its own colour and legend entry
ax.axvspan(xmin=p1start, xmax=p1end, color="blue", alpha=0.2, label="Period 1")
ax.axvspan(xmin=p2start, xmax=p2end, color="orange", alpha=0.2, label="Period 2")
ax.axvspan(xmin=p3start, xmax=p3end, color="green", alpha=0.2, label="Period 3")
ax.legend(loc='lower left')

In [None]:
## Variable group of interest
# All feed variables share the same tag prefix; only the measurement part differs
feed_variables = [
    'plant.flotation.sump01.' + measurement
    for measurement in (
        'starch.flow',
        'amina.flow',
        'discharge.flow',
        'discharge.ph',
        'discharge.density',
    )
]

In [None]:
## Time series plots (continued)
# Create one subplot row per feed variable, stacked in a single column.
# The number of rows and the figure height follow len(feed_variables), so the
# cell keeps working when variables are added to or removed from the list.
n_vars = len(feed_variables)
fig, ax = plt.subplots(n_vars,1,sharex=True,figsize=(10,4*n_vars),squeeze=False)
# squeeze=False always returns a 2-D array of axes (even for a single row);
# take the only column to get a 1-D sequence of axes
ax = ax[:,0]
# sharex = True allows zooming to be shared between plots
# figsize (width, height)
# Apply date formatting convention
ax[0].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H'))
fig.autofmt_xdate()
# Subset to May once, instead of repeating the selection for every variable
may_df = downsample_df.loc[downsample_df.index.month==5,feed_variables]
# Pair each axes with its variable rather than indexing both by position
for axis, variable in zip(ax, feed_variables):
    axis.plot(may_df[variable],label=variable,color='blue',alpha=0.2,ls='none',marker='.')
    axis.legend()


## Time-related behaviour: Heat maps

In [None]:
## Heat map of the feed variables over time, using scaled values
fig, ax = plt.subplots(figsize=(10,5))

# z-score every feed variable over the May samples, then transpose so that
# variables are rows (y-axis) and samples are columns (x-axis)
dfs = downsample_df.loc[downsample_df.index.month==5,feed_variables].apply(zscore).transpose()

# Draw the heat map
im = ax.pcolormesh(dfs,cmap='rainbow')

# Variable names on the y-axis
# - each row spans one unit, so the row midpoints are 0.5, 1.5, ...
m = len(feed_variables)
y_mid_points = np.arange(m) + 0.5
ax.set_yticks(y_mid_points)
ax.set_yticklabels(feed_variables)

# Axis titles
ax.set_xlabel('Sample')
ax.set_ylabel('Variable')

# Colour bar showing the scale of the plotted values
cbar = fig.colorbar(im)
cbar.set_label('Scaled value')

## Distribution behaviour: Histograms

In [None]:
## Add categorical variable that indicates periods
# Label -> (start, end) timestamps of each period, as defined for the shaded plot
period_bounds = {
    'Period 1': (p1start, p1end),
    'Period 2': (p2start, p2end),
    'Period 3': (p3start, p3end),
}
# The same labelling is applied to the full-resolution and the downsampled data
for frame in (df, downsample_df):
    # Start from an object-dtype column of NaN: a plain `np.nan` column is float,
    # and writing string labels into a float column is an incompatible-dtype
    # assignment (deprecated in recent pandas versions)
    frame['Period'] = pd.Series(np.nan, index=frame.index, dtype=object)
    for period_label, (period_start, period_end) in period_bounds.items():
        # Label-based slicing on the datetime index; rows outside every period stay NaN
        frame.loc[period_start:period_end,'Period'] = period_label

In [None]:
## Histogram of the downsampled amina flow (seaborn)
sns.histplot(
    x='plant.flotation.sump01.amina.flow',
    data=downsample_df,
)

In [None]:
## Histograms with groups (seaborn): one colour per value of 'Period'
sns.histplot(
    x='plant.flotation.sump01.amina.flow',
    hue="Period",
    data=downsample_df,
)


In [None]:
# Histogram with groups - time-based (seaborn)
# One histogram panel (column) per calendar month of the timestamp index
sns.displot(
    data=downsample_df,
    col=downsample_df.index.month,
    x="plant.flotation.sump01.amina.flow",
)

## Distribution behaviour: Box plots

In [None]:
## Box plot of the downsampled amina flow (seaborn)
# Tall, narrow figure (width 5, height 10) for a single vertical box
fig = plt.figure(figsize=(5,10))
sns.boxplot(
    y='plant.flotation.sump01.amina.flow',
    data=downsample_df,
)

In [None]:
## Box plots per group (seaborn): one vertical box per value of 'Period'
sns.boxplot(
    x='Period',
    y='plant.flotation.sump01.amina.flow',
    data=downsample_df,
)


In [None]:
## Box plots per group - horizontal (seaborn)
# Swapping x and y relative to the vertical version puts 'Period' on the y-axis
sns.boxplot(
    y='Period',
    x='plant.flotation.sump01.amina.flow',
    data=downsample_df,
)

## Relational behaviour: Scatter plot

In [None]:
## Scatter plots: Relationship between two variables
# jointplot draws the scatter plot and adds a histogram of each variable on the margins
sns.jointplot(
    data=downsample_df,
    x='plant.flotation.sump01.amina.flow',
    y='plant.flotation.sump01.starch.flow',
)


In [None]:
# Scatter plot with hue: points coloured by the value of 'Period'
sns.jointplot(
    x='plant.flotation.sump01.amina.flow',
    y='plant.flotation.sump01.starch.flow',
    hue='Period',
    data=downsample_df,
)


In [None]:
# Scatter plot with hue (indicating passage of time)
# Time elapsed since the first row of the downsampled data
sample_number = downsample_df.index - downsample_df.index[0]
# Points are coloured by the whole number of days elapsed
sns.scatterplot(
    x='plant.flotation.sump01.amina.flow',
    y='plant.flotation.sump01.starch.flow',
    hue=sample_number.days,
    palette='Spectral', # see this link for colormap options https://matplotlib.org/stable/users/explain/colors/colormaps.html
    legend="brief", # only gives legend for a few sequential points
    data=downsample_df,
)


In [None]:
## Scatter plots: Relationship between two variables
# kind='reg' additionally estimates and displays a simple regression line
sns.jointplot(
    x='plant.flotation.sump01.amina.flow',
    y='plant.flotation.sump01.starch.flow',
    kind='reg',
    data=downsample_df,
)

## Relational behaviour: Pair plots

In [None]:
## Pairplots: Overview of all relationships between pairs of variables
# - vars: restricts the plot to the feed variables
# - diag_kind='kde': kernel density estimate of each distribution on the diagonal
# - corner=True: only the lower triangle of the grid is drawn
sns.pairplot(
    data=downsample_df,
    vars=feed_variables,
    corner=True,
    diag_kind='kde',
)
plt.tight_layout()

In [None]:
## Pairplots: Overview of all relationships between pairs of variables (continued)
# Same feed variables, with points coloured by the value of 'Period'
sns.pairplot(
    data=downsample_df,
    vars=feed_variables,
    corner=True,
    hue='Period',
)
plt.tight_layout()

## Relational behaviour: Correlation plots

In [None]:
## Correlation plots
# Pairwise correlations of the feed variables (DataFrame.corr defaults to Pearson),
# rounded to 2 decimal places so the annotations are easy to read
corr = downsample_df[feed_variables].corr().round(2)
# Boolean mask that is True on and above the diagonal, used to hide the
# upper triangle and keep the view simple
mask = np.triu(np.ones(corr.shape, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 10))
# Draw the masked heat map with square cells, annotated values and a
# colour scale fixed to the range [-1, 1]
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    square=True,
    vmin=-1,
    vmax=1,
    cmap=plt.cm.bwr,
    ax=ax,
)
plt.title('Pearson Correlation')
plt.tight_layout()