# Part 2: Data Processing

This Jupyter notebook is designed to help you process the data you collected with the Raspberry Pi photometer.
You will need to upload the data you want to process using the main Jupyter hub page and adapt the code cells accordingly.

Cells that you will need to edit to input your own data are marked by comment lines with a `***` in them.
You can, of course, adapt the other cells if you wish, for example if you want to change the graphs or the names of output files.

In contrast to the workbooks in Part 1a, only brief explanations of what the code in each cell does are given with inline comments.
This is more like how a Jupyter notebook would be written and used for data analysis in a research setting.
If it is not clear what a code cell does or how something works, you can ask a GTA to explain it to you.

In [None]:
# *** Enter your own calibration data in this cell. ***

# The calibration curve is held in two lists that are read in parallel:
# the i-th entry of cal_read is the reading recorded for the i-th entry of cal_conc.
#   cal_conc = concentrations
#   cal_read = photometer readings

cal_conc = [
    1e-3,
    8e-4,
    5e-4,
    2.5e-4,
    1.25e-4,
    6.25e-5,
    1e-4,
    1e-5,
    ]

cal_read = [
    0.066,
    0.093,
    0.161,
    0.250,
    0.277,
    0.337,
    0.319,
    0.565,
    ]

In [None]:
# Fit a straight line through the calibration data using SciPy.

from scipy.stats import linregress

cal_fit = linregress(cal_conc, cal_read)

# The object returned by linregress exposes the fit results as named attributes:
#   cal_fit.slope     = slope of the fitted line
#   cal_fit.intercept = intercept of the fitted line
#   cal_fit.rvalue    = correlation coefficient; +1 = perfect positive correlation, -1 = perfect negative correlation

# Print the three results, one per line.

print(
    "Slope: {0:.2f}".format(cal_fit.slope),
    "Intercept: {0:.2f}".format(cal_fit.intercept),
    "R Value: {0:.3f}".format(cal_fit.rvalue),
    sep = "\n"
    )

In [None]:
# Lowest and highest concentrations covered by the calibration.
# (Used later to draw the fit line, set the x-axis limits and filter the measured data.)

conc_min, conc_max = min(cal_conc), max(cal_conc)

In [None]:
# Set up Matplotlib with a custom "setup function".
# (Sets some useful default behaviour for journal-quality figures.)

import matplotlib as mpl

def SetupMatplotlib():
    """Set the Matplotlib defaults used by the figures in this notebook.

    Configures, through ``mpl.rc``, a serif font (Times New Roman) at size 8,
    a width of 0.5 for axes, lines and marker edges, and inward-pointing
    ticks drawn on all four sides of the axes. Takes no arguments and
    returns nothing.
    """
    font_size = 8
    line_width = 0.5

    # One (rc group, options) pair per mpl.rc call; applied in the order listed.
    rc_settings = (
        ('font', {'family': 'serif', 'size': font_size, 'serif': 'Times New Roman'}),
        ('axes', {'linewidth': line_width}),
        ('lines', {'linewidth': line_width, 'markeredgewidth': line_width}),
        ('xtick', {'direction': 'in', 'top': True, 'bottom': True}),
        ('ytick', {'direction': 'in', 'left': True, 'right': True}),
        )

    for group, options in rc_settings:
        mpl.rc(group, **options)

# Apply the settings straight away, before any figures are created below.
SetupMatplotlib()

In [None]:
%matplotlib inline

# Draw the calibration curve: the measured points plus the fitted straight line.
# This makes use of some of the advanced Matplotlib topics for controlling the style of plots.

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# New figure at 8.6 x 7 cm (the sizes are divided by 2.54 to convert cm to inches).

plt.figure(
    figsize = tuple(size_cm / 2.54 for size_cm in (8.6, 7))
    )

# Raw calibration data as open blue circles.

plt.scatter(
    cal_conc,
    cal_read,
    label = "Data",
    s = 36,
    marker = 'o',
    facecolor = 'none',
    edgecolor = 'b',
    linewidth = 0.5
    )

# A straight line only needs two points: evaluate the fit at c = conc_min and c = conc_max.

fit_conc = [conc_min, conc_max]
fit_read = [cal_fit.slope * conc + cal_fit.intercept for conc in fit_conc]

plt.plot(fit_conc, fit_read, label = "Fit", color = 'k')

# Legend (without a frame).

plt.legend(loc = 'upper right', frameon = False)

# Axis labels; TeX notation ($...$) is used for the superscripts in the x-axis label.

plt.xlabel("[dye] [10$^{-4}$ mol dm$^{-3}$]")
plt.ylabel("Photometer Reading")

# Restrict the x-axis to the calibrated range [conc_min, conc_max].

plt.xlim(conc_min, conc_max)

# By default, Matplotlib draws e.g. 10^-4 as 0.0001, which is difficult to read.
# A custom "formatter" multiplies each tick value by 10^4 to match the units in the x-axis label.

def XFormat(value, pos):
    scaled_value = value * 1e4
    return "{0:.1f}".format(scaled_value)

formatter = FuncFormatter(XFormat)

plt.gca().xaxis.set_major_formatter(formatter)

# Layout function.

plt.tight_layout()

# Save to a PNG file (can be downloaded from the main Jupyter hub page).

plt.savefig("Calibration.png", dpi = 300)

In [None]:
# Function to read in data files produced by the RPi photometer.

import csv

def ReadData(file_path):
    """Read the first two columns of a CSV file produced by the RPi photometer.

    The first row is treated as a header and skipped. For every remaining
    row, the first and second fields are converted to floats. Rows that are
    blank (no fields, or only empty/whitespace fields) are ignored.

    Parameters
    ----------
    file_path
        Path of the CSV file to read (passed straight to ``open``).

    Returns
    -------
    tuple
        ``(x_data, y_data)``: two lists of floats holding the first and
        second column respectively, in file order. Both lists are empty if
        the file contains no data rows.

    Raises
    ------
    ValueError
        If a field in a non-blank row cannot be converted to a float.
    IndexError
        If a non-blank row has fewer than two fields.
    """
    # Lists to hold values.

    x_data, y_data = [], []

    # newline = '' leaves line-ending handling to the csv module, as its documentation recommends.

    with open(file_path, 'r', newline = '') as input_reader:
        # Use the reader object from the csv module to parse the CSV file generated by the RPi measurement script.

        input_reader_csv = csv.reader(input_reader)

        # next() skips the first row (headers).
        # The None default stops a completely empty file from raising StopIteration.

        next(input_reader_csv, None)

        # The csv.reader parses each remaining row into a list until it reaches the end of the file.

        for row in input_reader_csv:
            # Blank lines are parsed as rows with no usable fields; skip them rather than failing on row[0].

            if not any(field.strip() for field in row):
                continue

            # Convert the first and second elements in the row to floats and store in the x/y lists.

            x_data.append(float(row[0]))
            y_data.append(float(row[1]))

    # Return the x and y data as a tuple.

    return (x_data, y_data)

In [None]:
# *** Upload your data file to the Jupyter hub, then change the file name below to match. ***

# ReadData returns the two columns as an (x, y) tuple; unpack it into the raw x and y lists.

raw_data = ReadData("Part2-ExampleData.csv")
x_data_raw, y_data_raw = raw_data

In [None]:
# *** If the file contains data from after the reaction was complete, the code at the bottom of this cell can be used to "trim". ***

# Perform some "pre-processing":
#   - Use the calibration curve to convert the photometer readings (y values) to [dye].
#   - Discard any data points for which the concentration is outside the range used in the calibration.

t_data = []
c_data = []

# Walk through the raw x and y values in step with each other.

for x, y in zip(x_data_raw, y_data_raw):
    # Calibration: reading = slope * conc + intercept.
    # Reverse: conc = (reading - intercept) / slope.

    conc = (y - cal_fit.intercept) / cal_fit.slope

    # Keep the point only if it lies inside the calibrated range (limits included).

    if conc_min <= conc <= conc_max:
        t_data.append(x)
        c_data.append(conc)

# If required, data collected after a certain time can be discarded.
# (Uncomment the code below to do this.)

#t_max = 300 # 300 s = 5 mins

#t_data_new = []
#c_data_new = []

#for t, c in zip(t_data, c_data):
#    if t <= t_max:
#        t_data_new.append(t)
#        c_data_new.append(c)

#t_data = t_data_new
#c_data = c_data_new

In [None]:
# Turn both lists into NumPy arrays of 64-bit floats -- arrays are easier to work with
# (the later cells apply np.log(), 1 / c_data and .min()/.max() to them).

import numpy as np

t_data, c_data = (
    np.array(values, dtype = np.float64) for values in (t_data, c_data)
    )

In [None]:
# Zeroth-order test: [dye] vs. t should be a straight line.
# If it is, then slope = -k_obs and intercept = [dye] (t = 0).

zeroth_order_fit = linregress(t_data, c_data)

# The slope and intercept are likely to be small numbers, so they are printed with the "e" format code.

print(
    "Slope: {0:.2e}".format(zeroth_order_fit.slope),
    "Intercept: {0:.2e}".format(zeroth_order_fit.intercept),
    "R Value: {0:.3f}".format(zeroth_order_fit.rvalue),
    sep = "\n"
    )

In [None]:
%matplotlib inline

# New figure at 8.6 x 7 cm (the sizes are divided by 2.54 to convert cm to inches).

plt.figure(
    figsize = tuple(size_cm / 2.54 for size_cm in (8.6, 7))
    )

# Measured [dye] against time.

plt.plot(t_data, c_data, label = "Data", color = 'b')

# First and last times measured.
# (t_data is a NumPy array rather than a Python list, so its own .min()/.max() methods are used.)

t_min, t_max = t_data.min(), t_data.max()

# Overlay the fit: evaluate the fitted line at the two end points and join them.

fit_c = [
    zeroth_order_fit.slope * t + zeroth_order_fit.intercept
    for t in (t_min, t_max)
    ]

plt.plot([t_min, t_max], fit_c, label = "Fit", color = 'k')

# Legend.

plt.legend(loc = 'upper right', frameon = False)

# x-axis.

plt.xlabel("$t$ [s]")
plt.xlim(t_min, t_max)

# y-axis: the values are concentrations, so scale the tick labels by 10^4 to match the units in the axis label.

def YFormat(value, pos):
    scaled_value = value * 1e4
    return "{0:.1f}".format(scaled_value)

formatter = FuncFormatter(YFormat)

plt.gca().yaxis.set_major_formatter(formatter)

plt.ylabel("[dye] [10$^{-4}$ mol dm$^{-3}$]")

# Layout function.

plt.tight_layout()

# Save to a PNG file (download if required).

plt.savefig("Analysis-ZerothOrder.png", dpi = 300)

In [None]:
# First-order test: ln([dye]) vs. t should be a straight line.
# If it is, then slope = -k_obs and intercept = ln([dye]) (t = 0).

x_data, y_data = t_data, np.log(c_data)

first_order_fit = linregress(x_data, y_data)

# The slope is likely to be a small number, so it is printed with the "e" format code.

print(
    "Slope: {0:.2e}".format(first_order_fit.slope),
    "Intercept: {0:.2f}".format(first_order_fit.intercept),
    "R Value: {0:.3f}".format(first_order_fit.rvalue),
    sep = "\n"
    )

In [None]:
%matplotlib inline

# New figure at 8.6 x 7 cm (the sizes are divided by 2.54 to convert cm to inches).

plt.figure(
    figsize = (8.6 / 2.54, 7 / 2.54)
    )

# Plot ln([dye]) against t.
# The values are recomputed from t_data/c_data here instead of reading the shared x_data/y_data variables:
# the second-order cell overwrites those, so re-running this cell afterwards would otherwise plot 1/[dye].

plt.plot(t_data, np.log(c_data), label = "Data", color = 'b')

# Min/max times measured.

t_min = t_data.min()
t_max = t_data.max()

# Overlay fit: evaluate the fitted line for ln([dye]) at the first and last times.

fit_ln_c_1 = first_order_fit.slope * t_min + first_order_fit.intercept
fit_ln_c_2 = first_order_fit.slope * t_max + first_order_fit.intercept

plt.plot(
    [t_min, t_max], [fit_ln_c_1, fit_ln_c_2],
    label = "Fit", color = 'k'
    )

# Legend.

plt.legend(loc = 'upper right', frameon = False)

# x-axis.

plt.xlim(t_min, t_max)
plt.xlabel("$t$ [s]")

# y-axis.

plt.ylabel("ln[dye]")

# Layout function.

plt.tight_layout()

# Save to a PNG file (download if required).

plt.savefig("Analysis-FirstOrder.png", dpi = 300)

In [None]:
# Second-order test: 1/[dye] vs. t should be a straight line.
# If it is, then slope = k_obs and intercept = 1/[dye] (t = 0).

x_data, y_data = t_data, 1 / c_data

second_order_fit = linregress(x_data, y_data)

# Print the three fit results, one per line.

print(
    "Slope: {0:.2f}".format(second_order_fit.slope),
    "Intercept: {0:.2f}".format(second_order_fit.intercept),
    "R Value: {0:.3f}".format(second_order_fit.rvalue),
    sep = "\n"
    )

In [None]:
%matplotlib inline

# New figure at 8.6 x 7 cm (the sizes are divided by 2.54 to convert cm to inches).

plt.figure(
    figsize = (8.6 / 2.54, 7 / 2.54)
    )

# Plot 1/[dye] against t.
# The values are recomputed from t_data/c_data here instead of reading the shared x_data/y_data variables:
# the first-order cell also assigns those, so re-running this cell after it would otherwise plot ln([dye]).

plt.plot(t_data, 1 / c_data, label = "Data", color = 'b')

# Min/max times.

t_min = t_data.min()
t_max = t_data.max()

# Overlay fit: evaluate the fitted line for 1/[dye] at the first and last times.

fit_inv_c_1 = second_order_fit.slope * t_min + second_order_fit.intercept
fit_inv_c_2 = second_order_fit.slope * t_max + second_order_fit.intercept

plt.plot(
    [t_min, t_max], [fit_inv_c_1, fit_inv_c_2],
    label = "Fit", color = 'k'
    )

# Legend.

plt.legend(loc = 'upper left', frameon = False)

# x-axis.

plt.xlim(t_min, t_max)
plt.xlabel("$t$ [s]")

# Since c ~ 10^-4 mol dm^-3, 1/c ~ 10^4 mol^-1 dm^3.
# We should therefore adjust the y-axis label formatting as for the concentration:
# the tick values are multiplied by 10^-4, so the axis label must carry the matching 10^4 factor.

def YFormat(value, pos):
    return "{0:.1f}".format(value * 1e-4)

formatter = FuncFormatter(YFormat)

plt.gca().yaxis.set_major_formatter(formatter)

plt.ylabel("1/[dye] [10$^{4}$ mol$^{-1}$ dm$^{3}$]")

# Layout function.

plt.tight_layout()

# Save to a PNG file (again, download if you wish).

plt.savefig("Analysis-SecondOrder.png", dpi = 300)