In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Join keys handed to the merge helper when pairing up measurements.
keys = ['Waterbody ID', 'sample.sampleDateTime']
# Absolute path of the "Data" directory (resolved against the current working
# directory), kept with a trailing separator.
path = os.path.join(os.path.abspath("Data"), "")
# Full path of the CSV file that the notebook loads.
filename = os.path.join(path, "water_quality.csv")

In [3]:
# Computes the matrix of pairwise correlations between the given variables.
def correlation_matrix(frame, variable_names, merge_keys=None):
    """Return the correlation coefficients between every pair of variables.

    For each ordered pair of names in ``variable_names`` the rows of ``frame``
    for the two variables are paired up by
    ``split_on_variables_then_merge_data_frames_on_keys`` and the correlation
    coefficient of the two paired ``result`` columns is computed with
    ``np.corrcoef``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``'determinand.label'`` and ``'result'`` as
        well as every column named in ``merge_keys``.
    variable_names : sequence of str
        Values of ``'determinand.label'`` to correlate. Row ``i`` and column
        ``i`` of the result both correspond to ``variable_names[i]``.
    merge_keys : list of str, optional
        Columns used to pair up rows of two variables. When omitted, the
        notebook-level ``keys`` list is used, so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(variable_names), len(variable_names))``.
        An entry is NaN when ``np.corrcoef`` cannot produce a value for the
        pair (for example when the two variables share no key values).
    """
    if merge_keys is None:
        # Fall back to the notebook-level configuration instead of forcing
        # every caller to pass the keys explicitly.
        merge_keys = keys

    size = len(variable_names)
    mat = np.zeros(shape=(size, size))
    for i, var_y in enumerate(variable_names):
        for j, var_x in enumerate(variable_names):
            merged_df = split_on_variables_then_merge_data_frames_on_keys(
                frame=frame, keys=merge_keys, var1=var_y, var2=var_x)
            # After the merge, 'result_x' holds the values of var1 (var_y here)
            # and 'result_y' those of var2 (var_x here).
            xs, ys = merged_df['result_x'], merged_df['result_y']
            # np.corrcoef returns a 2x2 matrix; the off-diagonal entry is the
            # coefficient between the two inputs.
            mat[i, j] = np.corrcoef(xs, ys)[0, 1]
    return mat

In [4]:
# Pairs up the rows of two variables that share the same key values.
def split_on_variables_then_merge_data_frames_on_keys(frame, keys, var1, var2):
    """Inner-join the rows labelled ``var1`` with the rows labelled ``var2``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``'determinand.label'`` column and every column named
        in ``keys``.
    keys : str or list of str
        Column name(s) to join on.
    var1, var2
        Values of ``'determinand.label'`` selecting the left and right side
        of the join respectively.

    Returns
    -------
    pandas.DataFrame
        One row per matching (left, right) pair. Columns other than ``keys``
        appear twice: suffixed ``_x`` for the ``var1`` rows and ``_y`` for the
        ``var2`` rows (e.g. ``result_x`` / ``result_y``).
    """
    # Each side is selected straight from `frame`; an intermediate
    # "var1 or var2" filter would only add an extra pass over the data.
    labels = frame['determinand.label']
    left_frame = frame[labels == var1]
    right_frame = frame[labels == var2]
    # The suffixes are spelled out because callers read 'result_x'/'result_y'.
    return pd.merge(left_frame, right_frame, on=keys, suffixes=('_x', '_y'))

In [5]:
# Load the raw CSV and keep only the columns selected below.
data = pd.read_csv(filename)
df = data[['determinand.label', 'result', 'determinand.unit.label', 'sample.sampleDateTime', 'Waterbody ID']]
# Values of 'determinand.label' to correlate; their order fixes the row and
# column order of the correlation matrix.
variables = ['Ammonia(N)', 'N Oxidised', 'Nitrate-N', 'Nitrite-N', 'Orthophospht', 'pH', 'Temp Water']

corr_mat = correlation_matrix(df, variables)
# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print(corr_mat)

[[ 1.          0.19879147  0.18924927  0.57714796  0.73160366 -0.46918613
   0.00752045]
 [ 0.19879147  1.          0.9998613   0.61084661  0.35327486 -0.10879013
   0.08428674]
 [ 0.18924927  0.9998613   1.          0.59763714  0.348909   -0.1029298
   0.08450652]
 [ 0.57714796  0.61084661  0.59763714  1.          0.4276314  -0.3536216
   0.02205449]
 [ 0.73160366  0.35327486  0.348909    0.4276314   1.         -0.0412783
   0.03139208]
 [-0.46918613 -0.10879013 -0.1029298  -0.3536216  -0.0412783   1.
  -0.00233323]
 [ 0.00752045  0.08428674  0.08450652  0.02205449  0.03139208 -0.00233323
   1.        ]]


In [6]:
# Wrap the correlation matrix in a DataFrame so the columns carry the variable
# names; rows keep their positional index and follow the order of `variables`.
correlation_frame = pd.DataFrame(data=corr_mat, columns=variables)
# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print(correlation_frame)

   Ammonia(N)  N Oxidised  Nitrate-N  Nitrite-N  Orthophospht        pH  \
0    1.000000    0.198791   0.189249   0.577148      0.731604 -0.469186   
1    0.198791    1.000000   0.999861   0.610847      0.353275 -0.108790   
2    0.189249    0.999861   1.000000   0.597637      0.348909 -0.102930   
3    0.577148    0.610847   0.597637   1.000000      0.427631 -0.353622   
4    0.731604    0.353275   0.348909   0.427631      1.000000 -0.041278   
5   -0.469186   -0.108790  -0.102930  -0.353622     -0.041278  1.000000   
6    0.007520    0.084287   0.084507   0.022054      0.031392 -0.002333   

   Temp Water  
0    0.007520  
1    0.084287  
2    0.084507  
3    0.022054  
4    0.031392  
5   -0.002333  
6    1.000000  
