In [1]:
import numpy as np
import pandas as pd
import os
import scipy.stats as sp

In [2]:
# Columns used as the join keys when measurements of two variables are paired up.
keys = ['Waterbody ID', 'sample.sampleDateTime']
# Absolute path of the "Data" directory (relative to the working directory),
# with a trailing path separator.
path = os.path.join(os.path.abspath("Data"), "")
filename = os.path.join(path, "water_quality.csv")

In [3]:
# Pairwise Pearson product-moment correlation coefficients between variables.
def Pearson_correlation_matrix(frame, variable_names):
    """Build the square matrix of pairwise Pearson correlation coefficients.

    For every ordered pair of names in ``variable_names`` the rows of
    ``frame`` belonging to the two variables are paired up by
    ``split_on_variables_then_merge_data_frames_on_keys`` (joined on the
    module-level ``keys``), and the coefficient between the paired
    ``result_x`` and ``result_y`` columns is stored.

    Parameters
    ----------
    frame : pandas.DataFrame
        Measurements passed through to the split/merge helper.
    variable_names : sequence of str
        Variable labels; they define both the row and the column order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(variable_names), len(variable_names))`` whose
        entry ``[i][j]`` is the coefficient for ``variable_names[i]`` and
        ``variable_names[j]``.
    """
    size = len(variable_names)
    mat = np.zeros(shape=(size, size))
    for row, row_var in enumerate(variable_names):
        for col, col_var in enumerate(variable_names):
            paired = split_on_variables_then_merge_data_frames_on_keys(
                frame=frame, keys=keys, var1=row_var, var2=col_var)
            first_series, second_series = paired['result_x'], paired['result_y']
            # np.corrcoef returns a 2x2 matrix; the off-diagonal entry is the
            # coefficient between the two series.
            mat[row, col] = np.corrcoef(first_series, second_series)[0, 1]
    return mat

In [4]:
# Pairwise Spearman rank-order correlation coefficients between variables.
def Spearman_correlation_matrix(frame, variable_names):
    """Build the square matrix of pairwise Spearman rank-order correlations.

    For every ordered pair of names in ``variable_names`` the rows of
    ``frame`` belonging to the two variables are paired up by
    ``split_on_variables_then_merge_data_frames_on_keys`` (joined on the
    module-level ``keys``), and the rank correlation between the paired
    ``result_x`` and ``result_y`` columns is stored. The p-value that
    ``spearmanr`` also returns is discarded.

    Parameters
    ----------
    frame : pandas.DataFrame
        Measurements passed through to the split/merge helper.
    variable_names : sequence of str
        Variable labels; they define both the row and the column order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(variable_names), len(variable_names))`` whose
        entry ``[i][j]`` is the coefficient for ``variable_names[i]`` and
        ``variable_names[j]``.
    """
    size = len(variable_names)
    mat = np.zeros(shape=(size, size))
    for row, row_var in enumerate(variable_names):
        for col, col_var in enumerate(variable_names):
            paired = split_on_variables_then_merge_data_frames_on_keys(
                frame=frame, keys=keys, var1=row_var, var2=col_var)
            first_series, second_series = paired['result_x'], paired['result_y']
            # spearmanr yields (rho, p-value); only rho goes into the matrix.
            rho, _ = sp.spearmanr(first_series, second_series)
            mat[row, col] = rho
    return mat

In [5]:
# Pair up the measurements of two variables that share the same key values.
def split_on_variables_then_merge_data_frames_on_keys(frame, keys, var1, var2):
    """Select the rows for two variables and join them on ``keys``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``'determinand.label'`` column and every column
        named in ``keys``.
    keys : list of str
        Column names to join on.
    var1, var2 : str
        Values of ``'determinand.label'`` selecting the left and the right
        side of the join respectively.

    Returns
    -------
    pandas.DataFrame
        Inner join (pandas' default) of the ``var1`` rows with the ``var2``
        rows. Non-key columns that exist on both sides get the suffix ``_x``
        for the ``var1`` side and ``_y`` for the ``var2`` side, e.g.
        ``result_x`` / ``result_y``.
    """
    labels = frame['determinand.label']
    # Each side is filtered by exactly one label, so a preliminary
    # "var1 or var2" filter would not change which rows are selected.
    left_frame = frame[labels == var1]
    right_frame = frame[labels == var2]
    # The suffixes are pandas' defaults; spelled out because callers read the
    # suffixed column names.
    return pd.merge(left_frame, right_frame, on=keys, suffixes=('_x', '_y'))

In [6]:
# Load the CSV and keep a subset of its columns.
data = pd.read_csv(filename)
df = data[[
    'determinand.label',
    'result',
    'determinand.unit.label',
    'sample.sampleDateTime',
    'Waterbody ID',
]]
# Values of 'determinand.label' to correlate against each other.
variables = [
    'Ammonia(N)',
    'N Oxidised',
    'Nitrate-N',
    'Nitrite-N',
    'Orthophospht',
    'pH',
    'Temp Water',
]

# Pairwise Pearson correlations between all of the variables above.
pearson_mat = Pearson_correlation_matrix(frame=df, variable_names=variables)

In [7]:
# Show the Pearson matrix as a table with one column per variable,
# rounded to 2 decimal places.
round_pearson = [[round(value, 2) for value in row] for row in pearson_mat]
pearson_frame = pd.DataFrame(data=round_pearson, columns=variables)
# A parenthesised single-argument print behaves the same as the Python 2
# print statement and is also valid on Python 3.
print("pairwise Pearson product-moment Correlation Coefficient")
print(pearson_frame)

   Ammonia(N)  N Oxidised  Nitrate-N  Nitrite-N  Orthophospht    pH  \
0        1.00        0.20       0.19       0.58          0.73 -0.47   
1        0.20        1.00       1.00       0.61          0.35 -0.11   
2        0.19        1.00       1.00       0.60          0.35 -0.10   
3        0.58        0.61       0.60       1.00          0.43 -0.35   
4        0.73        0.35       0.35       0.43          1.00 -0.04   
5       -0.47       -0.11      -0.10      -0.35         -0.04  1.00   
6        0.01        0.08       0.08       0.02          0.03 -0.00   

   Temp Water  
0        0.01  
1        0.08  
2        0.08  
3        0.02  
4        0.03  
5       -0.00  
6        1.00  


In [8]:
# Compute the pairwise Spearman correlation matrix.
spearman_mat = Spearman_correlation_matrix(df, variables)
# Show it as a table with one column per variable, rounded to 2 decimal places.
round_spearman = [[round(value, 2) for value in row] for row in spearman_mat]
spearman_frame = pd.DataFrame(data=round_spearman, columns=variables)
# A parenthesised single-argument print behaves the same as the Python 2
# print statement and is also valid on Python 3.
print("pairwise Spearman Correlation Coefficient")
print(spearman_frame)

   Ammonia(N)  N Oxidised  Nitrate-N  Nitrite-N  Orthophospht    pH  \
0        1.00        0.37       0.37       0.71          0.65 -0.50   
1        0.37        1.00       1.00       0.62          0.53 -0.07   
2        0.37        1.00       1.00       0.62          0.53 -0.07   
3        0.71        0.62       0.62       1.00          0.58 -0.29   
4        0.65        0.53       0.53       0.58          1.00 -0.22   
5       -0.50       -0.07      -0.07      -0.29         -0.22  1.00   
6        0.21       -0.02      -0.03       0.06          0.22  0.04   

   Temp Water  
0        0.21  
1       -0.02  
2       -0.03  
3        0.06  
4        0.22  
5        0.04  
6        1.00  
