In [None]:
%matplotlib notebook 
import numpy as np
import scipy as sp
from scipy import stats
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [None]:
data2016 = pd.read_csv('C:\\Users\\Francis\\Google Drive\\Data\\___PACE\\STEM-Summer2017\\expeditions2016.csv', header=0)

In [None]:
data2017 = pd.read_csv('C:\\Users\\Francis\\Google Drive\\Data\\___PACE\\STEM-Summer2017\\expeditions2017.csv', header=0)

In [None]:
# Preview the first five rows of the 2016 data as loaded (before any reshaping)
data2016.head()

In [None]:
# Display the column names of the 2016 data as loaded
data2016.columns

In [None]:
# Preview the last five rows of the 2016 data as loaded
data2016.tail()

In [None]:
# Preview the first five rows of the 2017 data as loaded
data2017.head()

In [None]:
# Look at the last five rows of the 2017 data
# (this cell previously repeated the 2016 tail already shown above)
data2017.tail()

In [None]:
# Interchange rows and columns of both yearly frames in a single assignment.
# NOTE: running this cell a second time transposes the frames back again.
data2016, data2017 = data2016.transpose(), data2017.transpose()

In [None]:
# After the transpose, the first row of each frame holds the variable names
# rather than data, so use that row as the column labels.
data2016.columns = data2016.iloc[0]
data2017.columns = data2017.iloc[0]

# Drop the label row so that only data rows remain.
data2016 = data2016.iloc[1:]
data2017 = data2017.iloc[1:]

# Take a look and make sure you got what you wanted.
# A cell renders only its final expression, so both previews are passed to
# display() (provided by IPython inside a notebook) to show each of them.
display(data2016.head(), data2017.head())

In [None]:
# (rows, columns) of the reshaped 2016 frame
data2016.shape

In [None]:
# (rows, columns) of the reshaped 2017 frame
data2017.shape

In [None]:
# you can print all the variable names
for i in range(len(data2016.columns)):
    print(data2016.columns[i])

In [None]:
# print all the variable names
for i in range(len(data2017.columns)):
    print(data2017.columns[i])

In [None]:
# Explore a subset of the data: print the "Measurements (mm)" entries
# of the first three rows, selected by position.
for position in (0, 1, 2):
    row = data2016.iloc[position]
    print(row["Measurements (mm)"])

In [None]:
# Display the measurements for all substrates for the second expedition
# (.iloc[1] selects the second row by position)
data2016.iloc[1]["Measurements (mm)"]

In [None]:
# You can select multiple columns by passing a list of column names as the index
data2016[["Measurements (mm)", "Total number of all live oysters"]]

In [None]:
# The same two-column selection on the 2017 data
data2017[["Measurements (mm)", "Total number of all live oysters"]]

In [None]:
# Combine multiple dataframes
# axis tells concat whether to combine row-wise (axis=0) or column-wise (axis=1)
# NOTE(review): axis=1 places the 2017 columns next to the 2016 columns, aligned
# on the row labels; confirm this is intended rather than stacking the
# expeditions of both years with axis=0.
dataall = pd.concat([data2016, data2017], axis =1)

In [None]:
# Column labels of the combined frame
dataall.columns

In [None]:
# First five rows of the combined frame
dataall.head()

In [None]:
# Last five rows of the combined frame
dataall.tail()

In [None]:
# Everything in the combined frame that is labelled "Measurements (mm)"
dataall["Measurements (mm)"]

In [None]:
# Pull out the measurement entries of every substrate shell for one expedition.
# Expeditions are the rows, so .iloc picks one by its position (row number).
# vals ends up as a numpy.ndarray whose elements are str; each string holds
# the comma-separated measurements of one substrate shell.
first_expedition = data2016.iloc[0]
vals = first_expedition["Measurements (mm)"].values

In [None]:
# what is the type of vals
type(vals)

In [None]:
# what is the type of the first element of vals
type(vals[0])

In [None]:
# show the measurement entries themselves
vals

In [None]:
# type of the result of splitting one entry on commas
type(vals[0].split(","))

In [None]:
# we can't find the mean yet because splitting gives a list of str
vals[0].split(",")

In [None]:
# map let's us map the list of str to floats
# but we can't use this yet, so we make it a list of floats
# then make it a numpy.ndarray or a pandas.Series
n = np.array(list(map(float, vals[0].split(","))))

In [None]:
p = pd.Series(list(map(float, vals[0].split(","))))

In [None]:
# look at the data as a numpy array
n

In [None]:
# look at the data as a pandas Series
p

In [None]:
# mean of the array
n.mean()

In [None]:
# mean of the Series
p.mean()

In [None]:
# we can also convert from str to float using .astype(float) on a numpy array
np.array(vals[0].split(",")).astype(float)

In [None]:
# the same .astype(float) conversion on a pandas Series
pd.Series(vals[0].split(",")).astype(float)

In [None]:
# Summarize one expedition: print the mean measurement of each substrate shell,
# numbering the shells from 1.
for shell_number, shell_text in enumerate(vals, start=1):
    shell_values = np.array([float(piece) for piece in shell_text.split(",")])
    label = 'Substrate Shell #' + str(shell_number) + "\tmean ="
    print(label, '%.3f' % shell_values.mean(), sep='\t')

In [None]:
# Stack all 2016 measurements into one vector per water-colour code.
# wc is the full column label, including the line breaks embedded in it.
wc = 'Water color\r\n(1=Light Blue,2=Dark Blue,3=Light Green,\r\n4=Dark Green,5=Light Brown,6=Dark Brown)'

# Parsed measurement arrays are collected per code and joined once at the end,
# instead of re-allocating a growing array with np.append for every entry.
chunks = {'1': [], '3': [], '4': [], '5': [], '6': []}
for row in range(len(data2016)):
    expedition = data2016.iloc[row]
    color = expedition[wc]
    # NOTE(review): exactly as before, any code other than 1/3/4/5 (including 2
    # or a missing value) is filed under 6; confirm that this is intended.
    if color not in ('1', '3', '4', '5'):
        color = '6'
    # Visit every "Measurements (mm)" entry of this expedition; the number of
    # entries is taken from the data rather than fixed at 10.
    for cell in expedition['Measurements (mm)'].values:
        if pd.isnull(cell):
            continue  # nothing recorded for this substrate shell
        chunks[color].append(np.array(cell.split(",")).astype(float))

# One flat float array per code; a code with no measurements gives an empty array.
v1, v3, v4, v5, v6 = (
    np.concatenate(chunks[key]) if chunks[key] else np.array([])
    for key in ('1', '3', '4', '5', '6')
)

In [None]:
# Combine the measurements with their water-colour code into a new data frame.
# np.repeat writes each code once per measurement in its group. This replaces
# the hand-computed .loc ranges, which overlapped by one row (a .loc slice
# includes its end label) and whose final bound counted len(v4) twice.
groups = (v1, v3, v4, v5, v6)
measures = pd.DataFrame({
    'Measures': np.concatenate(groups, axis=0),
    # codes are stored as floats, the dtype this column had before
    'WaterColor': np.repeat([1.0, 3.0, 4.0, 5.0, 6.0], [len(group) for group in groups]),
})

In [None]:
# First five rows of the combined measurements frame
measures.head()

In [None]:
# Mean per water-colour code
measures.groupby('WaterColor').mean()

In [None]:
# Number of rows per water-colour code
measures.groupby('WaterColor').size()

In [None]:
# Histogram of all measurements
measures['Measures'].plot.hist()

In [None]:
# Bar chart of the per-code means
measures.groupby('WaterColor').mean().plot.bar()

In [None]:
# Bar chart of the number of rows per code
measures.groupby('WaterColor').size().plot.bar()

In [None]:
# Split the measurements data into two equal parts, then plot one against the other.
# In reality you would normally plot two different variables to observe any relationship.
# The split point is computed from the data instead of being typed in by hand;
# with an odd number of rows the last row is left out so both halves match in length.
half = len(measures) // 2
a = np.array(measures['Measures'].iloc[:half])
b = np.array(measures['Measures'].iloc[half:2 * half])
bob = pd.DataFrame({'a': a, 'b': b})
bob.plot.scatter(x = 'a', y = 'b')

In [None]:
# In this example a second variable is made up to plot against the measurements:
# normally distributed fluctuations (mean 0, standard deviation 4) are added to
# each measurement. One draw is made per row, so the cell keeps working when the
# number of measurements changes.
measures['Noise'] = measures['Measures'] + np.random.normal(0, 4, len(measures))
measures.plot.scatter(y = 'Noise', x = 'Measures')

In [None]:
# Plot the measurements against their row index
measures['Measures'].plot()

In [None]:
# Two normal samples drawn separately from each other, plotted one against the other.
# The samples are stacked as two plain rows (shape (2, 100)); the extra level of
# list nesting that produced shape (2, 1, 100) has been removed.
rn = np.array([np.random.normal(0,1,100), np.random.normal(0.5,0.8,100)])
plt.scatter(rn[0], rn[1])
plt.title('Zero Correlation')
plt.show()

In [None]:
# y is x plus a small random jitter, so the two rise together.
w = np.random.normal(0,1,100)
jitter = np.random.normal(0,0.1,100)
rn = np.vstack((w, w + 2*jitter))
plt.scatter(rn[0], rn[1])
plt.title('Positive Correlation')
plt.show()

In [None]:
# y is minus x plus a randomly varying offset, so y falls as x rises.
w = np.random.normal(0,1,100)
offset = np.random.normal(0.5,0.1,100)
rn = np.vstack((w, -w + 3*offset))
plt.scatter(rn[0], rn[1])
plt.title('Negative Correlation')
plt.show()

In [None]:
# Box plot of all measurements
measures['Measures'].plot.box()

In [None]:
# Correlation matrix of the measurements and their noisy copy
np.corrcoef(measures['Measures'], measures['Noise'])

In [None]:
# Correlation matrix of the two halves of the measurements
np.corrcoef(bob['a'], bob['b'])

In [None]:
# Pearson correlation between the measurements and their noisy copy, with its p-value
correlation = stats.pearsonr(measures['Measures'], measures['Noise'])
rho, pstat = correlation
print(' '.join(['rho = %.4f' % rho, 'p-val = %.4f' % pstat]))

In [None]:
# Independent two-sample t-test on the measurements and their noisy copy.
# NOTE(review): Noise is built row by row from Measures, so the two samples are
# paired; a paired test (stats.ttest_rel) may be the better fit -- confirm.
comparison = stats.ttest_ind(measures['Measures'], measures['Noise'])
tstat, pval = comparison
print(' '.join(['t-stat = %.3f ' % tstat, 'p-value = %.3f' % pval]))

In [None]:
# Independent two-sample t-test on the measurements filed under water-colour codes 1 and 5
comparison = stats.ttest_ind(v1, v5)
tstat, pval = comparison
print(' '.join(['t-stat = %.3f ' % tstat, 'p-value = %.3f' % pval]))

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# To test whether the means of several groups differ, use Analysis of Variance (ANOVA).
# The null hypothesis for ANOVA is that all means are equal.
fstat, pval = stats.f_oneway(*(v1, v3, v4, v5, v6))
anova_line = '\t'.join(['ANOVA Results', 'fstat = %.3f' % fstat, 'p-value = %.3f' % pval, '\n'])
print(anova_line)

# If the test suggests rejecting the null, a Tukey HSD comparison shows
# which pairs of means are different.
tukey = pairwise_tukeyhsd(measures['Measures'], measures['WaterColor'])
print(tukey)

In [None]:
# Fixing the pH data issue for out-of-bounds data:
# start from example readings stored as text, then convert them to floats.
raw_readings = ['17.6', '8.2', '7.5']
phs = pd.Series(raw_readings)
phsf = phs.astype('float64')

In [None]:
# Replace every reading above 14 with NaN, then show the cleaned series
too_high = phsf > 14.0
phsf.loc[too_high] = np.nan
phsf