# TEST new script-functions on old data
# Check if the old data creates the same plots as seen in Visualize_Final_Reg... (IT SHOULD)
# this would indicate that the new data is being handled correctly and that the tight fit of the
# chlamy is actually correct...


In [1]:
import numpy as np
import pandas as pd
import os
import scipy

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib.gridspec as gridspec

import seaborn as sns
sns.set(style='whitegrid')

from os.path import join as pjoin

import itertools as it

%config InteractiveShell.ast_node_interactivity='all'
%config InlineBackend.figure_format = 'svg'

In [8]:
def get_regressorand(df, regressorand='560'):
    """
    Return the single-species measurements stored in one column of the plate table.

    The same helper serves both sides of the simple linear regression
    (Y ~ mX + b): call it with a regressor column (independent variable, X) or
    with a regressand column (dependent variable, Y). The regressands are the
    wavelengths the instruments actually record: the photobioreactor used in
    the lab collects data at 680 & 720 only, and earlier experiments done in the
    Tecan reader collected 680, 730 and 750. Every other wavelength can serve
    as a regressor.

    Parameters
    ----------
    df : pandas.DataFrame
        Plate table with a 'row' column, a 'col' column and the requested
        measurement column.
    regressorand : str
        Name of the measurement column to read.

    Returns
    -------
    (bacter, chlamy) : tuple of numpy.ndarray
        `bacter` holds, for every unique row, the value found in column 12 of
        the 96 well plate (that column contains only bacteria). `chlamy`
        holds, for every unique column, the value found in row H (that row
        contains only chlamy).

    Raises
    ------
    IndexError
        If a row has no well in column 12, or a column has no well in row H.
    """
    in_col12 = df['col'] == 12
    in_rowH = df['row'] == 'H'

    # .iloc[0] takes the first matching well by POSITION. The previous `[0]`
    # was a label lookup: it fails on an integer index and only worked on the
    # well-label index through pandas' deprecated positional fallback.
    col12 = [df.loc[(df['row'] == row) & in_col12, regressorand].iloc[0]
             for row in df['row'].unique()]
    rowH = [df.loc[in_rowH & (df['col'] == col), regressorand].iloc[0]
            for col in df['col'].unique()]

    # measured values at the 12th column and the H'th row respectively
    return np.array(col12), np.array(rowH)


    
def get_regression_coefficients(df, FIRST_REGRESSAND='A680', second_regressand='A720', regressor='560'):
    """
    Fit four simple linear regressions and return their slopes.

    Each regressand (FIRST_REGRESSAND, second_regressand) is regressed on the
    regressor twice: once with the bacteria-only wells (column 12) and once
    with the chlamy-only wells (row H). The slopes are later used to predict
    values in the rest of the wells---wells not in the 12th col or H'th row.

    FIRST_REGRESSAND should remain 'A680' and second_regressand 'A720' when
    modelling photobioreactor data, as these are the wavelengths it collects;
    second_regressand can be changed to fit Tecan spectrophotometry data.
    Any two regressands can be used to build a model which regresses back to
    any one regressor---'560' is chosen because it seems to have the least
    correlation to any other wavelength per Pearson correlation.

    Returns
    -------
    tuple
        (kB1, kC1, kB2, kC2): the `coef_` attribute of the fitted models, where
        B/C is bacteria/chlamy and 1/2 is the first/second regressand. The
        fitted intercepts are not returned.
    """
    # measured values: (col12 values, rowH values) for each wavelength
    bacter_y1, chlamy_y1 = get_regressorand(df, regressorand=FIRST_REGRESSAND)
    bacter_y2, chlamy_y2 = get_regressorand(df, regressorand=second_regressand)
    bacter_x, chlamy_x = get_regressorand(df, regressorand=regressor)

    # scikit-learn wants the single feature as a column vector
    bacter_X = bacter_x.reshape(-1, 1)
    chlamy_X = chlamy_x.reshape(-1, 1)

    def slope(X, y):
        # a fresh estimator per fit, exactly one model per (species, regressand)
        return LinearRegression().fit(X=X, y=y).coef_

    return (slope(bacter_X, bacter_y1),
            slope(chlamy_X, chlamy_y1),
            slope(bacter_X, bacter_y2),
            slope(chlamy_X, chlamy_y2))
    
    
    
def get_coeff_dict_for_all_regressors(df, FIRST_REGRESSAND='A680', second_regressand='A720'):
    """
    Build the regression coefficients for every regressor wavelength.

    Calls get_regression_coefficients once per regressor, for the wavelengths
    500 to 750 in steps of 10. Pay careful attention to which regressands were
    used and name the resulting dictionary accordingly, lest you forget what
    your models are using.

    Returns
    -------
    dict
        Maps each regressor name ('500', '510', ..., '750') to the tuple of
        coefficients which regress back to that regressor.
    """
    # '500', '510', ..., '750'
    regressors = [str(wavelength) for wavelength in range(500, 760, 10)]

    return {
        regr: get_regression_coefficients(df, FIRST_REGRESSAND, second_regressand, regr)
        for regr in regressors
    }

        
        
def get_predictions(df, coef_dict, regressor='560', first_regressand=None, second_regressand=None):
    """
    Predict the bacterial and chlamy parts of the regressor signal.

    Uses the coefficients obtained from the simple linear regressions together
    with Beer-Lambert's law: each measured regressand is modelled as the sum
    of a bacterial and a chlamy contribution,

        yT1 = kB1*B + kC1*C
        yT2 = kB2*B + kC2*C

    and this 2x2 system is solved for B and C in every well.

    Ensure that you're using the coefficient dictionary built for the same
    regressand wavelengths that you pass here.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the two regressand columns.
    coef_dict : mapping
        `coef_dict[regressor]` must unpack into (kB1, kC1, kB2, kC2).
    regressor : str
        Wavelength the coefficients regress back to; also the suffix of the
        output column names.
    first_regressand, second_regressand : str or None
        Names of the regressand columns. None falls back to 'A680' and 'A720'
        respectively, the photobioreactor wavelengths.

    Returns
    -------
    pandas.DataFrame
        New frame with columns 'B_p<regressor>' and 'C_p<regressor>'; `df`
        itself is not modified.
    """
    kB1, kC1, kB2, kC2 = coef_dict[regressor]

    # None used to be passed straight through as a column label, which raised
    # a KeyError; default to the documented photobioreactor wavelengths instead.
    yT1 = df['A680' if first_regressand is None else first_regressand]
    yT2 = df['A720' if second_regressand is None else second_regressand]

    # closed-form solution of the 2x2 system above
    return pd.DataFrame({
        'B_p' + regressor: (kC2*yT1 - kC1*yT2) / (-kC1*kB2 + kC2*kB1),
        'C_p' + regressor: (kB2*yT1 - kB1*yT2) / (-kB1*kC2 + kB2*kC1),
    })
    

def set_B_and_C(df, column):
    """
    Return a copy of `df` with the known values 'B' and 'C' filled in.

    This helper sets the concentration values B and C so that the plotting
    function can plot the right values. For every plate row, 'B' is the value
    of `column` in that row's column-12 well; for every plate column, 'C' is
    the value of `column` in that column's row-H well.

    Parameters
    ----------
    df : pandas.DataFrame
        Plate table with 'row', 'col' and `column` columns.
    column : str
        Name of the measurement column the known values are read from.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the added 'B' and 'C' columns; the input is left
        untouched.

    Raises
    ------
    IndexError
        If a row has no well in column 12, or a column has no well in row H.
    """
    df = df.copy()

    in_col12 = df['col'] == 12
    in_rowH = df['row'] == 'H'

    # .iloc[0] picks the first matching well by POSITION; the previous `[0]`
    # was a label lookup that fails on an integer index and otherwise relied on
    # pandas' deprecated positional fallback.
    for row in df['row'].unique():
        in_row = df['row'] == row
        df.loc[in_row, 'B'] = df.loc[in_row & in_col12, column].iloc[0]

    for col in df['col'].unique():
        in_col = df['col'] == col
        df.loc[in_col, 'C'] = df.loc[in_rowH & in_col, column].iloc[0]

    return df



def get_B_and_C(df, wav):
    """
    Return the known B and C selections for the column `wav`.

    For every unique plate row the column-12 selection of `wav` is collected,
    and for every unique plate column the row-H selection. Each entry is the
    selected Series itself rather than a scalar, so when exactly one well
    matches per position the returned arrays are 2-D with shape (n, 1).

    Returns
    -------
    (bacter, chlamy) : tuple of numpy.ndarray
        Values at the 12th column and at the H'th row respectively.
    """
    frame = df.copy()

    is_col12 = frame['col'] == 12
    is_rowH = frame['row'] == 'H'

    per_row = [frame.loc[(frame['row'] == r) & is_col12, wav]
               for r in frame['row'].unique()]
    per_col = [frame.loc[is_rowH & (frame['col'] == c), wav]
               for c in frame['col'].unique()]

    return np.array(per_row), np.array(per_col)




def _two_panel_figure(figsize):
    """Create a figure with two vertically stacked axes; return (fig, axes)."""
    fig = plt.figure(figsize=figsize)
    gs = gridspec.GridSpec(2, 1)
    axes = [fig.add_subplot(gs[r, 0]) for r in (0, 1)]
    return fig, axes


def _pred_vs_known_panel(ax, df, known, pred, ylabel, hue, palette, n_levels,
                         legend_title, limit, alpha, **legend_kwargs):
    """Draw one predicted-vs-known scatter with its legend, labels and y = x guide."""
    sns.scatterplot(x=known, y=pred, data=df, hue=hue,
                    palette=sns.color_palette(palette, n_levels),
                    edgecolor='black', alpha=alpha, ax=ax)

    # reverse the legend order and keep only the last `n_levels` labels
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[:-(n_levels + 1):-1],
              title=legend_title, **legend_kwargs)

    ax.set_title('Predicted vs. Actual [' + known + ']')
    ax.set_ylabel(ylabel)
    ax.set_xlabel('$[' + known + ']_{known}$')

    # y = x reference line, drawn across the whole visible range
    ax.plot((0, limit), (0, limit), 'r--')
    ax.set_xlim(left=-0.02, right=limit)
    ax.set_ylim(bottom=-0.02, top=limit)


def plot_pred_vs_known(mdf, column='440', ml_model=False):
    """
    Plot the predicted vs. known values for [bacter] and [chlamy].

    Two figures are drawn, one for bacteria and one for chlamy. Each has two
    panels showing the same scatter, once coloured by plate row ('[B]
    gradient') and once by plate column ('[C] gradient').

    Parameters
    ----------
    mdf : pandas.DataFrame
        Table holding 'row', 'col', the measured `column`, and the previously
        calculated prediction columns for it. It is not modified.
    column : str
        The variable wavelength used to construct the regression that gave
        rise to the predicted values.
    ml_model : bool
        If True the predictions are read from 'B_pML_<column>' /
        'C_pML_<column>', otherwise from 'B_p<column>' / 'C_p<column>'. The
        figure titles change accordingly.
    """
    # known values 'B' and 'C' are derived from the measured column
    df = set_B_and_C(mdf, column)

    # figsize
    fs = (8, 10)
    # alpha value for plot markers (see-throughness)
    a = 0.65

    # names of the columns holding the predicted values
    infix = 'ML_' if ml_model else ''
    Bp = 'B_p' + infix + column
    Cp = 'C_p' + infix + column

    if ml_model:
        model = 'ML Multiple Linear Regression'
    else:
        model = 'Simple Linear Regression Models'

    # set the scale maximum of every plot to 10% above the largest of the
    # predicted and known values, so all panels share the same axes
    smx = max(df[Bp].max(), df['B'].max(), df[Cp].max(), df['C'].max())
    smx = smx + .1 * smx

    # (known column, predicted column, title prefix, y label); raw strings keep
    # the LaTeX backslash from being read as an invalid escape sequence
    species = (
        ('B', Bp, '[Bacteria]', r'$\hat{[B]}$$_{pred}$'),
        ('C', Cp, '[Chlamy]', r'$\hat{[C]_{pred}}$'),
    )

    figures = []
    for known, pred, name, ylabel in species:
        fig, (top, bottom) = _two_panel_figure(fs)

        _pred_vs_known_panel(top, df, known, pred, ylabel,
                             hue='row', palette='YlOrRd_r', n_levels=8,
                             legend_title='[B] gradient', limit=smx, alpha=a)
        _pred_vs_known_panel(bottom, df, known, pred, ylabel,
                             hue='col', palette='YlGn_r', n_levels=12,
                             legend_title='[C] gradient', limit=smx, alpha=a,
                             loc=2, prop={'size': 9})

        fig.suptitle(name + ' Predictions Using ' + model + ' @' + column, y=.95)
        fig.subplots_adjust(hspace=.3)
        figures.append(fig)

    plt.show()
    # close BOTH figures explicitly; a bare plt.close() only closes the current one
    for fig in figures:
        plt.close(fig)
    

    

# Plot 'Old' Predicted vs 'Old' True

### Note: this DF was constructed with A680 & A750 data
### as the regressands

In [None]:
# Load Old Calibration Data & Predicted Fractions with THOSE models
# Load new models and Predict New

In [9]:
# OLD a680_a750 predictions

# directory holding the previously saved predicted fractions
path = (
    "/home/rdmtinez/Documents/B-IT MS Program/Masters Thesis/data_o/"
    "calibration/single_species_calibration/predicted_fractions"
)

# the first CSV column is used as the index
old_predictions_csv = pjoin(path, 'a680_a750_slr_predicted_fractions.csv')
mdf = pd.read_csv(old_predictions_csv, index_col=0)


In [10]:
# preview the first rows of the loaded table
mdf.head()

Unnamed: 0_level_0,row,col,B_lbl,C_lbl,F680,A680,A750,320,325,330,...,B_p780,C_p780,B_p785,C_p785,B_p790,C_p790,B_p795,C_p795,B_p800,C_p800
well,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1,A,1,0.2,0.25,4666.0,0.721,0.4146,0.8399,0.8359,0.8349,...,0.17897,0.201764,0.17813,0.200207,0.177417,0.198906,0.17662,0.197484,0.175812,0.196226
B1,B,1,0.175,0.25,4832.0,0.7822,0.4155,0.8916,0.889,0.8907,...,0.130215,0.246447,0.129598,0.244546,0.129077,0.242956,0.128488,0.241221,0.12789,0.239684
C1,C,1,0.15,0.25,4638.0,0.6925,0.3829,0.7923,0.7897,0.7912,...,0.143343,0.206093,0.142668,0.204503,0.142096,0.203173,0.141454,0.201722,0.140802,0.200436
D1,D,1,0.125,0.25,4775.0,0.6981,0.3593,0.769,0.7657,0.7669,...,0.09472,0.229212,0.094268,0.227444,0.093888,0.225966,0.093456,0.224352,0.093016,0.222922
E1,E,1,0.1,0.25,4936.0,0.6788,0.3414,0.744,0.7413,0.7438,...,0.077246,0.229277,0.076874,0.227508,0.076563,0.22603,0.076207,0.224416,0.075844,0.222986


In [12]:
# remove predicted values from mdf
# correct values by subtract H12
# create new models using the NEW script-functions
# predict values
# plot new predicted values and against 'old values'
# IFF the plots DO NOT look the same the new script
# are handling the new data incorrectly, you should
# immediately check the way the models are applying
# the coefficients...............................

# keep only the columns whose name has no 'p' in it; this drops the saved
# prediction columns (B_p..., C_p...)
measured_columns = [name for name in mdf.columns if 'p' not in name]
nmdf = mdf[measured_columns].copy()

In [17]:
#nmdf.head()
# preview the last rows of the table after the prediction columns were dropped
nmdf.tail()

Unnamed: 0_level_0,row,col,B_lbl,C_lbl,F680,A680,A750,320,325,330,...,755,760,765,770,775,780,785,790,795,800
well,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D12,D,12,0.125,0.0,0.0,0.136,0.1237,0.2024,0.2005,0.1981,...,0.1225,0.1221,0.1213,0.1208,0.1202,0.1197,0.1192,0.1187,0.1181,0.1176
E12,E,12,0.1,0.0,1.0,0.1143,0.1053,0.1641,0.1627,0.1606,...,0.0985,0.0981,0.0974,0.0971,0.0965,0.0961,0.0955,0.0951,0.0945,0.0941
F12,F,12,0.05,0.0,1.0,0.0494,0.0418,0.0754,0.0741,0.0726,...,0.0419,0.0418,0.0414,0.0414,0.041,0.0409,0.0406,0.0405,0.0402,0.0401
G12,G,12,0.025,0.0,1.0,0.0235,0.0191,0.033,0.033,0.0323,...,0.0174,0.0173,0.0171,0.0171,0.0168,0.0168,0.0167,0.0166,0.0165,0.0164
H12,H,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# ###### NOTE #####
# # This only checks to see if the new functions are properly written
# # they should generate the same predictions as previously saved data
# # After ensuring the new functions properly work, test new models on old data
# # this only needs to be run once, for checking
# a680_a750 = get_coeff_dict_for_all_regressors(nmdf, FIRST_REGRESSAND='A680', second_regressand='A750')

# regressors = ['500', '510', '520', '530', '540', '550', '560', '570', '580', '590',
#               '600', '610', '620', '630', '640', '650', '660', '670', '680', '690', 
#               '700', '710', '720', '730', '740', '750']


# a680_a750_main_df = nmdf.copy()
# for reg in regressors:
#     tmp = get_predictions(nmdf, a680_a750, regressor=reg, first_regressand='A680', second_regressand='A750')
#     a680_a750_main_df = pd.concat([a680_a750_main_df, tmp], axis=1)

# a680_a750_main_df.head()

Unnamed: 0_level_0,row,col,B_lbl,C_lbl,F680,A680,A750,320,325,330,...,B_p710,C_p710,B_p720,C_p720,B_p730,C_p730,B_p740,C_p740,B_p750,C_p750
well,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1,A,1,0.2,0.25,4666.0,0.721,0.4146,0.8399,0.8359,0.8349,...,0.19034,0.25359,0.18858,0.234805,0.186784,0.224078,0.185022,0.217481,0.183475,0.212442
B1,B,1,0.175,0.25,4832.0,0.7822,0.4155,0.8916,0.889,0.8907,...,0.138668,0.309753,0.137348,0.286806,0.136,0.273702,0.134694,0.265644,0.133548,0.25949
C1,C,1,0.15,0.25,4638.0,0.6925,0.3829,0.7923,0.7897,0.7912,...,0.152531,0.259032,0.151103,0.239843,0.149646,0.228885,0.148224,0.222147,0.146976,0.217
D1,D,1,0.125,0.25,4775.0,0.6981,0.3593,0.769,0.7657,0.7669,...,0.10096,0.288091,0.09998,0.266749,0.098978,0.254561,0.098017,0.247067,0.097173,0.241343
E1,E,1,0.1,0.25,4936.0,0.6788,0.3414,0.744,0.7413,0.7438,...,0.082411,0.288173,0.081595,0.266825,0.080761,0.254633,0.079966,0.247137,0.079269,0.241411


In [None]:
# new functions predictions --- should be the same as old saved data
# plot_pred_vs_known(a680_a750_main_df, '560')
# plot_pred_vs_known(mdf, '560')

## THE NEW Functions work as expected!


# The New Scripts are Handling the New DATA JUST FINE!
# Model Old Data with New Models

In [25]:
# LOAD New Models
path = (
    "/home/rdmtinez/Documents/B-IT MS Program/Masters Thesis/data_o/"
    "calibration/community_calibration/created_models"
)

# coefficient table of the new models; the first CSV column is used as the index
coefficients_csv = pjoin(path, 'a680_a750_coefficients.csv')
a680_a750 = pd.read_csv(coefficients_csv, index_col=0)

In [33]:
#get_coeff_dict_for_all_regressors(nmdf, FIRST_REGRESSAND='A680', second_regressand='A750')

In [None]:
# Regressor wavelengths '500', '510', ..., '750'. Defined here so the cell runs
# on a fresh kernel; previously `regressors` only existed in commented-out code.
regressors = [str(wavelength) for wavelength in range(500, 760, 10)]

# Predict with the NEW models on the old measurements, one frame per regressor.
# Each regressor must be a valid key of `a680_a750`.
predictions = [
    get_predictions(nmdf, a680_a750, regressor=reg,
                    first_regressand='A680', second_regressand='A750')
    for reg in regressors
]

# Start from the measured data and append all prediction columns in a single
# concat instead of re-copying the growing frame on every iteration.
a680_a750_main_df = pd.concat([nmdf, *predictions], axis=1)

a680_a750_main_df.head()