# SynCom_MLR_Models


# Imports

In [1]:
import numpy as np
import pandas as pd
import os
import scipy

from sklearn import preprocessing

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib.gridspec as gridspec

import seaborn as sns
sns.set(style='whitegrid')

import itertools as it

from os.path import join as pjoin

%config InteractiveShell.ast_node_interactivity='all'
%config InlineBackend.figure_format = 'svg'

# Data

In [81]:
path = '/home/rdmtinez/Desktop/MScThesis/data_o/calibration/community_calibration/parsed_data/'
fname = 'community_calibration_dataframe.csv'

mdf = pd.read_csv(pjoin(path,fname), sep=',', index_col='well')

In [82]:
######################################
# correct values used for regression #
######################################
for col in [i for i in mdf.columns[4:].values if 'sdv' not in i]:
    #subtract 'blank' well values
    mdf.loc[:,col] = mdf.loc[:,col] - mdf.loc['H12',col]

In [84]:
mdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96 entries, A1 to H12
Data columns (total 38 columns):
row         96 non-null object
col         96 non-null int64
B_lbl       96 non-null float64
C_lbl       96 non-null float64
A600        96 non-null float64
A600_sdv    96 non-null float64
A680        96 non-null float64
A680_sdv    96 non-null float64
A720        96 non-null float64
A720_sdv    96 non-null float64
A750        96 non-null float64
A750_sdv    96 non-null float64
500         96 non-null float64
510         96 non-null float64
520         96 non-null float64
530         96 non-null float64
540         96 non-null float64
550         96 non-null float64
560         96 non-null float64
570         96 non-null float64
580         96 non-null float64
590         96 non-null float64
600         96 non-null float64
610         96 non-null float64
620         96 non-null float64
630         96 non-null float64
640         96 non-null float64
650         96 non-null float64
660    

# Helper Function

In [96]:
def plot_pred_vs_known(mdf, column='440', ml_model=False):
    
    """This function takes in the previously calculated prediction values and
    outputs the the predicted vs known values for [chlamy] and [bacter], 
    column refers to the variable wavelength used to construct the regression
    that gave rise to the predicted values"""
    
    df = mdf.copy()

    df = set_B_and_C(df, column)
    
    fs = (8, 10)
    
    # bacter figure
    fig0 = plt.figure(figsize=fs)
    gs = gridspec.GridSpec(2,1)
    axes = []
    for r in [0,1]:
        for c in [0]:
            axes.append(fig0.add_subplot(gs[r,c]))

    
    # get predicted values from column named
    if ml_model:
        Bp = 'B_pML_'+column
    else:
        Bp = 'B_p'+column
        
    # set the scale maximum of the plot to be the maximum value of the maximum
    # of the predicted and known values, which -ever is greater
    print(Bp)
    smx = max(df[Bp].max(), df['B'].max())
    smx = smx+.1*smx
    
    sns.scatterplot(x='B', y=Bp, data=df, hue='row',
                    palette=sns.color_palette("YlOrRd_r", 8), ax=axes[0])
    
    handles, labels = axes[0].get_legend_handles_labels()
    axes[0].legend(handles[::-1], labels[:-9:-1], title='[B] gradient')
    axes[0].set_title('Predicted vs. Actual [B]')
    axes[0].set_ylabel('$\hat{[B]}$$_{pred}$')
    axes[0].set_xlabel('$[B]_{known}$')

    axes[0].plot((0,1), 'r--',)
    axes[0].set_xlim(left=-0.01, right=smx) 
    axes[0].set_ylim(bottom=-0.01, top=smx)



    sns.scatterplot(x='B', y=Bp, data=df, hue='col',
                    palette=sns.color_palette("YlGn_r", 12), ax=axes[1])

    handles, labels = axes[1].get_legend_handles_labels()
    axes[1].legend(handles[::-1], labels[:-13:-1], title='[C] gradient',
                   loc=2, prop={'size':9})
    axes[1].set_title('Predicted vs. Actual [B]')
    axes[1].set_ylabel('$\hat{[B]}$$_{pred}$')
    axes[1].set_xlabel('$[B]_{known}$')

    axes[1].plot((0,1), 'r--',)
    axes[1].set_xlim(left=-0.01, right=smx)
    axes[1].set_ylim(bottom=-0.01, top=smx)

    if ml_model:
        fig0.suptitle("[Bacteria] Predictions Using ML Multiple Linear Regression @"+column, y=.95)
    else:
        fig0.suptitle("[Bacteria] Predictions Using Simple Linear Regression Models @"+column, y=.95)
        
    fig0.subplots_adjust(hspace=.3)
    
    #plt.show()
    #plt.close()



    # CHLAMY
    fig1=plt.figure(figsize=fs)
    gs = gridspec.GridSpec(2,1)
    axes = []
    for r in [0,1]:
        for c in [0]:
            axes.append(fig1.add_subplot(gs[r,c]))

    # get predicted values from column named
    if ml_model:
        Cp = 'C_pML_'+column
    else:
        Cp = 'C_p'+column
        
    # set the scale maximum of the plot to be the maximum value of the maximum
    # of the predicted and known values, which -ever is greater
    smx = max(df[Cp].max(), df['C'].max())
    smx = smx+.1*smx
    
    
    
    sns.scatterplot(x='C', y=Cp, data=df, hue='row',
                    palette=sns.color_palette("YlOrRd_r", 8), ax=axes[0])


    handles, labels = axes[0].get_legend_handles_labels()
    axes[0].legend(handles[::-1], labels[:-9:-1], title='[B] gradient')
    axes[0].set_title('Predicted vs. Actual [C]')
    axes[0].set_ylabel('$\hat{[C]_{pred}}$')
    axes[0].set_xlabel('$[C]_{known}$')

    axes[0].plot((0,1), 'r--',)
    axes[0].set_xlim(left=-0.01, right=smx)
    axes[0].set_ylim(bottom=-0.01, top=smx)


    sns.scatterplot(x='C', y=Cp, data=df, hue='col',
                    palette=sns.color_palette("YlGn_r", 12), ax=axes[1])

    handles, labels = axes[1].get_legend_handles_labels()
    axes[1].legend(handles[::-1], labels[:-13:-1], title='[C] gradient',
                  prop={'size':9}, loc=2)
    axes[1].set_title('Predicted vs. Actual [C]')
    axes[1].set_ylabel('$\hat{[C]_{pred}}$')
    axes[1].set_xlabel('$[C]_{known}$')

    axes[1].plot((0,1), 'r--',)
    axes[1].set_xlim(left=-0.01, right=smx)
    axes[1].set_ylim(bottom=-0.01, top=smx)
    
    if ml_model:
        fig1.suptitle("[Chlamy] Predictions Using ML Multiple Linear Regression @"+column, y=.95)
    else:
        fig1.suptitle("[Chlamy] Predictions Using Simple Linear Regression Models @"+column, y=.95)
    
    fig1.subplots_adjust(hspace=.3)

    plt.show()
    plt.close()
    
    
    
def set_B_and_C(tdf, column):
    """This helper function sets the concentraiton values B and
    C on the dataframe for regressiong purposes and so that the 
    plotting function can plot the right values on the x-axis"""
    
    df = tdf.copy()
    
    rows = df['row'].unique()
    cols =  df['col'].unique()
    
    for row in rows:
        print(row)
        print(column)
        df.loc[(df['row']==row) & (df['col']==12), column]
        value = df.loc[(df['row']==row) & (df['col']==12), column][0]
        df.loc[df['row']==row, 'B'] = value

    for col in cols:
        value = df.loc[(df['row']=='H') & (df['col']==col), column][0]
        df.loc[df['col']==col, 'C'] = value
    
    return df


def MLR_fitter(dframe, regressands= ['A680', '720'], regressor='560', metrics=False):
    """This function returns the fitters from a multiple linear regression that
    regress back to [B] and [C], i.e. the returned fitters allow you to make predictions
    """
    
    df=dframe.copy()
    
    X = df[regressands]
    
    yB = set_B_and_C(df,regressor)[['B']]
    yC = set_B_and_C(df,regressor)[['C']]

    XtrnB, XtstB, ytrnB, ytstB = train_test_split(X, yB, test_size=0.20, random_state=7)
    XtrnC, XtstC, ytrnC, ytstC = train_test_split(X, yC, test_size=0.20, random_state=7)

    rB = LinearRegression().fit(XtrnB, ytrnB)
    rC = LinearRegression().fit(XtrnC, ytrnC)
    
    if metrics==True:
        """If metrics true it returns the test variables to check the fits"""
        return rB, rC, XtstB, ytstB, XtstC, ytstC
    
    return rB, rC


def get_MLR_predictions(dframe, regressands=['A680', '720'], regressor='560', regressors_B_C=None):
    """Returns the complete predictions for the regresor as series 
    which can be appended to an existing dataframe. If regressors are passed it ingores
    the regressands and regressor and only returns the predicitons using
    those regressors"""
    
    if regressors_B_C==None:
        """If regressors haven't been created previously with MLR_fitter(), then this
        function first calls that function to assess the predictions over the regressor series"""
        
        regressors_B_C = MLR_fitter(dframe, regressands, regressor)
        
    df = dframe.copy()
    
    df['B_pML_'+regressor] = regressors_B_C[0].predict(df[regressands])
    df['C_pML_'+regressor] = regressors_B_C[1].predict(df[regressands])
    
    return df[['B_pML_'+regressor ,'C_pML_'+regressor]]
    

def get_model_metrics(dframe, regressands=['A680', '720'],regressor='560',regressors_B_C=None):
    
    df = dframe.copy()
    
    if regressors_B_C==None:
        """This prints out goodness-of-fit for any regressor and regressor for the
        MODELS and TEST DATA only and returns the models as well so that you
        can use them to make the predictions.
        """
        rB, rC, XtstB, ytstB, XtstC, ytstC = MLR_fitter(df, regressands,
                                                  regressor, metrics=True)
        
        
        ypB = rB.predict(XtstB)
        ypC = rC.predict(XtstC)
        
        print('Bp_'+regressor+'_MSE', mse(ytstB, ypB))
        print('Bp_'+regressor+'_R^2', r2s(ytstB, ypB))
        
        print('Cp_'+regressor+'_MSE', mse(ytstC, ypC))
        print('Cp_'+regressor+'_R^2', r2s(ytstC, ypC))
        
        return rB, rC
    
    elif regressors_B_C:
        
        """This prints out the good-of-fit values for all of the predictions against
        the actual regressor values, to double-check these values run the rB rC predictors
        on the entirity of the [regressors] wavelength"""
        
        df = set_B_and_C(dframe, regressor)
        
        ypB = regressors_B_C[0].predict(df[regressands])
        ypC = regressors_B_C[1].predict(df[regressands])       
        
        print('Bp_'+regressor+'_MSE', mse(df[['B']], ypB))
        print('Bp_'+regressor+'_R^2',r2s(df[['B']], ypB))
        
        print('Cp_'+regressor+'_MSE',mse(df[['C']], ypC))
        print('Cp_'+regressor+'_R^2',r2s(df[['C']], ypC))


def get_model_coefficients(dframe, regressands, regressor):
    
    rB, rC = MLR_fitter(dframe, regressands, regressor=regressor)
    
    return rB.coef_[0][0], rB.coef_[0][1], rC.coef_[0][0], rC.coef_[0][1]


def get_coeff_dict_for_all_regressors_ML(df, regressands=['A680', 'A720']):
    """
    This function calls the get_regression_coefficient function and builds models which regress back
    to ALL the regressor wavelength 500 to 750. Pay careful attention to which regressands were used.
    Ensure to name your dictionary accordingly, lest you forget what your models are using.
    """
    
    
    regressors = ['A600', '500', '510', '520',
              '530', '540', '550', '560',
              '570', '580', '590', '600',
              '610', '620', '630', '640',
              '650', '660', '670', '680',
              '690', '700', '710', '720',
               '730', '740', '750']
    
    # stores the coefficients which regress back to each regressor
    coefficients = {}
    
    for regr in regressors:
        coefficients[regr] = get_model_coefficients(df, regressands, regr)
                
    return coefficients


# Parameter Learning

In [97]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse, r2_score as r2s
import statsmodels.api as sm

In [98]:
#

In [99]:
lambdas = [i for i in mdf.columns.values[6:] if 'sdv' not in i][3:]

tdf = mdf.copy()
for wave in lambdas:
    tdf = pd.merge(tdf, get_MLR_predictions(tdf, regressands=['A680', 'A720'],
                                          regressor=wave), left_index=True, right_index=True, )

In [100]:
# save_to = """/home/rdmtinez/Documents/B-IT MS Program/Masters Thesis/data_o/calibration/community_calibration/"""
# tdf.to_csv(pjoin(save_to, 'a680_a720__ML_pred_fractions.csv'))

In [101]:
lambdas = [i for i in mdf.columns.values[6:] if 'sdv' not in i][3:]

tdf = mdf.copy()
for wave in lambdas:
    tdf = pd.merge(tdf, get_MLR_predictions(tdf, regressands=['A680', 'A750'],
                                          regressor=wave), left_index=True, right_index=True, )

In [102]:
# save_to = """/home/rdmtinez/Documents/B-IT MS Program/Masters Thesis/data_o/calibration/community_calibration/"""
# tdf.to_csv(pjoin(save_to, 'a680_a750__ML_pred_fractions.csv'))

In [14]:
lambdas = [i for i in mdf.columns.values[6:] if 'sdv' not in i][3:]

tdf = mdf.copy()
for wave in lambdas:
    tdf = pd.merge(tdf, get_MLR_predictions(tdf, regressands=['A680', '730'],
                                          regressor=wave), left_index=True, right_index=True, )

In [15]:
# save_to = """/home/rdmtinez/Documents/B-IT MS Program/Masters Thesis/data_o/calibration/community_calibration/"""
# tdf.to_csv(pjoin(save_to, 'a680_730__ML_pred_fractions.csv'))

# Get & Save All Model Coefficients

In [38]:
a680_a720_ml = get_coeff_dict_for_all_regressors_ML(mdf, regressands= ['A680', 'A720'])
a680_730_ml = get_coeff_dict_for_all_regressors_ML(mdf, regressands= ['A680', '730'])
a680_a750_ml = get_coeff_dict_for_all_regressors_ML(mdf, regressands= ['A680', 'A750'])
                                                    


In [57]:

tdf = pd.DataFrame.from_dict(data=a680_a720_ml,
                             orient='index',
                             columns=['kB680m', 'kB720m', 'kC680m', 'kC720m'])
tdf.to_csv(pjoin(save_to,'a680_a720_coefficients.csv'))
tdf.head()

tdf = pd.DataFrame.from_dict(data=a680_730_ml,
                             orient='index',
                             columns=['kB680m', 'kB730m', 'kC680m', 'kC730m'])
tdf.to_csv(pjoin(save_to,'a680_730_coefficients.csv'))
tdf.head()

tdf = pd.DataFrame.from_dict(data=a680_a750_ml,
                             orient='index',
                             columns=['kB680m', 'kB750m', 'kC680m', 'kC750m'])
tdf.to_csv(pjoin(save_to,'a680_a750_coefficients.csv'))
tdf.head()

Unnamed: 0,kB680m,kB720m,kC680m,kC720m
A600,-1.39752,2.975766,1.359733,-1.496666
500,-1.661957,3.538012,2.21469,-2.452154
510,-1.625643,3.461147,1.927548,-2.131637
520,-1.593268,3.39239,1.654851,-1.828232
530,-1.562181,3.326458,1.502302,-1.65893


Unnamed: 0,kB680m,kB730m,kC680m,kC730m
A600,-1.449364,3.047585,1.386647,-1.534414
500,-1.724864,3.625858,2.242803,-2.483005
510,-1.686963,3.546658,1.953503,-2.161398
520,-1.653399,3.476259,1.67821,-1.855886
530,-1.621039,3.408495,1.523757,-1.684525


Unnamed: 0,kB680m,kB750m,kC680m,kC750m
A600,-1.347691,3.034884,1.332707,-1.522341
500,-1.6019,3.606618,2.170015,-2.493405
510,-1.566935,3.528356,1.888656,-2.167382
520,-1.535753,3.45832,1.621708,-1.859329
530,-1.505821,3.391183,1.47222,-1.68713


# Build Combined Data ML Models

In [58]:
# Load old data and use the new model to predict the old measurements
# compare against the the sum of the predicted values against the measured 560 values

In [103]:
path = '/home/rdmtinez/Desktop/MScThesis/data_o/calibration/single_species_calibration/parsed_data'
fname = 'single_strain_calibration_dataframe.csv'

ocdf = pd.read_csv(pjoin(path,fname))
#ocdf.info(verbose=True)

# since we would like to use 720 signal to create our new models we rename
# just this column so that it maches the A720 signal from the new syncom cal. data
# thus avoid NAs
ocdf= ocdf.rename({'720':'A720'}, axis=1)

path = '/home/rdmtinez/Desktop/MScThesis/data_o/calibration/community_calibration/parsed_data'
fname = 'community_calibration_dataframe.csv'

ccdf = pd.read_csv(pjoin(path,fname))
#ccdf.info()

In [95]:
cdfs = [ocdf,ccdf]

for wave in lambdas:

    
    set_B_and_C(ccdf[i], column='560')
    
# here we concatenate the old and new dataframes by columns since we're going to use the entirity of
# this dataset to create new model it doesn't make any sense to keep columns which produce NA's during
# concatenation NA's result from columns in the new syncom dataframe that has to corresponding match
# in the old single strain calibration data

cmb = pd.concat(cdfs, sort=False).reset_index(drop=True).dropna(axis=1, how='any')

In [80]:
lambdas = [i for i in mdf.columns.values[6:] if 'sdv' not in i][3:]

tdf = cmb.copy()
for wave in lambdas:
    tdf = pd.merge(tdf, get_MLR_predictions(cmb, regressands=['A680', 'A720'],
                                          regressor=wave), left_index=True, right_index=True, )

A
500


KeyError: 0

In [None]:
# after predicting old values with new model create the cell count models