In [None]:
# Description: Train an ML model to classify cells based on measurement data
from spacr.ml import generate_ml_scores
%matplotlib inline

# Settings handed to generate_ml_scores. Replace placeholder values
# ('path', 'column_name', ...) with real ones before running.
settings = {
    # (path) - Path to the source folder (where the original images were stored)
    'src': 'path',
    # (string) - Type of model ('random_forest', 'xgboost', 'gradient_boosting')
    'model_type_ml': 'xgboost',
    # (string) - Column to display in heatmaps
    'heatmap_feature': 'predictions',
    # (string) - Grouping for heatmap
    'grouping': 'mean',
    # (string) - Quantiles to normalize heatmap to (all, allq)
    'min_max': 'allq',
    # (string) - Heatmap cmap
    'cmap': 'viridis',
    # (integer) - Number of estimators for model
    'n_estimators': 100,
    # (float) - Fraction of images used for the test set
    'test_size': 0.2,
    # (string) - Column containing negative/positive control metadata information
    'location_column': 'column_name',
    # (string) - Value for positive control in location column
    'positive_control': 'c2',
    # (string) - Value for negative control in location column
    'negative_control': 'c1',
    # (string, NoneType) - Rows to exclude in location_column
    'exclude': None,
    # (integer) - Maximum number of nuclei for each cell
    'nuclei_limit': 1,
    # (integer) - Maximum number of pathogens per cell
    'pathogen_limit': 3,
    # (integer) - Number of repeats for permutation importance
    'n_repeats': 10,
    # (integer) - Number of top features to plot based on permutation
    # importance, feature importance and SHAP
    'top_features': 30,
    # (integer) - TODO: description missing in the original template; confirm meaning
    'channel_of_interest': 1,
    # (integer) - Minimum number of cells per well
    'minimum_cell_count': 25,
    # (bool) - Remove columns with low variance
    'remove_low_variance_features': True,
    # (bool) - Remove highly correlated features
    'remove_highly_correlated_features': True,
    # (bool) - Display verbose output
    'verbose': False,
    # (integer) - Number of threads
    'n_jobs': 10,
}

# Run the scoring pipeline with the settings above and keep whatever it returns.
results = generate_ml_scores(settings)

In [None]:
# Description: Fit a regression model to estimate the effect size of gRNAs on cell scores.
from spacr.ml import perform_regression
import pandas as pd
%matplotlib inline

# Settings handed to perform_regression. Replace placeholder values
# ('path', 'column', 'gene', ...) with real ones before running.
settings = {
    # (path) - Path or list of paths to sequencing count data
    'count_data': 'path',
    # (path) - Path or list of paths to score data
    'score_data': 'path',
    # (string) - Column with cell scores
    'score_column': 'column',
    # (list) - Paths to gene metadata
    'metadata_files': ['path.csv', 'path.csv'],
    # (string) - Gene to highlight in volcano plot
    'positive_control': 'gene',
    # (string) - Gene to highlight in volcano plot
    'negative_control': 'gene',
    # (integer here) - TODO: description missing in the original template; confirm meaning
    'min_n': 3,
    # (Optional, float or NoneType) - Minimum threshold for gene fraction;
    # if None, automatically calculated
    'fraction_threshold': None,
    # (integer here) - Number of expected unique gRNAs per well
    'target_unique_count': 5,
    # (float) - Tolerance for cells per well limit
    'tolerance': 0.02,
    # (bool here) - gRNA fraction plot X axis log
    'log_x': False,
    # (bool here) - gRNA fraction plot Y axis log
    'log_y': False,
    # (type not documented; None here) - Volcano X axis limit
    'x_lim': None,
    # (list) - Metadata to exclude from regression model
    'control_wells': ['c1', 'c2', 'c3'],
    # (string) - Column containing control metadata to remove
    'filter_column': 'column',
    # (string) - Dependent variable for regression
    'dependent_variable': 'column',
    # (string) - Effect size threshold type (std or var)
    'threshold_method': 'var',
    # (integer) - Effect size threshold multiplier
    'threshold_multiplier': 4,
    # (string) - Transform dependent variable
    'transform': 'log',
    # (string) - Aggregation for dependent variable
    'agg_type': 'mean',
    # (integer or NoneType) - Minimum number of cells per well
    'min_cell_count': None,
    # (string) - Type of regression (ols, glm, mixed, ridge, lasso)
    'regression_type': 'ols',
    # (bool) - Remove plate, row and column random effects
    'random_row_column_effects': False,
    # (list of lists) - Limits for broken y axis
    'y_lims': [[0, 9], [12, 16]],
    # (string or NoneType) - String to replace plate column values with
    'plate': None,
    # (string or NoneType) - Covariance type for ols regression
    'cov_type': None,
    # (string) - Mode for significant results (gene, grna, all)
    'volcano': 'gene',
    # (float) - alpha for hinge and lasso regression
    # NOTE(review): "hinge" is presumably "ridge" (see regression_type options) - confirm
    'alpha': 0.8,
}

# Fit the regression with the settings above and keep whatever it returns.
coef_df = perform_regression(settings)